HiPhone_BE/crawl/joongna.py

104 lines
3.0 KiB
Python
Raw Permalink Normal View History

2023-11-01 19:13:24 +09:00
import requests
import re
import time
2023-11-13 19:52:35 +09:00
import json
2023-11-01 19:13:24 +09:00
# from bs4 import BeautifulSoup
from datetime import datetime
def get_api_id():
    """Return the release id embedded in the Joongna web app's `_app` bundle.

    The home page is downloaded, the first script path matching
    ``_next/static/chunks/pages/_app*.js`` is located in its HTML, that
    script is downloaded, and the text between the literal marker
    ``iO.SENTRY_RELEASE={id:"`` and the next double quote is returned.
    The value is used by ``get_url`` as the ``_next/data/<id>`` path segment.

    Returns:
        str: The extracted id.

    Raises:
        IndexError: If no ``_app`` bundle path is found in the home page.
        ValueError: If the marker or its closing quote is missing from the
            bundle.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    base_url = "https://web.joongna.com/"
    # A timeout keeps the crawler from hanging forever on a stalled server.
    response = requests.get(base_url, timeout=10)
    pattern = r"_next/static/chunks/pages/_app.*?\.js"
    matches = re.findall(pattern, response.text)
    if not matches:
        # Same exception type as indexing an empty list, with a useful message.
        raise IndexError("no _app bundle URL found in the Joongna home page")
    js_url = base_url + matches[0]

    text = requests.get(js_url, timeout=10).text
    # NOTE(review): "iO" looks like a minifier-assigned name, so this marker
    # may change between site builds — confirm when extraction starts failing.
    marker = 'iO.SENTRY_RELEASE={id:"'
    marker_pos = text.find(marker)
    if marker_pos == -1:
        # str.find returns -1 on a miss; slicing from there would silently
        # yield an unrelated fragment of the bundle instead of the id.
        raise ValueError("SENTRY_RELEASE marker not found in the _app bundle")
    start = marker_pos + len(marker)
    end = text.find('"', start)
    if end == -1:
        raise ValueError("unterminated SENTRY_RELEASE id in the _app bundle")
    return text[start:end]
def get_url(api_id, keyword, page=1):
    """Build the Joongna search data URL for a keyword and page.

    The keyword is percent-encoded before being placed in both the path
    segment and the ``keyword`` query parameter, so characters such as
    ``&``, ``#``, ``/`` or ``?`` cannot alter the structure of the URL.

    Args:
        api_id: Id placed in the ``_next/data/<api_id>`` path segment.
        keyword: Search keyword; converted with ``str()`` before encoding.
        page: Page number for the ``page`` query parameter (default 1).

    Returns:
        str: The full URL of the search JSON document.
    """
    # Local import so this function works without touching the file's
    # top-level import block.
    from urllib.parse import quote

    base = f"https://web.joongna.com/_next/data/{api_id}/search"
    # safe="" also encodes "/", which would otherwise add a path segment.
    encoded = quote(str(keyword), safe="")
    return f"{base}/{encoded}.json?page={page}&sort=RECENT_SORT&keyword={encoded}"
2023-11-01 19:13:24 +09:00
def search_joongna(api_id, keyword, page):
    """Fetch one page of Joongna search results as simplified item dicts.

    Args:
        api_id: Id passed through to ``get_url``.
        keyword: Search keyword.
        page: Page number to fetch.

    Returns:
        list[dict]: One dict per item with the keys ``title``, ``price``,
        ``year``, ``month`` and ``day`` (the date parts are substrings of the
        item's ``sortDate``). An empty list is returned when the response has
        no queries, no items, or when the last item on the page is dated more
        than 30 days before the first day of the current month; callers treat
        a falsy result as "stop paging". If the payload is malformed part-way
        through, the items collected so far are returned.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    url = get_url(api_id, keyword, page)
    # A timeout keeps the crawler from hanging forever on a stalled server.
    response = requests.get(url, timeout=10)
    data = response.json()
    result = []
    try:
        queries = data["pageProps"]["dehydratedState"]["queries"]
        if not queries:
            return result
        items = queries[0]["state"]["data"]["data"]["items"]
        if not items:
            return result

        # Cut-off: stop once the page's last item is more than 30 days older
        # than the first day of the current month.
        # NOTE(review): presumably the last item is the oldest because the
        # URL requests RECENT_SORT — confirm against the API.
        now = datetime.now()
        target = datetime(now.year, now.month, 1)
        last_item_date = datetime.strptime(
            items[-1]["sortDate"], "%Y-%m-%d %H:%M:%S"
        )
        if (target - last_item_date).days > 30:
            return result

        for item in items:
            # sortDate is parsed above as "YYYY-MM-DD HH:MM:SS".
            year, month, rest = item["sortDate"].split("-", 2)
            result.append(
                {
                    "title": item["title"],
                    "price": item["price"],
                    "year": year,
                    "month": month,
                    "day": rest.split(" ")[0],
                }
            )
    except (KeyError, IndexError, TypeError, ValueError, AttributeError) as err:
        # Only payload-shape errors are handled here. The handler does not
        # index into `data` again, so it cannot raise a second error.
        print("--------------------------------------------")
        print(url)
        print(repr(err))
        print("--------------------------------------------")
    # Returned after the try block rather than from a `finally` clause:
    # a `return` inside `finally` would discard any in-flight exception,
    # including KeyboardInterrupt.
    return result
def get_joongna(keyword):
    """Estimate an average listing price on Joongna for a keyword.

    Pages of search results are fetched one after another until
    ``search_joongna`` returns a falsy value. A listing is kept only when
    its title contains none of the excluded words and its price is a
    multiple of 10 within 100,000..2,000,000 inclusive.

    Args:
        keyword: Search keyword.

    Returns:
        int: The floor-divided mean price of the kept listings, rounded to
        the nearest thousand with ``round(..., -3)``, or 0 when nothing was
        kept.
    """
    api_id = get_api_id()
    excluded_words = ("매입", "삽니다", "사요", "케이스")
    kept = []
    page = 1
    while True:
        print(f"j {keyword} p{page}")
        listings = search_joongna(api_id, keyword, page)
        if not listings:
            break
        for listing in listings:
            title = listing["title"]
            if any(word in title for word in excluded_words):
                continue
            # Price is read only after the title passes, matching the
            # short-circuit order of the checks.
            price = listing["price"]
            if price % 10 != 0:
                continue
            if price < 100000 or price > 2000000:
                continue
            kept.append(listing)
        page += 1
        # Short pause between page requests.
        time.sleep(0.1)

    if not kept:
        return 0
    total = sum(listing["price"] for listing in kept)
    return round(total // len(kept), -3)