diff --git a/crawl/__init__.py b/crawl/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawl/admin.py b/crawl/admin.py
new file mode 100644
index 0000000..8c38f3f
--- /dev/null
+++ b/crawl/admin.py
@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.
diff --git a/crawl/apps.py b/crawl/apps.py
new file mode 100644
index 0000000..4585bca
--- /dev/null
+++ b/crawl/apps.py
@@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class CrawlConfig(AppConfig):
+    default_auto_field = "django.db.models.BigAutoField"
+    name = "crawl"
diff --git a/crawl/bunjang.py b/crawl/bunjang.py
new file mode 100644
index 0000000..46d8e68
--- /dev/null
+++ b/crawl/bunjang.py
@@ -0,0 +1,70 @@
+import requests
+import time
+
+# import re
+import json
+from datetime import datetime
+
+
+def get_time():
+    # Return the current time as a compact timestamp string,
+    # e.g. 2020-11-01 13:05:23 -> "20201101130523".
+    now = datetime.now()
+    return now.strftime("%Y%m%d%H%M%S")
+
+
+def search_bunjang(keyword, page=1):
+    request_id = get_time()
+    base_url = "https://api.bunjang.co.kr/api/1/find_v2.json"
+    url = f"{base_url}?q={keyword}&order=date&page={page-1}&request_id={request_id}&stat_device=w&n=100&stat_category_required=1&req_ref=search&version=5"
+    response = requests.get(url)
+    data = response.json()
+    result = []
+    try:
+        items = data["list"]
+        item_length = len(items)
+        if item_length == 0:
+            return False
+        # Stop paging once the last item on this page predates last month.
+        now = datetime.now()
+        target = datetime(now.year, now.month, 1)
+        last_item_date = datetime.utcfromtimestamp(
+            int(items[item_length - 1]["update_time"])
+        )
+        if (target - last_item_date).days > 30:
+            return False
+        for item in items:
+            update_time = datetime.utcfromtimestamp(int(item["update_time"]))
+            result.append(
+                {
+                    "title": item["name"],
+                    "price": item["price"],
+                    "year": update_time.year,
+                    "month": update_time.month,
+                    "day": update_time.day,
+                }
+            )
+    except Exception:
+        print("--------------------------------------------")
+        print(url)
+        print(data)
+        print("--------------------------------------------")
+
+    return result
+
+
+def get_bunjang(keyword):
+    result = []
+    page = 1
+    while True:
+        print(f"page: {page}")
+        page_result = search_bunjang(keyword, page)
+        if not page_result:
+            break
+        result += page_result
+        page += 1
+        time.sleep(0.1)
+
+    # with open("bunjang.json", "w", encoding="utf-8") as file:
+    #     json.dump(result, file, ensure_ascii=False, indent=2)
+    return result
diff --git a/crawl/joongna.py b/crawl/joongna.py
new file mode 100644
index 0000000..9b9d3e2
--- /dev/null
+++ b/crawl/joongna.py
@@ -0,0 +1,86 @@
+import requests
+import re
+import time
+
+# from bs4 import BeautifulSoup
+
+from datetime import datetime
+
+
+def get_api_id():
+    # The Next.js data routes embed a build id; scrape it from the _app chunk.
+    base_url = "https://web.joongna.com/"
+    response = requests.get(base_url)
+    text = response.text
+    pattern = r"_next/static/chunks/pages/_app.*?\.js"
+    js_url = base_url + re.findall(pattern, text)[0]
+    response = requests.get(js_url)
+    text = response.text
+    index = text.find('iO.SENTRY_RELEASE={id:"') + 24
+    api_id = text[index : index + 20]
+    return api_id
+
+
+def get_url(api_id, keyword, page=1):
+    base = f"https://web.joongna.com/_next/data/{api_id}/search"
+    return (
+        f"{base}/{keyword}.json?page={page}&sort=RECENT_SORT&keyword={keyword}"
+    )
+
+
+def search_joongna(api_id, keyword, page):
+    url = get_url(api_id, keyword, page)
+    response = requests.get(url)
+    data = response.json()
+    result = []
+    try:
+        queries = data["pageProps"]["dehydratedState"]["queries"]
+        if len(queries) == 0:
+            return False
+        items = queries[0]["state"]["data"]["data"]["items"]
+        item_length = len(items)
+        if item_length == 0:
+            return False
+        # Stop paging once the last item on this page predates last month.
+        now = datetime.now()
+        target = datetime(now.year, now.month, 1)
+        last_item_date = datetime.strptime(
+            items[item_length - 1]["sortDate"], "%Y-%m-%d %H:%M:%S"
+        )
+        if (target - last_item_date).days > 30:
+            return False
+        for item in items:
+            result.append(
+                {
+                    "title": item["title"],
+                    "price": item["price"],
+                    "year": item["sortDate"].split("-")[0],
+                    "month": item["sortDate"].split("-")[1],
+                    "day": item["sortDate"].split("-")[2].split(" ")[0],
+                }
+            )
+    except Exception:
+        print("--------------------------------------------")
+        print(url)
+        print(data["pageProps"]["dehydratedState"]["queries"])
+        print("--------------------------------------------")
+
+    return result
+
+
+def get_joongna(keyword):
+    api_id = get_api_id()
+    result = []
+    page = 1
+    while True:
+        print(f"page: {page}")
+        page_result = search_joongna(api_id, keyword, page)
+        if not page_result:
+            break
+        result += page_result
+        page += 1
+        time.sleep(0.1)
+
+    # with open("joongna.json", "w", encoding="utf-8") as file:
+    #     json.dump(result, file, ensure_ascii=False, indent=2)
+    return result
diff --git a/crawl/migrations/__init__.py b/crawl/migrations/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawl/models.py b/crawl/models.py
new file mode 100644
index 0000000..71a8362
--- /dev/null
+++ b/crawl/models.py
@@ -0,0 +1,3 @@
+from django.db import models
+
+# Create your models here.
diff --git a/crawl/tests.py b/crawl/tests.py
new file mode 100644
index 0000000..7ce503c
--- /dev/null
+++ b/crawl/tests.py
@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.