feat: Add crawl app

This commit is contained in:
jhynsoo 2023-11-01 19:13:24 +09:00
parent 3cd7228c05
commit f7f9486237
8 changed files with 170 additions and 0 deletions

0
crawl/__init__.py Normal file
View File

3
crawl/admin.py Normal file
View File

@@ -0,0 +1,3 @@
from django.contrib import admin
# Register your models here.

6
crawl/apps.py Normal file
View File

@@ -0,0 +1,6 @@
from django.apps import AppConfig
class CrawlConfig(AppConfig):
    """Django application configuration for the ``crawl`` app."""

    # Use 64-bit auto-incrementing primary keys for models in this app.
    default_auto_field = "django.db.models.BigAutoField"
    name = "crawl"

69
crawl/bunjang.py Normal file
View File

@@ -0,0 +1,69 @@
import requests
import time
# import re
import json
from datetime import datetime
def get_time():
    """Return the current local time as a 14-digit timestamp string.

    Example: 2020-11-01 13:05:23 -> "20201101130523". Used as the
    ``request_id`` query parameter for the Bunjang search API.
    """
    return datetime.now().strftime("%Y%m%d%H%M%S")
def search_bunjang(keyword, page=1):
    """Fetch one page (up to 100 items) of Bunjang search results.

    Args:
        keyword: Search term.
        page: 1-based page number (translated to the API's 0-based index).

    Returns:
        A list of {"title", "price", "year", "month", "day"} dicts, or
        False when the page is empty or its oldest item falls more than
        30 days before the first of the current month — a falsy value
        signals the caller (get_bunjang) to stop paging.
    """
    # BUG FIX: the original named this local `time`, shadowing the
    # imported `time` module within the function.
    request_id = get_time()
    base_url = "https://api.bunjang.co.kr/api/1/find_v2.json"
    url = (
        f"{base_url}?q={keyword}&order=date&page={page-1}"
        f"&request_id={request_id}&stat_device=w&n=100"
        f"&stat_category_required=1&req_ref=search&version=5"
    )
    response = requests.get(url)
    data = response.json()
    result = []
    try:
        items = data["list"]
        if not items:
            return False
        now = datetime.now()
        # First day of the current (local) month; item timestamps are
        # interpreted as UTC — NOTE(review): mixing local and UTC here is
        # inherited from the original, confirm intended.
        target = datetime(now.year, now.month, 1)
        # utcfromtimestamp is deprecated in Python 3.12; kept for
        # behavioral parity (naive UTC datetimes).
        last_item_date = datetime.utcfromtimestamp(int(items[-1]["update_time"]))
        if (target - last_item_date).days > 30:
            return False
        for item in items:
            update_time = datetime.utcfromtimestamp(int(item["update_time"]))
            result.append(
                {
                    "title": item["name"],
                    "price": item["price"],
                    "year": update_time.year,
                    "month": update_time.month,
                    "day": update_time.day,
                }
            )
    except Exception:
        # Best-effort: dump the offending payload and fall through to
        # return whatever was collected so one malformed page does not
        # abort the whole crawl.
        print("--------------------------------------------")
        print(url)
        print(data)
        print("--------------------------------------------")
    # BUG FIX: the original used `finally: return result`, which
    # overrode the early `return False` exits (they actually returned
    # []) and silently swallowed *every* exception, even SystemExit.
    return result
def get_bunjang(keyword):
    """Crawl all recent Bunjang search results for *keyword*.

    Fetches successive pages via search_bunjang until it returns a
    falsy value (no more items / items too old), pausing briefly
    between requests.
    """
    collected = []
    page = 1
    while True:
        print(f"page: {page}")
        chunk = search_bunjang(keyword, page)
        if not chunk:
            return collected
        collected.extend(chunk)
        page += 1
        # Small delay so we don't hammer the API.
        time.sleep(0.1)

86
crawl/joongna.py Normal file
View File

@@ -0,0 +1,86 @@
import requests
import re
import time
# from bs4 import BeautifulSoup
from datetime import datetime
def get_api_id():
    """Scrape web.joongna.com for the current Next.js build/release id.

    Downloads the home page, locates the ``_app`` chunk script, and
    extracts the 20-character id embedded after the SENTRY_RELEASE
    marker. The id is needed to build ``_next/data`` URLs (see get_url).

    Returns:
        The 20-character id string.

    Raises:
        ValueError: if the app chunk or the release marker is not found.
    """
    base_url = "https://web.joongna.com/"
    home_text = requests.get(base_url).text
    chunk_paths = re.findall(r"_next/static/chunks/pages/_app.*?\.js", home_text)
    if not chunk_paths:
        # BUG FIX: the original indexed [0] unconditionally (IndexError
        # with no context on a site change).
        raise ValueError("could not locate the _app chunk on the home page")
    js_text = requests.get(base_url + chunk_paths[0]).text
    marker = 'iO.SENTRY_RELEASE={id:"'
    pos = js_text.find(marker)
    if pos == -1:
        # BUG FIX: the original ignored find() == -1 and silently
        # returned 20 garbage characters from the start of the file.
        raise ValueError("SENTRY_RELEASE marker not found in app chunk")
    # Offset kept at marker start + 24 for parity with the original
    # (marker itself is 23 chars) — NOTE(review): the extra +1 looks
    # like it skips one character; confirm against the live bundle.
    start = pos + 24
    # Renamed from `id`, which shadowed the builtin.
    return js_text[start : start + 20]
def get_url(api_id, keyword, page=1):
    """Build the Joongna ``_next/data`` search endpoint URL for one page.

    Args:
        api_id: Next.js build id obtained from get_api_id().
        keyword: Search term (appears both in the path and the query).
        page: Page number forwarded to the API as-is.
    """
    return (
        "https://web.joongna.com/_next/data/"
        f"{api_id}/search/{keyword}.json"
        f"?page={page}&sort=RECENT_SORT&keyword={keyword}"
    )
def search_joongna(api_id, keyword, page):
    """Fetch one page of Joongna search results.

    Args:
        api_id: Next.js build id from get_api_id().
        keyword: Search term.
        page: Page number passed straight through to the API.

    Returns:
        A list of {"title", "price", "year", "month", "day"} dicts
        (date fields are *strings* split from "YYYY-MM-DD HH:MM:SS";
        note bunjang.py returns ints here), or False when the result
        set is exhausted or the page's oldest item falls more than 30
        days before the first of the current month — a falsy value
        signals the caller (get_joongna) to stop paging.
    """
    url = get_url(api_id, keyword, page)
    data = requests.get(url).json()
    result = []
    try:
        queries = data["pageProps"]["dehydratedState"]["queries"]
        if not queries:
            return False
        items = queries[0]["state"]["data"]["data"]["items"]
        if not items:
            return False
        now = datetime.now()
        target = datetime(now.year, now.month, 1)
        last_item_date = datetime.strptime(
            items[-1]["sortDate"], "%Y-%m-%d %H:%M:%S"
        )
        if (target - last_item_date).days > 30:
            return False
        for item in items:
            # sortDate looks like "2023-11-01 19:13:24".
            date_part, _, _time_part = item["sortDate"].partition(" ")
            year, month, day = date_part.split("-")
            result.append(
                {
                    "title": item["title"],
                    "price": item["price"],
                    "year": year,
                    "month": month,
                    "day": day,
                }
            )
    except Exception:
        # Best-effort: dump the payload for debugging and fall through
        # to return whatever was collected. Uses .get() chains so the
        # handler itself cannot re-raise on a missing key (the original
        # could, and the buggy finally masked it).
        print("--------------------------------------------")
        print(url)
        print(data.get("pageProps", {}).get("dehydratedState", {}).get("queries"))
        print("--------------------------------------------")
    # BUG FIX: the original used `finally: return result`, which
    # overrode the early `return False` exits (they actually returned
    # []) and silently swallowed every raised exception.
    return result
def get_joongna(keyword):
    """Crawl all recent Joongna search results for *keyword*.

    Resolves the current API build id once, then fetches successive
    pages via search_joongna until it returns a falsy value, pausing
    briefly between requests.
    """
    api_id = get_api_id()
    collected = []
    page = 1
    while True:
        print(f"page: {page}")
        chunk = search_joongna(api_id, keyword, page)
        if not chunk:
            return collected
        collected.extend(chunk)
        page += 1
        # Small delay so we don't hammer the API.
        time.sleep(0.1)

View File

3
crawl/models.py Normal file
View File

@@ -0,0 +1,3 @@
from django.db import models
# Create your models here.

3
crawl/tests.py Normal file
View File

@@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.