englishtokorea/daum_english.py
2025-05-28 14:26:49 +09:00

130 lines
4.6 KiB
Python

import pymysql
import requests
from bs4 import BeautifulSoup
import random
import time
import json
def fetch_random_words(connection, limit=1):
    """Pick random words from the ew_word table.

    Args:
        connection: open pymysql connection configured with DictCursor
            (rows are read by column name below).
        limit: how many random words to fetch (default 1).

    Returns:
        A list of word strings.
    """
    query = "SELECT word FROM ew_word ORDER BY RAND() LIMIT %s"
    with connection.cursor() as cur:
        # Parameterized LIMIT keeps the query injection-safe.
        cur.execute(query, (limit,))
        rows = cur.fetchall()
    return [row['word'] for row in rows]
def fetch_search_results(search_term, session):
    """Fetch the Daum dictionary search page HTML for *search_term*.

    Args:
        search_term: word to look up; passed as the ``q`` query parameter.
        session: requests.Session, reused so connections/cookies persist
            across lookups.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: if the server returns a non-2xx status.
        requests.Timeout: if no response arrives within 10 seconds.
    """
    # Browser-like headers so the site serves the normal HTML page.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    }
    # Fix: let requests URL-encode the query parameter. The previous
    # f-string concatenation produced a broken URL for terms containing
    # spaces or non-ASCII characters.
    response = session.get(
        'https://dic.daum.net/search.do',
        params={'q': search_term},
        headers=headers,
        timeout=10,  # don't hang forever on a stalled connection
    )
    response.raise_for_status()
    return response.text
def parse_results(html):
    """Extract dictionary fields from a Daum search-result page.

    Returns a dict with keys 'txt_emph1', 'list_search', 'wrap_listen'
    and 'search_box'. List-valued fields are JSON-encoded strings;
    any field whose element is missing from the page is the string 'N/A'.
    """
    soup = BeautifulSoup(html, 'html.parser')

    def strip_listen(text):
        # Drop the "듣기" (listen-button) label and everything after it.
        return text.split("듣기")[0] if "듣기" in text else text

    out = {}

    # Headword: <span class="txt_emph1">
    emph = soup.find('span', class_='txt_emph1')
    out['txt_emph1'] = emph.get_text(strip=True) if emph else 'N/A'

    # Definition list: <ul class="list_search"> -> JSON array of item texts
    definitions = soup.find('ul', class_='list_search')
    if definitions:
        texts = [li.get_text(strip=True) for li in definitions.find_all('li')]
        out['list_search'] = json.dumps(texts, ensure_ascii=False)
    else:
        out['list_search'] = 'N/A'

    # Pronunciation block: <div class="wrap_listen">, truncated at "듣기"
    listen_div = soup.find('div', class_='wrap_listen')
    out['wrap_listen'] = (
        strip_listen(listen_div.get_text(strip=True)) if listen_div else 'N/A'
    )

    # Per-word result cards inside the searchWords box -> JSON array of dicts
    box = soup.find(
        'div', {'name': 'searchWords', 'class': 'search_box', 'data-initamount': '3'}
    )
    if box:
        entries = []
        for card in box.find_all('div', class_='searchItem'):
            word_link = card.find('a', class_='search_word')
            card_defs = card.find('ul', class_='list_search')
            card_listen = card.find('div', class_='wrap_listen')
            entries.append({
                'search_word': word_link.get_text(strip=True) if word_link else 'N/A',
                'list_search': (
                    [li.get_text(strip=True) for li in card_defs.find_all('li')]
                    if card_defs else 'N/A'
                ),
                'wrap_listen': (
                    strip_listen(card_listen.get_text(strip=True))
                    if card_listen else 'N/A'
                ),
            })
        out['search_box'] = json.dumps(entries, ensure_ascii=False)
    else:
        out['search_box'] = 'N/A'

    return out
def display_results(word, results):
    """Print the parsed dictionary fields for *word*, one per line."""
    print(f"\nResults for '{word}':")
    fields = ('txt_emph1', 'list_search', 'wrap_listen', 'search_box')
    for number, field in enumerate(fields, start=1):
        print(f"{number}. {field}: {results[field]}")
def main():
    """Fetch random words from MySQL and look each one up on Daum dictionary."""
    # NOTE(review): database credentials are hardcoded in source control;
    # consider moving them to environment variables or a config file.
    connection = pymysql.connect(
        host="syye.net",
        user="pythonUser",
        password="Tjekdfl1324%^",
        db="English_words",
        charset='utf8mb4',
        autocommit=True,
        cursorclass=pymysql.cursors.DictCursor,
    )
    session = requests.Session()
    try:
        words = fetch_random_words(connection)
        print(f"Fetched words: {words}")
        for word in words:
            print(f"\nSearching for '{word}'...")
            page = fetch_search_results(word, session)
            display_results(word, parse_results(page))
            # Random pause between lookups to avoid hammering the site.
            time.sleep(random.uniform(1, 3))
    finally:
        # Always release the DB connection, even if a lookup raised.
        connection.close()
        print("Database connection closed")
# Run the scraper only when executed directly, not on import.
if __name__ == '__main__':
    main()