englishtokorea/search_daum_dict.py
2025-05-28 14:26:49 +09:00

113 lines
3.6 KiB
Python

import pymysql
import requests
from bs4 import BeautifulSoup
import random
import time
def fetch_random_words(connection, limit=10):
    """Return up to *limit* randomly ordered words from the ``ew_word`` table.

    Args:
        connection: Open database connection whose ``cursor()`` result works
            as a context manager and yields rows that can be indexed by the
            ``'word'`` key (e.g. a PyMySQL ``DictCursor``).
        limit: Maximum number of words to fetch.

    Returns:
        A list holding the ``word`` value of every fetched row.
    """
    query = "SELECT word FROM ew_word ORDER BY RAND() LIMIT %s"
    with connection.cursor() as cur:
        # The limit is bound as a query parameter, not formatted into the SQL.
        cur.execute(query, (limit,))
        rows = cur.fetchall()
    return [record['word'] for record in rows]
def fetch_search_results(search_term, session, timeout=10):
    """Fetch the Daum dictionary search page for *search_term*.

    Args:
        search_term: Text to look up. It is sent as the ``q`` query parameter
            and URL-encoded by the HTTP client, so spaces, ``&``, ``#`` and
            non-ASCII characters are transmitted intact.
        session: Object with a ``requests.Session``-compatible ``get`` method.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The response body decoded as text.

    Raises:
        Whatever ``session.get`` raises on transport failure or timeout, and
        whatever ``response.raise_for_status()`` raises for a 4xx/5xx reply.
    """
    url = 'https://dic.daum.net/search.do'
    # Browser-like request headers.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        # "br" is deliberately not advertised: brotli bodies are only decoded
        # when an optional brotli package is installed, and an undecoded body
        # would reach the parser as garbage.
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    # Send the GET request; the query string is built (and escaped) from params.
    response = session.get(
        url,
        params={'q': search_term},
        headers=headers,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.text
def parse_results(html):
    """Extract four fragments from a Daum dictionary search page.

    Args:
        html: Markup of the search result page.

    Returns:
        A dict with these keys, each set to ``'N/A'`` when the corresponding
        element is not found:

        * ``'txt_emph1'``   - stripped text of the first ``<span class="txt_emph1">``.
        * ``'list_search'`` - stripped text of the first ``<ul class="list_search">``.
        * ``'wrap_listen'`` - stripped text of the first ``<div class="wrap_listen">``,
          cut off at the first occurrence of "듣기".
        * ``'search_box'``  - full HTML of the ``<div name="searchWords"
          class="search_box" data-initamount="3">`` element.
    """
    soup = BeautifulSoup(html, 'html.parser')

    def stripped_text(tag):
        # Text content of a found element, or the placeholder when absent.
        if tag:
            return tag.get_text(strip=True)
        return 'N/A'

    emphasis_tag = soup.find('span', class_='txt_emph1')
    meaning_list = soup.find('ul', class_='list_search')
    listen_wrap = soup.find('div', class_='wrap_listen')
    words_box = soup.find(
        'div',
        {'name': 'searchWords', 'class': 'search_box', 'data-initamount': '3'},
    )

    if listen_wrap:
        # Keep only what precedes the "듣기" label; partition leaves the text
        # untouched when the label is not present.
        listen_text = listen_wrap.get_text(strip=True).partition("듣기")[0]
    else:
        listen_text = 'N/A'

    return {
        'txt_emph1': stripped_text(emphasis_tag),
        'list_search': stripped_text(meaning_list),
        'wrap_listen': listen_text,
        # This entry keeps the element's markup rather than its text.
        'search_box': str(words_box) if words_box else 'N/A',
    }
def display_results(word, results):
    """Print the parsed fragments for *word* to standard output.

    Args:
        word: The term that was searched; shown in the heading line.
        results: Mapping with the keys ``'txt_emph1'``, ``'list_search'``,
            ``'wrap_listen'`` and ``'search_box'``.
    """
    def report_lines():
        # Lazily produced, so each line is printed before the next key is read.
        yield f"\nResults for '{word}':"
        yield f"1. txt_emph1: {results['txt_emph1']}"
        yield f"2. list_search: {results['list_search']}"
        yield f"3. wrap_listen: {results['wrap_listen']}"
        yield f"4. search_box: {results['search_box']}"

    for line in report_lines():
        print(line)
def main():
    """Look up a random batch of stored words on the Daum dictionary.

    Connects to the MySQL database, fetches random words from it, downloads
    and parses the search page for each word, and prints the parsed fragments.
    The HTTP session and the database connection are both released on exit,
    including when an error interrupts the run.

    Connection settings can be overridden with the environment variables
    ``EW_DB_HOST``, ``EW_DB_USER``, ``EW_DB_PASSWORD`` and ``EW_DB_NAME``.
    """
    import os

    # MySQL connection settings.
    # NOTE(review): the fallback literals keep the script working as before,
    # but a password committed to source control should be rotated and then
    # supplied only through the environment.
    connection = pymysql.connect(
        host=os.environ.get("EW_DB_HOST", "syye.net"),
        user=os.environ.get("EW_DB_USER", "pythonUser"),
        password=os.environ.get("EW_DB_PASSWORD", "Tjekdfl1324%^"),
        db=os.environ.get("EW_DB_NAME", "English_words"),
        charset='utf8mb4',
        autocommit=True,
        cursorclass=pymysql.cursors.DictCursor
    )
    try:
        # The context manager closes the session's pooled connections on exit.
        with requests.Session() as session:
            # Fetch the random words (fetch_random_words' default limit applies).
            words = fetch_random_words(connection)
            print(f"Fetched words: {words}")
            # Search for each word and display the results.
            for index, word in enumerate(words):
                if index:
                    # Random pause between consecutive requests only; there is
                    # nothing to wait for after the last word.
                    time.sleep(random.uniform(1, 3))
                print(f"\nSearching for '{word}'...")
                try:
                    html = fetch_search_results(word, session)
                except requests.RequestException as exc:
                    # One failed lookup should not discard the rest of the batch.
                    print(f"Request for '{word}' failed: {exc}")
                    continue
                results = parse_results(html)
                display_results(word, results)
    finally:
        connection.close()
        print("Database connection closed")
# Run the lookup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()