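"""Fetch random English words from the MySQL table ew_word, look each one up on
the Daum dictionary (https://dic.daum.net), parse the result page with
BeautifulSoup, and print the extracted fields."""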
import pymysql
import requests
from bs4 import BeautifulSoup
import random
import time
import json


def fetch_random_words(connection, limit=1):
    """Return `limit` random words from the ew_word table."""
    with connection.cursor() as cursor:
        sql = "SELECT word FROM ew_word ORDER BY RAND() LIMIT %s"
        cursor.execute(sql, (limit,))
        result = cursor.fetchall()
        return [row['word'] for row in result]


def fetch_search_results(search_term, session):
    """Request the Daum dictionary search page for `search_term` and return the HTML."""
    # Build the search URL
    base_url = 'https://dic.daum.net/search.do?q='
    url = f"{base_url}{search_term}"

    # Send the HTTP GET request with browser-like headers; timeout keeps it from hanging
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    }
    response = session.get(url, headers=headers, timeout=10)
    response.raise_for_status()

    return response.text


def parse_results(html):
    """Parse the Daum dictionary result page into a dict of extracted fields."""
    soup = BeautifulSoup(html, 'html.parser')
    results = {}

    # Extract the <span class="txt_emph1"> value
    emph_span = soup.find('span', class_='txt_emph1')
    results['txt_emph1'] = emph_span.get_text(strip=True) if emph_span else 'N/A'

    # Extract the text of <ul class="list_search"> and serialize it as JSON
    list_search = soup.find('ul', class_='list_search')
    if list_search:
        list_items = list_search.find_all('li')
        list_search_data = [li.get_text(strip=True) for li in list_items]
        results['list_search'] = json.dumps(list_search_data, ensure_ascii=False)
    else:
        results['list_search'] = 'N/A'

    # Extract the text of <div class="wrap_listen"> and drop everything after "듣기" ("listen")
    wrap_listen = soup.find('div', class_='wrap_listen')
    if wrap_listen:
        wrap_listen_text = wrap_listen.get_text(strip=True)
        if "듣기" in wrap_listen_text:
            wrap_listen_text = wrap_listen_text.split("듣기")[0]
        results['wrap_listen'] = wrap_listen_text
    else:
        results['wrap_listen'] = 'N/A'

    # Convert <div name="searchWords" class="search_box" data-initamount="3"> into nested JSON
    search_box = soup.find('div', {'name': 'searchWords', 'class': 'search_box', 'data-initamount': '3'})
    if search_box:
        search_items = search_box.find_all('div', class_='searchItem')
        search_box_data = []
        for item in search_items:
            search_word = item.find('a', class_='search_word')
            list_search_item = item.find('ul', class_='list_search')
            wrap_listen_item = item.find('div', class_='wrap_listen')

            # Apply the same "듣기" trimming to each item's wrap_listen text
            if wrap_listen_item:
                item_listen_text = wrap_listen_item.get_text(strip=True)
                if "듣기" in item_listen_text:
                    item_listen_text = item_listen_text.split("듣기")[0]
            else:
                item_listen_text = 'N/A'

            search_box_data.append({
                'search_word': search_word.get_text(strip=True) if search_word else 'N/A',
                'list_search': [li.get_text(strip=True) for li in list_search_item.find_all('li')] if list_search_item else 'N/A',
                'wrap_listen': item_listen_text
            })
        results['search_box'] = json.dumps(search_box_data, ensure_ascii=False)
    else:
        results['search_box'] = 'N/A'

    return results


def display_results(word, results):
    print(f"\nResults for '{word}':")
    print(f"1. txt_emph1: {results['txt_emph1']}")
    print(f"2. list_search: {results['list_search']}")
    print(f"3. wrap_listen: {results['wrap_listen']}")
    print(f"4. search_box: {results['search_box']}")


def main():
    # MySQL connection settings
    connection = pymysql.connect(
        host="syye.net",
        user="pythonUser",
        password="Tjekdfl1324%^",
        db="English_words",
        charset='utf8mb4',
        autocommit=True,
        cursorclass=pymysql.cursors.DictCursor
    )

    session = requests.Session()

    try:
        # Fetch random words from the database
        words = fetch_random_words(connection)
        print(f"Fetched words: {words}")

        # Search each word and display the results
        for word in words:
            print(f"\nSearching for '{word}'...")
            html = fetch_search_results(word, session)
            results = parse_results(html)
            display_results(word, results)

            # Random delay between requests
            time.sleep(random.uniform(1, 3))

    finally:
        connection.close()
        print("Database connection closed")
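

# Illustrative sketch, not wired into main(): parse_results() can be exercised
# offline on a hand-written HTML fragment, without the database or a live request
# to dic.daum.net. The fragment only mimics the markup the parser looks for and is
# an assumption about the real page structure.
def _demo_parse_results():
    sample_html = """
    <span class="txt_emph1">apple</span>
    <ul class="list_search"><li>사과</li></ul>
    <div class="wrap_listen">[æpl]듣기</div>
    """
    demo = parse_results(sample_html)
    print(demo['txt_emph1'])    # apple
    print(demo['list_search'])  # ["사과"]
    print(demo['wrap_listen'])  # [æpl]  (text after "듣기" is trimmed)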


if __name__ == '__main__':
    main()