englishtokorea/naver_english.py
2025-05-28 14:26:49 +09:00

134 lines
4.2 KiB
Python

import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import json  # JSON library, used to pretty-print the scraped results
# MySQL connection setup.
# NOTE(review): credentials are hardcoded in source — move host/user/password
# to environment variables or a config file before sharing or deploying.
# autocommit=True means every execute() is committed immediately;
# DictCursor makes fetchall() return a list of dicts keyed by column name.
connection = pymysql.connect(
    host="syye.net",
    user="pythonUser",
    password="Tjekdfl1324%^",
    db="English_words",
    charset='utf8mb4',
    autocommit=True,
    cursorclass=pymysql.cursors.DictCursor
)
def get_words():
    """Fetch up to 20 (pid, word) records from the ew_word table.

    Returns a list of dicts (DictCursor), each with 'pid' and 'word' keys.
    """
    query = "SELECT pid, word FROM ew_word LIMIT 20"
    with connection.cursor() as cursor:
        cursor.execute(query)
        return cursor.fetchall()
def save_data(word_data, ew_word_pid):
    """Persist the scraped dictionary entries for one word into ew_word_details.

    Args:
        word_data: dict with a "rows" list; each row may carry "origin",
            "unit_grade", "is_first_row" and a "mean_list" of meaning dicts
            (keys "num", "word_class", "mean").
        ew_word_pid: primary key of the parent ew_word record.

    Note: a row whose "mean_list" is missing/empty produces no INSERT at all,
    so its origin/unit_grade are dropped — behavior preserved from the
    original implementation.
    """
    # The SQL is loop-invariant: build it once and batch all parameter
    # tuples through executemany instead of one round trip per meaning.
    sql = """
    INSERT INTO ew_word_details (ew_word_pid, origin, unit_grade, mean_num, word_class, mean, is_first_row)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
    params = []
    for row in word_data["rows"]:
        origin = row.get("origin", "")
        unit_grade = row.get("unit_grade", "")
        is_first_row = row.get("is_first_row", False)
        for mean_item in row.get("mean_list", []):
            params.append((
                ew_word_pid,
                origin,
                unit_grade,
                mean_item.get("num", ""),
                mean_item.get("word_class", ""),
                mean_item.get("mean", ""),
                is_first_row,
            ))
    if params:  # skip the round trip entirely when there is nothing to insert
        with connection.cursor() as cursor:
            cursor.executemany(sql, params)
# Load the word list to look up (pid + word for up to 20 rows).
words = get_words()
# Selenium WebDriver setup — webdriver_manager downloads a chromedriver
# matching the locally installed Chrome and returns its path.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)
def remove_text(original_string, text_to_remove):
    """Return *original_string* with every occurrence of *text_to_remove* removed."""
    return original_string.replace(text_to_remove, "")
def clean_text(text):
    """Delete all tabs/newlines inside *text*, then trim surrounding whitespace."""
    # One C-level pass via translate instead of chained .replace() calls.
    return text.translate(str.maketrans("", "", "\t\n")).strip()
def clean_num_text(text):
    """Drop every '.' from *text* and trim surrounding whitespace (e.g. '1.' -> '1')."""
    without_dots = "".join(ch for ch in text if ch != ".")
    return without_dots.strip()
# Look up each word in the Naver English dictionary and persist the parsed
# entries. Bug fix: wrap the whole loop in try/finally so the browser and the
# DB connection are released even when a page fails to load or parse.
try:
    for word_entry in words:
        pid = word_entry['pid']
        word = word_entry['word']
        # NOTE(review): `word` is interpolated raw; if words can contain
        # spaces or '&', urllib.parse.quote(word) would be safer — confirm.
        url = f'https://en.dict.naver.com/#/search?query={word}'
        driver.get(url)
        # Fixed wait for the single-page app to render; a WebDriverWait on
        # '.component_keyword' would be more robust than a flat sleep.
        time.sleep(3)
        # Grab the rendered page and parse it with BeautifulSoup.
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        word_data = {
            "word": word,
            "rows": []
        }
        # Only the first keyword component holds the headword's entries.
        first_component = soup.select_one('.component_keyword')
        if first_component:
            rows = first_component.select('.row')
            for i, row in enumerate(rows):
                row_data = {}
                origin = row.select_one('.origin a')
                if origin:
                    row_data['origin'] = clean_text(origin.get_text(strip=True))
                unit_grade = row.select_one('.unit_grade .star_grade')
                if unit_grade:
                    # Star rating is exposed through the aria-label attribute.
                    row_data['unit_grade'] = unit_grade.get('aria-label', '').strip()
                mean_list = row.select('.mean_list li')
                if mean_list:
                    mean_data = []
                    for li in mean_list:
                        word_class = li.select_one('span.word_class')
                        mean = li.select_one('p.mean')
                        word_class_text = word_class.get_text(strip=True) if word_class else ''
                        mean_text = mean.get_text(strip=True) if mean else ''
                        # The meaning text embeds the word class; strip it out.
                        mean_r_text = remove_text(mean_text, word_class_text)
                        mean_item = {
                            "word_class": word_class_text,
                            "mean": mean_r_text
                        }
                        if i == 0:  # numbered senses only appear on the first row
                            num = li.select_one('span.num')
                            num_text = clean_num_text(num.get_text(strip=True)) if num else ''
                            mean_item["num"] = num_text
                        mean_data.append(mean_item)
                    row_data['mean_list'] = mean_data
                row_data['is_first_row'] = (i == 0)  # flag the headword row
                word_data['rows'].append(row_data)
        # Print for inspection, then persist to MySQL.
        print(json.dumps(word_data, ensure_ascii=False, indent=4))
        save_data(word_data, pid)
finally:
    # Release external resources even on failure (originally leaked on error).
    driver.quit()
    connection.close()