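# Scrapes Naver English dictionary entries for the words stored in the
# English_words MySQL database and writes the parsed meanings back to the
# ew_word_details table.
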
import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import json  # import the JSON library

# MySQL connection settings
connection = pymysql.connect(
    host="syye.net",
    user="pythonUser",
    password="Tjekdfl1324%^",
    db="English_words",
    charset='utf8mb4',
    autocommit=True,
    cursorclass=pymysql.cursors.DictCursor
)


# Fetch the word rows to look up from MySQL
def get_words():
    with connection.cursor() as cursor:
        sql = "SELECT pid, word FROM ew_word LIMIT 20"
        cursor.execute(sql)
        result = cursor.fetchall()
        return result


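# Note: save_data assumes an ew_word_details table whose columns match the
# INSERT statement below (ew_word_pid, origin, unit_grade, mean_num,
# word_class, mean, is_first_row).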
# Save the parsed data to the DB
def save_data(word_data, ew_word_pid):
    with connection.cursor() as cursor:
        for row in word_data["rows"]:
            origin = row.get("origin", "")
            unit_grade = row.get("unit_grade", "")
            is_first_row = row.get("is_first_row", False)
            for mean_item in row.get("mean_list", []):
                mean_num = mean_item.get("num", "")
                word_class = mean_item.get("word_class", "")
                mean = mean_item.get("mean", "")
                sql = """
                    INSERT INTO ew_word_details (ew_word_pid, origin, unit_grade, mean_num, word_class, mean, is_first_row)
                    VALUES (%s, %s, %s, %s, %s, %s, %s)
                """
                cursor.execute(sql, (ew_word_pid, origin, unit_grade, mean_num, word_class, mean, is_first_row))


words = get_words()

# Selenium WebDriver setup
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)


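# Small text-cleanup helpers for the scraped dictionary markup.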
def remove_text(original_string, text_to_remove):
    result_string = original_string.replace(text_to_remove, "")
    return result_string


def clean_text(text):
    return text.replace('\t', '').replace('\n', '').strip()


def clean_num_text(text):
    return text.replace('.', '').strip()


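# For reference, the per-word structure built below looks roughly like this
# (values are illustrative; "num" is only filled in for the first row):
# {
#     "word": "...",
#     "rows": [
#         {
#             "origin": "...",
#             "unit_grade": "...",
#             "mean_list": [{"word_class": "...", "mean": "...", "num": "1"}],
#             "is_first_row": True
#         }
#     ]
# }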
# Look up each word in the Naver English dictionary
for word_entry in words:
    pid = word_entry['pid']
    word = word_entry['word']
    url = f'https://en.dict.naver.com/#/search?query={word}'
    driver.get(url)

    # Wait for the page to load
    time.sleep(3)

    # Grab the rendered page source
    html = driver.page_source

    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    word_data = {
        "word": word,
        "rows": []
    }

    first_component = soup.select_one('.component_keyword')
    if first_component:
        rows = first_component.select('.row')
        for i, row in enumerate(rows):
            row_data = {}
            origin = row.select_one('.origin a')
            if origin:
                row_data['origin'] = clean_text(origin.get_text(strip=True))

            unit_grade = row.select_one('.unit_grade .star_grade')
            if unit_grade:
                row_data['unit_grade'] = unit_grade.get('aria-label', '').strip()

            mean_list = row.select('.mean_list li')
            if mean_list:
                mean_data = []
                for li in mean_list:
                    word_class = li.select_one('span.word_class')
                    mean = li.select_one('p.mean')

                    word_class_text = word_class.get_text(strip=True) if word_class else ''
                    mean_text = mean.get_text(strip=True) if mean else ''
                    mean_r_text = remove_text(mean_text, word_class_text)

                    mean_item = {
                        "word_class": word_class_text,
                        "mean": mean_r_text
                    }

                    if i == 0:  # first row only: keep the sense number
                        num = li.select_one('span.num')
                        num_text = clean_num_text(num.get_text(strip=True)) if num else ''
                        mean_item["num"] = num_text

                    mean_data.append(mean_item)
                row_data['mean_list'] = mean_data
            row_data['is_first_row'] = (i == 0)  # flag whether this is the first row

            word_data['rows'].append(row_data)

    # Print and save the result
    print(json.dumps(word_data, ensure_ascii=False, indent=4))
    save_data(word_data, pid)

driver.quit()
connection.close()