import json
import os
import time
from urllib.parse import quote

import pymysql
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# MySQL connection settings.
# SECURITY: credentials were hard-coded in source control. They can now be
# supplied via environment variables; the original literals remain only as
# backward-compatible fallbacks and should be rotated.
connection = pymysql.connect(
    host=os.environ.get("EW_DB_HOST", "syye.net"),
    user=os.environ.get("EW_DB_USER", "pythonUser"),
    password=os.environ.get("EW_DB_PASSWORD", "Tjekdfl1324%^"),
    db="English_words",
    charset="utf8mb4",
    autocommit=True,
    cursorclass=pymysql.cursors.DictCursor,
)


def get_words():
    """Fetch up to 20 (pid, word) rows from the ew_word table."""
    with connection.cursor() as cursor:
        cursor.execute("SELECT pid, word FROM ew_word LIMIT 20")
        return cursor.fetchall()


def save_data(word_data, ew_word_pid):
    """Persist every meaning of every scraped row into ew_word_details.

    word_data is the dict built by the scraping loop below
    ({"word": ..., "rows": [...]}); ew_word_pid is the source row's pid.
    """
    # Loop-invariant: build the parameterized statement once, not per meaning.
    sql = """
        INSERT INTO ew_word_details
            (ew_word_pid, origin, unit_grade, mean_num, word_class, mean, is_first_row)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
    with connection.cursor() as cursor:
        for row in word_data["rows"]:
            origin = row.get("origin", "")
            unit_grade = row.get("unit_grade", "")
            is_first_row = row.get("is_first_row", False)
            for mean_item in row.get("mean_list", []):
                cursor.execute(
                    sql,
                    (
                        ew_word_pid,
                        origin,
                        unit_grade,
                        mean_item.get("num", ""),
                        mean_item.get("word_class", ""),
                        mean_item.get("mean", ""),
                        is_first_row,
                    ),
                )


def remove_text(original_string, text_to_remove):
    """Return original_string with every occurrence of text_to_remove removed."""
    return original_string.replace(text_to_remove, "")


def clean_text(text):
    """Strip tabs, newlines and surrounding whitespace from scraped text."""
    return text.replace('\t', '').replace('\n', '').strip()


def clean_num_text(text):
    """Normalize a meaning number such as '1.' to '1'."""
    return text.replace('.', '').strip()


words = get_words()

# Selenium webdriver setup (downloads a matching chromedriver if needed).
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

try:
    # Look up each word in the Naver English dictionary.
    for word_entry in words:
        pid = word_entry['pid']
        word = word_entry['word']
        # URL-encode the word so spaces/special characters survive the query.
        url = f'https://en.dict.naver.com/#/search?query={quote(word)}'
        driver.get(url)

        # Fixed wait for the single-page app to render its results.
        time.sleep(3)

        # Parse the rendered page with BeautifulSoup.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        word_data = {"word": word, "rows": []}

        first_component = soup.select_one('.component_keyword')
        if first_component:
            for i, row in enumerate(first_component.select('.row')):
                row_data = {}

                origin = row.select_one('.origin a')
                if origin:
                    row_data['origin'] = clean_text(origin.get_text(strip=True))

                unit_grade = row.select_one('.unit_grade .star_grade')
                if unit_grade:
                    row_data['unit_grade'] = unit_grade.get('aria-label', '').strip()

                mean_list = row.select('.mean_list li')
                if mean_list:
                    mean_data = []
                    for li in mean_list:
                        word_class = li.select_one('span.word_class')
                        mean = li.select_one('p.mean')
                        word_class_text = word_class.get_text(strip=True) if word_class else ''
                        mean_text = mean.get_text(strip=True) if mean else ''
                        mean_item = {
                            "word_class": word_class_text,
                            # The meaning text repeats the word class; strip it out.
                            "mean": remove_text(mean_text, word_class_text),
                        }
                        if i == 0:  # only the first row carries numbered meanings
                            num = li.select_one('span.num')
                            mean_item["num"] = clean_num_text(num.get_text(strip=True)) if num else ''
                        mean_data.append(mean_item)
                    row_data['mean_list'] = mean_data

                row_data['is_first_row'] = (i == 0)  # flag the first result row
                word_data['rows'].append(row_data)

        # Dump the scraped structure for inspection, then persist it.
        print(json.dumps(word_data, ensure_ascii=False, indent=4))
        save_data(word_data, pid)
finally:
    # Always release the browser and the DB connection, even on error.
    driver.quit()
    connection.close()