englishtokorea/naver_english.py
2025-05-28 14:26:49 +09:00

134 lines
4.2 KiB
Python

import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import json  # JSON library, used to pretty-print the scraped results
# MySQL connection setup.
# NOTE(review): credentials are hardcoded in source — move host/user/password
# to environment variables or a config file before sharing or deploying.
# autocommit=True means every execute() is committed immediately;
# DictCursor makes fetchall() return a list of dicts keyed by column name.
connection = pymysql.connect(
    host="syye.net",
    user="pythonUser",
    password="Tjekdfl1324%^",
    db="English_words",
    charset='utf8mb4',
    autocommit=True,
    cursorclass=pymysql.cursors.DictCursor
)
def get_words():
    """Fetch up to 20 (pid, word) records from the ew_word table.

    Returns a list of dicts (DictCursor), each with 'pid' and 'word' keys.
    """
    query = "SELECT pid, word FROM ew_word LIMIT 20"
    with connection.cursor() as cursor:
        cursor.execute(query)
        return cursor.fetchall()
def save_data(word_data, ew_word_pid):
    """Persist the scraped dictionary entries for one word into ew_word_details.

    Args:
        word_data: dict with a "rows" list; each row may carry "origin",
            "unit_grade", "is_first_row" and a "mean_list" of meaning dicts
            (keys "num", "word_class", "mean").
        ew_word_pid: primary key of the parent ew_word record.

    Note: a row whose "mean_list" is missing/empty produces no INSERT at all,
    so its origin/unit_grade are dropped — behavior preserved from the
    original implementation.
    """
    # The SQL is loop-invariant: build it once and batch all parameter
    # tuples through executemany instead of one round trip per meaning.
    sql = """
    INSERT INTO ew_word_details (ew_word_pid, origin, unit_grade, mean_num, word_class, mean, is_first_row)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
    params = []
    for row in word_data["rows"]:
        origin = row.get("origin", "")
        unit_grade = row.get("unit_grade", "")
        is_first_row = row.get("is_first_row", False)
        for mean_item in row.get("mean_list", []):
            params.append((
                ew_word_pid,
                origin,
                unit_grade,
                mean_item.get("num", ""),
                mean_item.get("word_class", ""),
                mean_item.get("mean", ""),
                is_first_row,
            ))
    if params:  # skip the round trip entirely when there is nothing to insert
        with connection.cursor() as cursor:
            cursor.executemany(sql, params)
# Load the word list to look up (pid + word for up to 20 rows).
words = get_words()
# Selenium WebDriver setup — webdriver_manager downloads a chromedriver
# matching the locally installed Chrome and returns its path.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)
def remove_text(original_string, text_to_remove):
    """Return *original_string* with every occurrence of *text_to_remove* removed."""
    return original_string.replace(text_to_remove, "")
def clean_text(text):
    """Delete all tabs/newlines inside *text*, then trim surrounding whitespace."""
    # One C-level pass via translate instead of chained .replace() calls.
    return text.translate(str.maketrans("", "", "\t\n")).strip()
def clean_num_text(text):
    """Drop every '.' from *text* and trim surrounding whitespace (e.g. '1.' -> '1')."""
    without_dots = "".join(ch for ch in text if ch != ".")
    return without_dots.strip()
# Look up each word in the Naver English dictionary and persist the parsed
# entries. Bug fix: wrap the whole loop in try/finally so the browser and the
# DB connection are released even when a page fails to load or parse.
try:
    for word_entry in words:
        pid = word_entry['pid']
        word = word_entry['word']
        # NOTE(review): `word` is interpolated raw; if words can contain
        # spaces or '&', urllib.parse.quote(word) would be safer — confirm.
        url = f'https://en.dict.naver.com/#/search?query={word}'
        driver.get(url)
        # Fixed wait for the single-page app to render; a WebDriverWait on
        # '.component_keyword' would be more robust than a flat sleep.
        time.sleep(3)
        # Grab the rendered page and parse it with BeautifulSoup.
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        word_data = {
            "word": word,
            "rows": []
        }
        # Only the first keyword component holds the headword's entries.
        first_component = soup.select_one('.component_keyword')
        if first_component:
            rows = first_component.select('.row')
            for i, row in enumerate(rows):
                row_data = {}
                origin = row.select_one('.origin a')
                if origin:
                    row_data['origin'] = clean_text(origin.get_text(strip=True))
                unit_grade = row.select_one('.unit_grade .star_grade')
                if unit_grade:
                    # Star rating is exposed through the aria-label attribute.
                    row_data['unit_grade'] = unit_grade.get('aria-label', '').strip()
                mean_list = row.select('.mean_list li')
                if mean_list:
                    mean_data = []
                    for li in mean_list:
                        word_class = li.select_one('span.word_class')
                        mean = li.select_one('p.mean')
                        word_class_text = word_class.get_text(strip=True) if word_class else ''
                        mean_text = mean.get_text(strip=True) if mean else ''
                        # The meaning text embeds the word class; strip it out.
                        mean_r_text = remove_text(mean_text, word_class_text)
                        mean_item = {
                            "word_class": word_class_text,
                            "mean": mean_r_text
                        }
                        if i == 0:  # numbered senses only appear on the first row
                            num = li.select_one('span.num')
                            num_text = clean_num_text(num.get_text(strip=True)) if num else ''
                            mean_item["num"] = num_text
                        mean_data.append(mean_item)
                    row_data['mean_list'] = mean_data
                row_data['is_first_row'] = (i == 0)  # flag the headword row
                word_data['rows'].append(row_data)
        # Print for inspection, then persist to MySQL.
        print(json.dumps(word_data, ensure_ascii=False, indent=4))
        save_data(word_data, pid)
finally:
    # Release external resources even on failure (originally leaked on error).
    driver.quit()
    connection.close()