englishtokorea/fetch_first_phonetic.py
2025-05-28 14:26:49 +09:00

110 lines
3.3 KiB
Python

import pymysql
import requests
from bs4 import BeautifulSoup
def fetch_words_batch(connection, start_pid, limit=100):
    """Read the next page of words from the ``ew_word`` table.

    Rows are selected in ascending ``pid`` order, starting at ``start_pid``
    (inclusive) and capped at ``limit`` rows, so callers can page through
    the table by passing the last seen ``pid`` plus one.

    Args:
        connection: An open DB-API connection whose ``cursor()`` can be used
            as a context manager (a PyMySQL connection in this script).
        start_pid: Smallest ``pid`` to include in the batch.
        limit: Maximum number of rows to return.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the query; with the
        ``DictCursor`` configured in ``main`` each row is a dict holding the
        ``pid`` and ``word`` columns. Empty when no rows remain.
    """
    query = (
        "SELECT pid, word FROM ew_word WHERE pid >= %s ORDER BY pid ASC LIMIT %s"
    )
    with connection.cursor() as cur:
        cur.execute(query, (start_pid, limit))
        # Fetch before the cursor is closed by the context manager.
        return cur.fetchall()
def fetch_page(word, timeout=10):
    """Download the aha-dic.com dictionary page for ``word``.

    Args:
        word: The word to look up. It is sent as the ``word`` query
            parameter of ``View.asp``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole batch run
            indefinitely.

    Returns:
        The response body decoded as text (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection errors and timeouts.
    """
    # Let requests build the query string so that spaces and reserved
    # characters such as '&', '#' or '+' inside the word are percent-encoded
    # instead of corrupting or truncating the URL.
    response = requests.get(
        'http://aha-dic.com/View.asp',
        params={'word': word},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.text
def parse_phonetic(html):
    """Pull the first phonetic transcription out of a dictionary page.

    The page is searched for the first ``<div class="phonetic">``. Among that
    div's direct children, the first string node that is non-empty after
    stripping whitespace is returned; child tags are not descended into.

    Args:
        html: HTML source of the dictionary page.

    Returns:
        The stripped text of the first non-blank string child, or the
        sentinel ``'N/A'`` when the div is missing or has no such text.
    """
    container = BeautifulSoup(html, 'html.parser').find('div', class_='phonetic')
    if not container:
        return 'N/A'

    # Lazily strip each direct string child; tags are skipped by the
    # isinstance filter.
    stripped_texts = (
        node.strip() for node in container if isinstance(node, str)
    )
    return next((text for text in stripped_texts if text), 'N/A')
def update_phonetic_symbol(connection, pid, phonetic_symbol):
    """Store ``phonetic_symbol`` on the ``ew_word`` row identified by ``pid``.

    Args:
        connection: An open DB-API connection whose ``cursor()`` can be used
            as a context manager (a PyMySQL connection in this script).
        pid: Primary-key value of the row to update.
        phonetic_symbol: Text written to the ``phonetic_symbol`` column.

    The change is committed before returning.
    """
    statement = "UPDATE ew_word SET phonetic_symbol = %s WHERE pid = %s"
    values = (phonetic_symbol, pid)
    with connection.cursor() as cur:
        cur.execute(statement, values)
    connection.commit()
def main():
    """Fill in ``ew_word.phonetic_symbol`` for every word in the database.

    Words are read in batches of 100 ordered by ``pid``. For each word the
    dictionary page is downloaded, the first phonetic symbol is parsed out
    and, when one is found, written back to the row. Words whose page could
    not be fetched or that yielded no phonetic symbol are collected and
    listed at the end of the run.

    Connection settings can be overridden through the environment variables
    ``EW_DB_HOST``, ``EW_DB_USER``, ``EW_DB_PASSWORD`` and ``EW_DB_NAME``;
    when unset, the previous built-in values are used.
    """
    import os

    # MySQL connection settings.
    # NOTE(review): the fallback password is a credential committed to source
    # control. It is kept only so existing invocations keep working; it should
    # be rotated and supplied exclusively through EW_DB_PASSWORD.
    connection = pymysql.connect(
        host=os.environ.get("EW_DB_HOST", "syye.net"),
        user=os.environ.get("EW_DB_USER", "pythonUser"),
        password=os.environ.get("EW_DB_PASSWORD", "Tjekdfl1324%^"),
        db=os.environ.get("EW_DB_NAME", "English_words"),
        charset='utf8mb4',
        autocommit=True,
        cursorclass=pymysql.cursors.DictCursor
    )
    start_pid = 0
    batch_size = 100
    not_updated_words = []
    try:
        while True:
            # Fetch the next batch of words.
            words = fetch_words_batch(connection, start_pid, batch_size)
            if not words:
                break
            print(f"Fetched {len(words)} words from database starting from pid {start_pid}")

            # Look up and store the phonetic symbol for each word.
            for word_entry in words:
                pid = word_entry['pid']
                word = word_entry['word']
                print(f"\nFetching phonetic for '{word}' (pid: {pid})...")
                try:
                    html = fetch_page(word)
                except requests.RequestException as exc:
                    # One unreachable or erroring page must not abort the
                    # whole crawl; record the word and move on.
                    print(f"Failed to fetch page for '{word}' (pid: {pid}): {exc}")
                    not_updated_words.append(word)
                    continue

                first_phonetic = parse_phonetic(html)
                print(f"The first phonetic symbol for '{word}' is: {first_phonetic}")

                # 'N/A' is the parser's "nothing found" sentinel; leave the
                # row untouched in that case.
                if first_phonetic == 'N/A':
                    not_updated_words.append(word)
                else:
                    update_phonetic_symbol(connection, pid, first_phonetic)
                    print(f"Updated phonetic symbol for '{word}' (pid: {pid}) to '{first_phonetic}'")

            # Advance past the last pid seen so the next batch starts after it.
            start_pid = words[-1]['pid'] + 1
    finally:
        connection.close()
        print("Database connection closed")

    # Report the words that were not updated.
    if not_updated_words:
        print("\nWords that were not updated:")
        for word in not_updated_words:
            print(word)
    else:
        print("\nAll words were successfully updated.")
# Run the batch job only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()