# File-viewer listing metadata (extraction residue): 110 lines, 3.3 KiB, Python.
import os
from urllib.parse import quote

import pymysql
import requests
from bs4 import BeautifulSoup


def fetch_words_batch(connection, start_pid, limit=100):
    """Fetch one batch of words whose pid is at or above ``start_pid``.

    Runs a parameterized SELECT against the ``ew_word`` table, ordered by
    ascending pid and capped at ``limit`` rows.

    Args:
        connection: Open database connection whose ``cursor()`` result can
            be used as a context manager.
        start_pid: Lowest pid (inclusive) to include in the batch.
        limit: Maximum number of rows to return.

    Returns:
        The rows produced by ``cursor.fetchall()``, each carrying ``pid``
        and ``word`` (dict rows when the connection uses a dict cursor, as
        ``main()`` configures it).
    """
    query = (
        "SELECT pid, word FROM ew_word "
        "WHERE pid >= %s ORDER BY pid ASC LIMIT %s"
    )
    with connection.cursor() as cur:
        cur.execute(query, (start_pid, limit))
        return cur.fetchall()


def fetch_page(word, timeout=10):
    """Download the aha-dic.com dictionary page for ``word``.

    Args:
        word: Word to look up. It is converted with ``str()`` and
            percent-encoded before being placed in the query string.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: For connection failures or timeouts.
    """
    base_url = 'http://aha-dic.com/View.asp?word='
    # Percent-encode the word so characters such as '&', '#', '+' or '%'
    # cannot truncate or alter the query string.
    url = f"{base_url}{quote(str(word), safe='')}"

    # Without an explicit timeout, requests waits indefinitely on an
    # unresponsive server, which would stall the whole batch run.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    return response.text


def parse_phonetic(html):
    """Extract the first phonetic string from a dictionary page.

    Looks up the first ``<div class="phonetic">`` in ``html`` and scans the
    items yielded by iterating that div, keeping only string nodes.

    Args:
        html: Page markup to parse.

    Returns:
        The first string node that is non-empty after stripping
        whitespace, or ``'N/A'`` when the div is missing or holds no such
        string.
    """
    container = BeautifulSoup(html, 'html.parser').find('div', class_='phonetic')
    if not container:
        return 'N/A'

    # Strip each string node lazily and stop at the first one that still
    # has content; non-string nodes (nested tags) are skipped.
    stripped_nodes = (node.strip() for node in container if isinstance(node, str))
    return next((text for text in stripped_nodes if text), 'N/A')


def update_phonetic_symbol(connection, pid, phonetic_symbol):
    """Write ``phonetic_symbol`` to the ``ew_word`` row identified by ``pid``.

    Args:
        connection: Open database connection whose ``cursor()`` result can
            be used as a context manager.
        pid: Value matched against the ``pid`` column of the row to update.
        phonetic_symbol: Value stored in the ``phonetic_symbol`` column.

    Returns:
        None. The change is committed on ``connection`` before returning.
    """
    statement = "UPDATE ew_word SET phonetic_symbol = %s WHERE pid = %s"
    bound_values = (phonetic_symbol, pid)
    with connection.cursor() as cur:
        cur.execute(statement, bound_values)
    connection.commit()


def main():
    """Fill in ``ew_word.phonetic_symbol`` for every word in the database.

    Walks the ``ew_word`` table in pid order, 100 rows at a time. For each
    word it downloads the dictionary page, extracts the first phonetic
    string and writes it back. Words whose page could not be fetched, or
    whose page yielded no phonetic string ('N/A'), are collected and
    printed at the end instead of being updated.

    Connection settings can be overridden with the environment variables
    ``EW_DB_HOST``, ``EW_DB_USER``, ``EW_DB_PASSWORD`` and ``EW_DB_NAME``.
    """
    # NOTE(review): the literals below are the previously hard-coded
    # credentials, kept only as fallbacks so existing runs behave the same.
    # They should be removed from source control and the password rotated.
    connection = pymysql.connect(
        host=os.environ.get("EW_DB_HOST", "syye.net"),
        user=os.environ.get("EW_DB_USER", "pythonUser"),
        password=os.environ.get("EW_DB_PASSWORD", "Tjekdfl1324%^"),
        db=os.environ.get("EW_DB_NAME", "English_words"),
        charset='utf8mb4',
        autocommit=True,
        cursorclass=pymysql.cursors.DictCursor,
    )

    start_pid = 0
    batch_size = 100
    not_updated_words = []

    try:
        while True:
            # Fetch the next batch of words.
            words = fetch_words_batch(connection, start_pid, batch_size)
            if not words:
                break

            print(f"Fetched {len(words)} words from database starting from pid {start_pid}")

            # Extract and store the phonetic symbol for each word.
            for word_entry in words:
                pid = word_entry['pid']
                word = word_entry['word']
                print(f"\nFetching phonetic for '{word}' (pid: {pid})...")

                # A single failed lookup (HTTP error, timeout, connection
                # problem) must not abort the whole run; record the word
                # and move on to the next one.
                try:
                    html = fetch_page(word)
                except requests.RequestException as exc:
                    print(f"Failed to fetch page for '{word}' (pid: {pid}): {exc}")
                    not_updated_words.append(word)
                    continue

                first_phonetic = parse_phonetic(html)
                print(f"The first phonetic symbol for '{word}' is: {first_phonetic}")

                # Only write real results back; 'N/A' means nothing was found.
                if first_phonetic == 'N/A':
                    not_updated_words.append(word)
                else:
                    update_phonetic_symbol(connection, pid, first_phonetic)
                    print(f"Updated phonetic symbol for '{word}' (pid: {pid}) to '{first_phonetic}'")

            # Continue after the last pid seen in this batch.
            start_pid = words[-1]['pid'] + 1

    finally:
        connection.close()
        print("Database connection closed")

    # Report the words that were left without a phonetic symbol.
    if not_updated_words:
        print("\nWords that were not updated:")
        for word in not_updated_words:
            print(word)
    else:
        print("\nAll words were successfully updated.")


# Entry point: start the batch update only when this file is run as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()