import os

import pymysql
import requests
from bs4 import BeautifulSoup

# Dictionary lookup endpoint; the word is sent as the ``word`` query parameter.
DICTIONARY_URL = 'http://aha-dic.com/View.asp'

# Seconds to wait on the dictionary site before abandoning a single request.
REQUEST_TIMEOUT = 10

# Sentinel returned by parse_phonetic() when no phonetic text is found.
NO_PHONETIC = 'N/A'


def fetch_words_batch(connection, start_pid, limit=100):
    """Fetch the next batch of rows from ``ew_word`` in ascending ``pid`` order.

    Args:
        connection: An open PyMySQL connection.
        start_pid: Lowest ``pid`` (inclusive) to include in the batch.
        limit: Maximum number of rows to return.

    Returns:
        The result of ``cursor.fetchall()``. With the ``DictCursor``
        connection created in ``main()`` each row is a dict with the keys
        ``'pid'`` and ``'word'``; the result is empty when no rows remain.
    """
    sql = "SELECT pid, word FROM ew_word WHERE pid >= %s ORDER BY pid ASC LIMIT %s"
    with connection.cursor() as cursor:
        cursor.execute(sql, (start_pid, limit))
        return cursor.fetchall()


def fetch_page(word, timeout=REQUEST_TIMEOUT):
    """Download the dictionary page for ``word`` and return its HTML text.

    Args:
        word: The word to look up.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status (via ``raise_for_status``).
    """
    # Pass the word through ``params`` so requests URL-encodes it; pasting it
    # into the URL by hand breaks for words containing '&', '#', '+' or spaces.
    # The timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(DICTIONARY_URL, params={'word': word}, timeout=timeout)
    response.raise_for_status()
    return response.text


def parse_phonetic(html):
    """Extract the first phonetic symbol from a dictionary page.

    Looks for ``<div class="phonetic">`` and returns the first non-blank
    text node that is a direct child of that div, stripped of surrounding
    whitespace.

    Args:
        html: HTML source of the dictionary page.

    Returns:
        The phonetic text, or ``'N/A'`` when the div is missing or contains
        no non-blank direct text node.
    """
    soup = BeautifulSoup(html, 'html.parser')
    phonetic_div = soup.find('div', class_='phonetic')
    if phonetic_div is None:
        return NO_PHONETIC

    # Only direct children are inspected; text nodes are str subclasses,
    # nested tags are not and are skipped.
    for child in phonetic_div:
        if isinstance(child, str):
            text = child.strip()
            if text:
                return text
    return NO_PHONETIC


def update_phonetic_symbol(connection, pid, phonetic_symbol):
    """Store ``phonetic_symbol`` on the ``ew_word`` row identified by ``pid``.

    Args:
        connection: An open PyMySQL connection.
        pid: Primary id of the row to update.
        phonetic_symbol: Value written to the ``phonetic_symbol`` column.
    """
    sql = "UPDATE ew_word SET phonetic_symbol = %s WHERE pid = %s"
    with connection.cursor() as cursor:
        cursor.execute(sql, (phonetic_symbol, pid))
    # Redundant while the connection runs with autocommit=True, but keeps the
    # function correct if it is ever handed a non-autocommit connection.
    connection.commit()


def main():
    """Walk every row of ``ew_word`` in batches and fill in phonetic symbols.

    For each word the dictionary page is fetched and parsed; rows with a
    phonetic symbol are updated, and words whose page could not be fetched
    or contained no symbol are collected and listed at the end.
    """
    # MySQL connection settings. Each value can be overridden through the
    # environment; the literals are only fallbacks that preserve the previous
    # behavior.
    # NOTE(review): a password committed to source control should be rotated
    # and the fallback removed once deployments set EW_DB_PASSWORD.
    connection = pymysql.connect(
        host=os.environ.get("EW_DB_HOST", "syye.net"),
        user=os.environ.get("EW_DB_USER", "pythonUser"),
        password=os.environ.get("EW_DB_PASSWORD", "Tjekdfl1324%^"),
        db=os.environ.get("EW_DB_NAME", "English_words"),
        charset='utf8mb4',
        autocommit=True,
        cursorclass=pymysql.cursors.DictCursor
    )

    start_pid = 0
    batch_size = 100
    not_updated_words = []

    try:
        while True:
            # Fetch the next batch of words.
            words = fetch_words_batch(connection, start_pid, batch_size)
            if not words:
                break

            print(f"Fetched {len(words)} words from database starting from pid {start_pid}")

            # Look up and store the phonetic symbol for each word.
            for word_entry in words:
                pid = word_entry['pid']
                word = word_entry['word']
                print(f"\nFetching phonetic for '{word}' (pid: {pid})...")

                try:
                    html = fetch_page(word)
                except requests.RequestException as exc:
                    # One unreachable page must not abort the whole run;
                    # record the word and move on to the next one.
                    print(f"Failed to fetch page for '{word}' (pid: {pid}): {exc}")
                    not_updated_words.append(word)
                    continue

                first_phonetic = parse_phonetic(html)
                print(f"The first phonetic symbol for '{word}' is: {first_phonetic}")

                # Write the phonetic symbol back to the database.
                if first_phonetic == NO_PHONETIC:
                    not_updated_words.append(word)
                else:
                    update_phonetic_symbol(connection, pid, first_phonetic)
                    print(f"Updated phonetic symbol for '{word}' (pid: {pid}) to '{first_phonetic}'")

            # Advance past the last pid seen so the next batch starts after it.
            start_pid = words[-1]['pid'] + 1

    finally:
        connection.close()
        print("Database connection closed")

    # Report the words that were not updated.
    if not_updated_words:
        print("\nWords that were not updated:")
        for word in not_updated_words:
            print(word)
    else:
        print("\nAll words were successfully updated.")


if __name__ == '__main__':
    main()