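"""Scrape word entries from aha-dic.com and store them in MySQL.

For each search term read from a CSV file, the script fetches the dictionary
page, extracts the headword, Korean phonetic spelling, pronunciation MP3 link,
meanings, example sentence, and part of speech, downloads the MP3 locally,
and inserts one row per word into the ew_word table via pymysql.
"""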
import os

import requests
import pandas as pd
from bs4 import BeautifulSoup
import pymysql


def fetch_search_results(search_term):
    # Build the dictionary lookup URL for the search term
    base_url = 'http://aha-dic.com/View.asp?word='
    url = f"{base_url}{search_term}"

    # Send the HTTP GET request and raise on a non-2xx response
    response = requests.get(url)
    response.raise_for_status()

    return response.text


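# Note: the f-string in fetch_search_results() does not URL-encode the search
# term. For terms that may contain spaces or non-ASCII characters, one option
# (a sketch, not part of the original flow) is to let requests build the query:
#
#     response = requests.get('http://aha-dic.com/View.asp',
#                             params={'word': search_term})
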
def parse_results(html, download_folder):
    soup = BeautifulSoup(html, 'html.parser')
    results = {}

    # Find the div that contains the lookup result
    result_div = soup.find('div', id='container_result')
    if result_div:
        # Extract the headword text from the 'word' span
        word_span = result_div.find('span', class_='word')
        results['Word'] = word_span.get_text(strip=True) if word_span else ''

        # Extract the 'phoneticKor' text and adjust its HTML
        phonetic_kor_span = result_div.find('span', class_='phoneticKor')
        if phonetic_kor_span:
            for accent_span in phonetic_kor_span.find_all('span', class_='accent'):
                accent_span.name = 'b'   # turn <span class="accent"> into <b>
                accent_span.attrs = {}   # drop all attributes
            phonetic_kor_html = str(phonetic_kor_span)
            phonetic_kor_html = phonetic_kor_html.replace('<span class="phoneticKor">', '').replace('</span>', '')
            results['PhoneticKor'] = phonetic_kor_html
        else:
            results['PhoneticKor'] = ''

        # Extract the MP3 URL from the 'playSound middle' span
        play_sound = result_div.find('span', class_='playSound middle')
        mp3_url = play_sound.get('mp3', '') if play_sound else ''

        if mp3_url:
            full_mp3_url = f"http://aha-dic.com{mp3_url}"
            mp3_filename = os.path.basename(mp3_url)
            download_file(full_mp3_url, os.path.join(download_folder, mp3_filename))
            results['MP3_File'] = mp3_filename
        else:
            results['MP3_File'] = ''

        # Collect the meanings from every ul/li pair
        meanings = []
        ul_elements = result_div.find_all('ul')
        for ul in ul_elements:
            li_elements = ul.find_all('li')
            for li in li_elements:
                meanings.append(li.get_text(strip=True))
        results['Meanings'] = '; '.join(meanings)

        # Extract the example-sentence and part-of-speech panel contents
        example_sentence = ''
        part_of_speech = ''
        panels = result_div.find_all('fieldset', class_='panel')
        for panel in panels:
            legend = panel.find('legend')
            span = panel.find('span')
            if legend and span:
                if '예문' in legend.get_text(strip=True):       # '예문' = example sentence
                    example_sentence = span.decode_contents().replace('<br><br>', '\n').strip()
                elif '품사' in legend.get_text(strip=True):     # '품사' = part of speech
                    part_of_speech = span.get_text(strip=True)

        results['ExampleSentence'] = BeautifulSoup(example_sentence, 'html.parser').get_text(strip=True)
        results['PartOfSpeech'] = part_of_speech

    return results


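# Shape of the dict returned by parse_results() when a result div is found
# (the values below are purely illustrative):
#
#     {
#         'Word': 'apple',
#         'PhoneticKor': '<b>애</b>플',
#         'MP3_File': 'apple.mp3',
#         'Meanings': '사과; ...',
#         'ExampleSentence': '...',
#         'PartOfSpeech': '...',
#     }
#
# If the result div is missing, an empty dict is returned and the caller's
# per-word try/except reports the failure.
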
def download_file(url, local_filename):
    # Download the MP3 file and save it to the given path.
    # stream=True together with iter_content() writes the file in 8 KiB chunks
    # instead of buffering the whole response in memory.
    try:
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        print(f"Downloaded: {local_filename}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")


def save_to_database(data, connection):
    with connection.cursor() as cursor:
        sql = """
            INSERT INTO ew_word (level, word, phonetic_kor, phonetic_symbol, mp3_file, meanings, examplesentence, partofspeech)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        """
        values = (
            '1',                       # level is fixed at 1
            data['Word'],
            data['PhoneticKor'],
            '',                        # phonetic_symbol is left empty
            data['MP3_File'],
            data['Meanings'],
            data['ExampleSentence'],
            data['PartOfSpeech']
        )
        cursor.execute(sql, values)
    connection.commit()


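# The INSERT in save_to_database() assumes an ew_word table with at least the
# columns below; the types are a guess inferred from the inserted values, not
# taken from the real schema:
#
#     CREATE TABLE ew_word (
#         level           VARCHAR(10),
#         word            VARCHAR(255),
#         phonetic_kor    VARCHAR(255),
#         phonetic_symbol VARCHAR(255),
#         mp3_file        VARCHAR(255),
#         meanings        TEXT,
#         examplesentence TEXT,
#         partofspeech    VARCHAR(100)
#     );
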
def main():
    # Path to the CSV file that lists the search terms
    search_terms_file = 'search_terms.csv'
    # Folder where the MP3 files are downloaded
    download_folder = r'D:\_SUNGRO_DEV\python_project\english_project\mp3'

    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    # MySQL connection settings
    connection = pymysql.connect(
        host="syye.net",
        user="pythonUser",
        password="Tjekdfl1324%^",
        db="English_words",
        charset='utf8mb4',
        autocommit=True
    )

    failed_words = []

    try:
        # Load the search terms from the CSV file
        search_terms_df = pd.read_csv(search_terms_file)
        print("CSV file contents:")
        print(search_terms_df.head())  # inspect the CSV contents
        if 'search_term' not in search_terms_df.columns:
            raise KeyError("The CSV file has no 'search_term' column.")

        search_terms = search_terms_df['search_term']

        all_results = []

        # Loop over the search terms, look each one up, and store the result
        for i, search_term in enumerate(search_terms, start=1):
            print(f"Processing {i}/{len(search_terms)}: {search_term}")
            try:
                html = fetch_search_results(search_term)
                result = parse_results(html, download_folder)
                result['SearchTerm'] = search_term  # keep the search term with its result
                all_results.append(result)
                print(f"Finished processing {search_term}")

                # Save the result to the database
                save_to_database(result, connection)
                print(f"Saved {search_term} to database")

            except Exception as e:
                print(f"Error processing {search_term}: {e}")
                failed_words.append({'word': search_term, 'error': str(e)})

    finally:
        connection.close()
        print("Database connection closed")

    # Report the words that could not be saved
    if failed_words:
        print("\nFailed to save the following words:")
        for entry in failed_words:
            print(f"Word: {entry['word']}, Error: {entry['error']}")


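# Expected layout of search_terms.csv (rows are illustrative; main() only
# requires a column named 'search_term'):
#
#     search_term
#     apple
#     banana
#     cherry
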
if __name__ == '__main__':
    main()
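
# Usage (the script filename is illustrative):
#   python aha_dic_scraper.py
# Requires the requests, pandas, beautifulsoup4, and PyMySQL packages, a
# reachable MySQL server with the English_words database, and search_terms.csv
# in the current working directory.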