|
import gradio as gr |
|
import requests |
|
from bs4 import BeautifulSoup |
|
import urllib.parse |
|
|
|
|
|
def debug_log(message: str):
    """Print *message* to standard output with a ``[DEBUG]`` prefix.

    Minimal debugging/logging helper: the text is written with ``print``
    and nothing is returned.
    """
    line = "[DEBUG] {}".format(message)
    print(line)
|
|
|
def scrape_naver_blog(url: str) -> str: |
|
""" |
|
์ฃผ์ด์ง ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ URL์์ |
|
์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ์ ์ถ์ถํ์ฌ ๋ฐํํฉ๋๋ค. |
|
""" |
|
debug_log("scrape_naver_blog ํจ์ ์์") |
|
debug_log(f"์์ฒญ๋ฐ์ URL: {url}") |
|
|
|
|
|
headers = { |
|
"User-Agent": ( |
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) " |
|
"AppleWebKit/537.36 (KHTML, like Gecko) " |
|
"Chrome/96.0.4664.110 Safari/537.36" |
|
) |
|
} |
|
|
|
try: |
|
|
|
response = requests.get(url, headers=headers) |
|
debug_log("HTTP GET ์์ฒญ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ") |
|
|
|
|
|
if response.status_code != 200: |
|
debug_log(f"์์ฒญ ์คํจ, ์ํ์ฝ๋: {response.status_code}") |
|
return f"์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {response.status_code}" |
|
|
|
|
|
soup = BeautifulSoup(response.text, "html.parser") |
|
debug_log("HTML ํ์ฑ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ") |
|
|
|
|
|
iframe = soup.select_one("iframe#mainFrame") |
|
if not iframe: |
|
|
|
debug_log("iframe#mainFrame ํ๊ทธ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.") |
|
return "๋ณธ๋ฌธ iframe์ ์ฐพ์ ์ ์์ต๋๋ค." |
|
|
|
iframe_src = iframe.get("src") |
|
if not iframe_src: |
|
debug_log("iframe src๊ฐ ์กด์ฌํ์ง ์์ต๋๋ค.") |
|
return "๋ณธ๋ฌธ iframe์ src๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค." |
|
|
|
|
|
|
|
parsed_iframe_url = urllib.parse.urljoin(url, iframe_src) |
|
|
|
|
|
debug_log(f"iframe ํ์ด์ง ์์ฒญ URL: {parsed_iframe_url}") |
|
iframe_response = requests.get(parsed_iframe_url, headers=headers) |
|
debug_log("HTTP GET ์์ฒญ(iframe ํ์ด์ง) ์๋ฃ") |
|
|
|
if iframe_response.status_code != 200: |
|
debug_log(f"iframe ์์ฒญ ์คํจ, ์ํ์ฝ๋: {iframe_response.status_code}") |
|
return f"iframe์์ ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {iframe_response.status_code}" |
|
|
|
|
|
iframe_soup = BeautifulSoup(iframe_response.text, "html.parser") |
|
debug_log("HTML ํ์ฑ(iframe ํ์ด์ง) ์๋ฃ") |
|
|
|
|
|
title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text') |
|
title = title_div.get_text(strip=True) if title_div else "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค." |
|
debug_log(f"์ถ์ถ๋ ์ ๋ชฉ: {title}") |
|
|
|
|
|
content_div = iframe_soup.select_one('.se-main-container') |
|
if content_div: |
|
|
|
content = content_div.get_text("\n", strip=True) |
|
else: |
|
content = "๋ณธ๋ฌธ์ ์ฐพ์ ์ ์์ต๋๋ค." |
|
debug_log("๋ณธ๋ฌธ ์ถ์ถ ์๋ฃ") |
|
|
|
|
|
result = f"[์ ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}" |
|
debug_log("์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ์ ํฉ์ณ ๋ฐํ ์ค๋น ์๋ฃ") |
|
|
|
return result |
|
|
|
except Exception as e: |
|
debug_log(f"์๋ฌ ๋ฐ์: {str(e)}") |
|
return f"์คํฌ๋ํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}" |
|
|
|
|
|
|
|
def main_interface():
    """Build the Gradio interface that wraps ``scrape_naver_blog``.

    Returns:
        A ``gr.Interface`` whose input is a single-line textbox for the
        blog URL and whose output is a textbox showing the scraped result.
        The interface is only constructed here, not launched.
    """
    url_input = gr.Textbox(
        lines=1,
        label="네이버 블로그 링크",
        placeholder="예: https://blog.naver.com/ssboost/222983068507",
    )
    result_output = gr.Textbox(label="결과")

    interface = gr.Interface(
        fn=scrape_naver_blog,
        inputs=url_input,
        outputs=result_output,
        title="네이버 블로그 스크래퍼",
        description=(
            "네이버 블로그 링크를 입력하면 "
            "제목과 본문을 추출하여 표시합니다."
        ),
    )
    return interface
|
|
|
if __name__ == "__main__":
    # Script entry point: build the interface and launch it.
    debug_log("Gradio 앱 실행 시작")
    demo = main_interface()
    demo.launch()
    # Logged once launch() has returned.
    debug_log("Gradio 앱 실행 종료")
|
|