# Page header captured along with the file (not part of the program):
# N-blog / app.py -- Kims12's picture -- "Update app.py" -- 2a06dd2 verified
import gradio as gr
import requests
from bs4 import BeautifulSoup
import urllib.parse  # used to turn a relative iframe path into an absolute URL
# Helper function for debugging (log) output
def debug_log(message: str):
    """Print ``message`` to standard output, prefixed with ``[DEBUG] ``.

    Args:
        message: Text to show after the prefix.
    """
    line = "[DEBUG] {}".format(message)
    print(line)
# Seconds to wait for the server to connect/respond before a request is
# abandoned. Without a timeout, requests.get() can block indefinitely.
_REQUEST_TIMEOUT_SECONDS = 10


def scrape_naver_blog(url: str) -> str:
    """Fetch a Naver blog post and return its title and body as plain text.

    The page at ``url`` is searched for ``iframe#mainFrame``; the document
    that iframe points to is then fetched, and the title
    (``.se-module.se-module-text.se-title-text``) and body
    (``.se-main-container``) are extracted from it.

    Args:
        url: Address of the blog post. Surrounding whitespace is ignored.

    Returns:
        A string containing the title section followed by the body section.
        When a request returns a non-200 status, the iframe (or its ``src``)
        cannot be found, or any exception is raised (including a request
        timeout), a human-readable error message is returned instead.
        Exceptions are never propagated to the caller.
    """
    debug_log("scrape_naver_blog ํ•จ์ˆ˜ ์‹œ์ž‘")
    debug_log(f"์š”์ฒญ๋ฐ›์€ URL: {url}")

    # Pasted links often carry stray spaces/newlines; None becomes "" so the
    # failure is reported through the normal error path below.
    url = (url or "").strip()

    # Browser-like User-Agent header (helps somewhat against crawler blocking).
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/96.0.4664.110 Safari/537.36"
        )
    }

    try:
        # 1) Request the outer ("main") blog page.
        response = requests.get(
            url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS
        )
        debug_log("HTTP GET ์š”์ฒญ(๋ฉ”์ธ ํŽ˜์ด์ง€) ์™„๋ฃŒ")

        # Check the response status code.
        if response.status_code != 200:
            debug_log(f"์š”์ฒญ ์‹คํŒจ, ์ƒํƒœ์ฝ”๋“œ: {response.status_code}")
            return f"์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ์ƒํƒœ์ฝ”๋“œ: {response.status_code}"

        # Parse the main page.
        soup = BeautifulSoup(response.text, "html.parser")
        debug_log("HTML ํŒŒ์‹ฑ(๋ฉ”์ธ ํŽ˜์ด์ง€) ์™„๋ฃŒ")

        # 2) Locate the iframe tag that holds the post.
        iframe = soup.select_one("iframe#mainFrame")
        if not iframe:
            # The iframe itself could not be found.
            debug_log("iframe#mainFrame ํƒœ๊ทธ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
            return "๋ณธ๋ฌธ iframe์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."

        iframe_src = iframe.get("src")
        if not iframe_src:
            debug_log("iframe src๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")
            return "๋ณธ๋ฌธ iframe์˜ src๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."

        # 3) Resolve a relative iframe src against the page URL
        #    (e.g. handles values such as //blog.naver.com/~~~).
        parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)

        # Request the iframe document.
        debug_log(f"iframe ํŽ˜์ด์ง€ ์š”์ฒญ URL: {parsed_iframe_url}")
        iframe_response = requests.get(
            parsed_iframe_url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS
        )
        debug_log("HTTP GET ์š”์ฒญ(iframe ํŽ˜์ด์ง€) ์™„๋ฃŒ")

        if iframe_response.status_code != 200:
            debug_log(f"iframe ์š”์ฒญ ์‹คํŒจ, ์ƒํƒœ์ฝ”๋“œ: {iframe_response.status_code}")
            return f"iframe์—์„œ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ์ƒํƒœ์ฝ”๋“œ: {iframe_response.status_code}"

        # 4) Parse the iframe document.
        iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
        debug_log("HTML ํŒŒ์‹ฑ(iframe ํŽ˜์ด์ง€) ์™„๋ฃŒ")

        # Extract the title, falling back to a placeholder message.
        title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
        title = title_div.get_text(strip=True) if title_div else "์ œ๋ชฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
        debug_log(f"์ถ”์ถœ๋œ ์ œ๋ชฉ: {title}")

        # Extract the body, falling back to a placeholder message.
        content_div = iframe_soup.select_one('.se-main-container')
        if content_div:
            # Join text fragments with "\n" so the body reads more cleanly.
            content = content_div.get_text("\n", strip=True)
        else:
            content = "๋ณธ๋ฌธ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
        debug_log("๋ณธ๋ฌธ ์ถ”์ถœ ์™„๋ฃŒ")

        # Combine title and body into the final result.
        result = f"[์ œ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
        debug_log("์ œ๋ชฉ๊ณผ ๋ณธ๋ฌธ์„ ํ•ฉ์ณ ๋ฐ˜ํ™˜ ์ค€๋น„ ์™„๋ฃŒ")
        return result

    except Exception as e:
        # Deliberate catch-all: this function is used as the UI callback, so
        # every failure (bad URL, network error, timeout) is reported as text
        # in the result box instead of being raised.
        debug_log(f"์—๋Ÿฌ ๋ฐœ์ƒ: {e}")
        return f"์Šคํฌ๋ž˜ํ•‘ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {e}"
# Gradio interface
def main_interface():
    """Build the Gradio UI that wraps ``scrape_naver_blog``.

    Returns:
        A ``gr.Interface`` whose input is a single-line textbox for the blog
        link and whose output is a textbox for the scraped result. The
        interface is only constructed here, not launched.
    """
    # Input component: one-line textbox for the blog post link.
    link_box = gr.Textbox(
        lines=1,
        label="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ",
        placeholder="์˜ˆ: https://blog.naver.com/ssboost/222983068507",
    )

    # Output component: textbox that shows the string returned by the scraper.
    result_box = gr.Textbox(label="๊ฒฐ๊ณผ")

    return gr.Interface(
        fn=scrape_naver_blog,
        inputs=link_box,
        outputs=result_box,
        title="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ์Šคํฌ๋ž˜ํผ",
        description="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด ์ œ๋ชฉ๊ณผ ๋ณธ๋ฌธ์„ ์ถ”์ถœํ•˜์—ฌ ํ‘œ์‹œํ•ฉ๋‹ˆ๋‹ค.",
    )
# Script entry point: runs only when this file is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    debug_log("Gradio ์•ฑ ์‹คํ–‰ ์‹œ์ž‘")
    # Build the interface and keep it in the module-level name `demo`.
    demo = main_interface()
    # NOTE(review): launch() presumably blocks while the server is running, so
    # the log line below would only be printed after shutdown -- confirm
    # against the Gradio documentation.
    demo.launch()
    debug_log("Gradio ์•ฑ ์‹คํ–‰ ์ข…๋ฃŒ")