File size: 4,486 Bytes
24b80cc 2a06dd2 24b80cc 2a06dd2 24b80cc 2a06dd2 24b80cc 2a06dd2 24b80cc 2a06dd2 24b80cc 2a06dd2 24b80cc 2a06dd2 24b80cc 8615b9e 2a06dd2 24b80cc 8615b9e 24b80cc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
import gradio as gr
import requests
from bs4 import BeautifulSoup
import urllib.parse # iframe ๊ฒฝ๋ก๊ฐ ์๋๊ฒฝ๋ก์ผ ๊ฒฝ์ฐ ์ ๋๊ฒฝ๋ก๋ก ๋ง๋ค๊ธฐ ์ํด ์ฌ์ฉ
# Helper function for debug (log) output
def debug_log(message: str):
    """Print a debug line to standard output.

    Args:
        message: Text to emit; it is written after a ``[DEBUG] `` prefix.
    """
    print(f"[DEBUG] {message}")
def scrape_naver_blog(url: str) -> str:
    """Fetch a Naver blog post and return its title and body as one string.

    The page at ``url`` is downloaded, its ``iframe#mainFrame`` element is
    located, the iframe's ``src`` is resolved against ``url`` and downloaded,
    and the title (``.se-module.se-module-text.se-title-text``) and body
    (``.se-main-container``) are extracted from that second document.

    Args:
        url: Address of the blog post. Surrounding whitespace is ignored.

    Returns:
        A string containing a title section followed by a body section, or a
        human-readable error message when a request does not return HTTP 200,
        the iframe (or its ``src``) is missing, or an exception is raised
        while fetching or parsing.
    """
    # NOTE(review): the non-ASCII log/result strings below look like
    # mis-decoded (mojibake) Korean text. They are reproduced exactly as
    # found; confirm against the original UTF-8 file before changing them.
    debug_log("scrape_naver_blog ํจ์ ์์")
    debug_log(f"์์ฒญ๋ฐ์ URL: {url}")

    # Browser-like User-Agent header sent with both requests.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/96.0.4664.110 Safari/537.36"
        )
    }
    # requests never times out unless told to; bound each call (seconds) so
    # an unresponsive server cannot block the caller forever.
    request_timeout = 10

    try:
        # Tolerate stray whitespace around a pasted link.
        url = url.strip()

        # 1) Request the outer page.
        response = requests.get(url, headers=headers, timeout=request_timeout)
        debug_log("HTTP GET ์์ฒญ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")

        # Anything other than HTTP 200 is reported as an error.
        if response.status_code != 200:
            debug_log(f"์์ฒญ ์คํจ, ์ํ์ฝ๋: {response.status_code}")
            return f"์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {response.status_code}"

        # Parse the outer page.
        soup = BeautifulSoup(response.text, "html.parser")
        debug_log("HTML ํ์ฑ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")

        # 2) Locate the iframe element.
        iframe = soup.select_one("iframe#mainFrame")
        if not iframe:
            debug_log("iframe#mainFrame ํ๊ทธ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
            return "๋ณธ๋ฌธ iframe์ ์ฐพ์ ์ ์์ต๋๋ค."

        iframe_src = iframe.get("src")
        if not iframe_src:
            debug_log("iframe src๊ฐ ์กด์ฌํ์ง ์์ต๋๋ค.")
            return "๋ณธ๋ฌธ iframe์ src๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค."

        # 3) Resolve a relative or scheme-relative src against the page URL.
        parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)

        # Request the iframe document.
        debug_log(f"iframe ํ์ด์ง ์์ฒญ URL: {parsed_iframe_url}")
        iframe_response = requests.get(
            parsed_iframe_url, headers=headers, timeout=request_timeout
        )
        debug_log("HTTP GET ์์ฒญ(iframe ํ์ด์ง) ์๋ฃ")

        if iframe_response.status_code != 200:
            debug_log(f"iframe ์์ฒญ ์คํจ, ์ํ์ฝ๋: {iframe_response.status_code}")
            return f"iframe์์ ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {iframe_response.status_code}"

        # 4) Parse the iframe document.
        iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
        debug_log("HTML ํ์ฑ(iframe ํ์ด์ง) ์๋ฃ")

        # Title: falls back to a placeholder message when the node is absent.
        title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
        title = title_div.get_text(strip=True) if title_div else "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
        debug_log(f"์ถ์ถ๋ ์ ๋ชฉ: {title}")

        # Body: text fragments are joined with newlines.
        content_div = iframe_soup.select_one('.se-main-container')
        if content_div:
            content = content_div.get_text("\n", strip=True)
        else:
            content = "๋ณธ๋ฌธ์ ์ฐพ์ ์ ์์ต๋๋ค."
        debug_log("๋ณธ๋ฌธ ์ถ์ถ ์๋ฃ")

        # Combine title and body into the final result.
        result = f"[์ ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
        debug_log("์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ์ ํฉ์ณ ๋ฐํ ์ค๋น ์๋ฃ")
        return result

    except Exception as e:
        # This function is the UI callback, so failures (including request
        # timeouts) are reported as text instead of being raised.
        debug_log(f"์๋ฌ ๋ฐ์: {str(e)}")
        return f"์คํฌ๋ํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
# Gradio interface
def main_interface():
    """Build the Gradio interface that wraps ``scrape_naver_blog``.

    The interface has a single-line textbox as input and a textbox as output.

    Returns:
        The constructed ``gr.Interface`` object (not yet launched).
    """
    # NOTE(review): the label/title/description strings look like mis-decoded
    # (mojibake) Korean text and are kept as found. The description literal
    # was split across two lines in the source and is rejoined here so the
    # string is terminated on the line where it starts.
    interface = gr.Interface(
        fn=scrape_naver_blog,
        inputs=gr.Textbox(
            lines=1,
            label="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ",
            placeholder="์: https://blog.naver.com/ssboost/222983068507"
        ),
        outputs=gr.Textbox(label="๊ฒฐ๊ณผ"),
        title="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ์คํฌ๋ํผ",
        description="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ๋ฅผ ์๋ ฅํ๋ฉด ์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ์ ์ถ์ถํ์ฌ ํ์ํฉ๋๋ค."
    )
    return interface
if __name__ == "__main__":
    # Script entry point: build the interface and start the Gradio app.
    debug_log("Gradio ์ฑ ์คํ ์์")
    demo = main_interface()
    demo.launch()
    # Runs only after launch() has returned. The literal below was split
    # across two lines in the source and is rejoined here.
    debug_log("Gradio ์ฑ ์คํ ์ข๋ฃ")
|