File size: 4,486 Bytes
24b80cc
 
 
2a06dd2
24b80cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a06dd2
24b80cc
2a06dd2
24b80cc
 
 
 
 
 
2a06dd2
24b80cc
2a06dd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24b80cc
 
2a06dd2
24b80cc
 
 
 
2a06dd2
 
 
 
 
 
24b80cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8615b9e
2a06dd2
 
24b80cc
 
8615b9e
24b80cc
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import gradio as gr
import requests
from bs4 import BeautifulSoup
import urllib.parse  # iframe ๊ฒฝ๋กœ๊ฐ€ ์ƒ๋Œ€๊ฒฝ๋กœ์ผ ๊ฒฝ์šฐ ์ ˆ๋Œ€๊ฒฝ๋กœ๋กœ ๋งŒ๋“ค๊ธฐ ์œ„ํ•ด ์‚ฌ์šฉ

# Debug (logging) helper function
def debug_log(message: str):
    """Print *message* to standard output with a ``[DEBUG]`` prefix.

    A minimal logging helper: the text is written with ``print`` and
    nothing is returned.
    """
    line = "[DEBUG] {}".format(message)
    print(line)

def scrape_naver_blog(url: str) -> str:
    """Extract the title and body text of a Naver blog post.

    The outer blog page at *url* is fetched first; the post itself lives in
    an ``iframe#mainFrame`` element, so the iframe's ``src`` is resolved
    against *url* and fetched with a second request. The title is read from
    ``.se-module.se-module-text.se-title-text`` and the body from
    ``.se-main-container`` of the iframe page.

    Args:
        url: Address of the Naver blog post page.

    Returns:
        A single string containing the title section followed by the body
        section. On any failure (non-200 status, missing iframe, missing
        iframe ``src``, or an exception such as a network error or timeout)
        a human-readable error message string is returned instead; this
        function does not raise.
    """
    debug_log("scrape_naver_blog ํ•จ์ˆ˜ ์‹œ์ž‘")
    debug_log(f"์š”์ฒญ๋ฐ›์€ URL: {url}")

    # Browser-like User-Agent header (helps somewhat against crawler blocking).
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/96.0.4664.110 Safari/537.36"
        )
    }

    # Upper bound (seconds) for each HTTP request. Without a timeout,
    # requests waits indefinitely on a stalled server and would block the
    # caller forever; a timeout raises and is reported via the handler below.
    request_timeout = 10

    try:
        # 1) Request the outer ("main") Naver blog page.
        response = requests.get(url, headers=headers, timeout=request_timeout)
        debug_log("HTTP GET ์š”์ฒญ(๋ฉ”์ธ ํŽ˜์ด์ง€) ์™„๋ฃŒ")

        # Check the response status code.
        if response.status_code != 200:
            debug_log(f"์š”์ฒญ ์‹คํŒจ, ์ƒํƒœ์ฝ”๋“œ: {response.status_code}")
            return f"์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ์ƒํƒœ์ฝ”๋“œ: {response.status_code}"

        # Parse the main page with BeautifulSoup.
        soup = BeautifulSoup(response.text, "html.parser")
        debug_log("HTML ํŒŒ์‹ฑ(๋ฉ”์ธ ํŽ˜์ด์ง€) ์™„๋ฃŒ")

        # 2) Locate the iframe tag that hosts the post.
        iframe = soup.select_one("iframe#mainFrame")
        if not iframe:
            # The iframe itself could not be found.
            debug_log("iframe#mainFrame ํƒœ๊ทธ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
            return "๋ณธ๋ฌธ iframe์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."

        iframe_src = iframe.get("src")
        if not iframe_src:
            debug_log("iframe src๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")
            return "๋ณธ๋ฌธ iframe์˜ src๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."

        # 3) Turn a relative iframe src into an absolute URL
        #    (also handles scheme-relative forms such as //blog.naver.com/...).
        parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)

        # Request the iframe page.
        debug_log(f"iframe ํŽ˜์ด์ง€ ์š”์ฒญ URL: {parsed_iframe_url}")
        iframe_response = requests.get(
            parsed_iframe_url, headers=headers, timeout=request_timeout
        )
        debug_log("HTTP GET ์š”์ฒญ(iframe ํŽ˜์ด์ง€) ์™„๋ฃŒ")

        if iframe_response.status_code != 200:
            debug_log(f"iframe ์š”์ฒญ ์‹คํŒจ, ์ƒํƒœ์ฝ”๋“œ: {iframe_response.status_code}")
            return f"iframe์—์„œ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ์ƒํƒœ์ฝ”๋“œ: {iframe_response.status_code}"

        # 4) Parse the iframe page.
        iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
        debug_log("HTML ํŒŒ์‹ฑ(iframe ํŽ˜์ด์ง€) ์™„๋ฃŒ")

        # Extract the title; fall back to a placeholder message if absent.
        title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
        title = title_div.get_text(strip=True) if title_div else "์ œ๋ชฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
        debug_log(f"์ถ”์ถœ๋œ ์ œ๋ชฉ: {title}")

        # Extract the body; fall back to a placeholder message if absent.
        content_div = iframe_soup.select_one('.se-main-container')
        if content_div:
            # Join text fragments with newlines so the body reads more cleanly.
            content = content_div.get_text("\n", strip=True)
        else:
            content = "๋ณธ๋ฌธ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
        debug_log("๋ณธ๋ฌธ ์ถ”์ถœ ์™„๋ฃŒ")

        # Combine title and body into the final result.
        result = f"[์ œ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
        debug_log("์ œ๋ชฉ๊ณผ ๋ณธ๋ฌธ์„ ํ•ฉ์ณ ๋ฐ˜ํ™˜ ์ค€๋น„ ์™„๋ฃŒ")

        return result

    except Exception as e:
        # Top-level boundary for the UI callback: log the failure and report
        # it to the user as text rather than propagating the exception.
        debug_log(f"์—๋Ÿฌ ๋ฐœ์ƒ: {str(e)}")
        return f"์Šคํฌ๋ž˜ํ•‘ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"


# Gradio interface
def main_interface():
    """Build and return the Gradio ``Interface`` for the scraper.

    The interface wires a single-line text input to ``scrape_naver_blog``
    and shows the returned string in a text output. The interface is only
    constructed here; launching it is left to the caller.
    """
    # Single-line input box for the blog URL.
    url_box = gr.Textbox(
        lines=1,
        label="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ",
        placeholder="์˜ˆ: https://blog.naver.com/ssboost/222983068507",
    )
    # Output box that displays the scraped result text.
    result_box = gr.Textbox(label="๊ฒฐ๊ณผ")

    return gr.Interface(
        fn=scrape_naver_blog,
        inputs=url_box,
        outputs=result_box,
        title="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ์Šคํฌ๋ž˜ํผ",
        description="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด ์ œ๋ชฉ๊ณผ ๋ณธ๋ฌธ์„ ์ถ”์ถœํ•˜์—ฌ ํ‘œ์‹œํ•ฉ๋‹ˆ๋‹ค.",
    )

# Script entry point: build the Gradio interface and launch it, logging
# a debug line before the launch call and another after it returns.
if __name__ == "__main__":
    debug_log("Gradio ์•ฑ ์‹คํ–‰ ์‹œ์ž‘")
    # Construct the interface, then start serving it.
    demo = main_interface()
    demo.launch()
    debug_log("Gradio ์•ฑ ์‹คํ–‰ ์ข…๋ฃŒ")