AIRider committed on
Commit
4ecdb4b
·
verified ·
1 Parent(s): 4431f41

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -44
app.py CHANGED
@@ -1,59 +1,48 @@
1
  import gradio as gr
2
- from selenium import webdriver
3
- from selenium.webdriver.chrome.service import Service
4
- from selenium.webdriver.common.by import By
5
- from selenium.webdriver.chrome.options import Options
6
- from webdriver_manager.chrome import ChromeDriverManager
7
- import time
8
 
9
def scrape_blog(url):
    """Fetch a blog post's title and body text via a headless Chrome session.

    Returns a dict with keys "title", "content" and "debug_logs" (a list of
    progress/error messages collected along the way). Extraction failures are
    reported through the returned fields rather than raised.
    """
    debug_logs = []  # progress/error messages for debugging

    # Configure a headless Chrome suitable for containerized environments.
    chrome_options = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    debug_logs.append("WebDriver initialized.")

    try:
        driver.get(url)
        debug_logs.append(f"Navigated to {url}")

        time.sleep(3)  # fixed wait for the page to finish loading
        debug_logs.append("Waited for page to load.")

        # Title: located via a (brittle) absolute XPath into the page layout.
        try:
            title_xpath = "/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div/div[8]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div[1]/div/div/div[2]/div/p/span"
            title = driver.find_element(By.XPATH, title_xpath).text.strip()
            debug_logs.append(f"Title extracted: {title}")
        except Exception as e:
            title = "Error extracting title"
            debug_logs.append(f"Error extracting title: {e}")

        # Body text: same approach, first paragraph span only.
        try:
            content_xpath = "/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div/div[8]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div[3]/div[4]/div/div/div/p[1]/span"
            content = driver.find_element(By.XPATH, content_xpath).text.strip()
            debug_logs.append(f"Content extracted: {content}")
        except Exception as e:
            content = "Error extracting content"
            debug_logs.append(f"Error extracting content: {e}")

    except Exception as e:
        title = "Error accessing blog"
        content = "Error accessing blog"
        debug_logs.append(f"Error accessing blog: {e}")

    finally:
        # Always release the browser process, even on failure.
        driver.quit()
        debug_logs.append("WebDriver closed.")

    return {"title": title, "content": content, "debug_logs": debug_logs}
58
 
59
  def interface_function(url):
 
1
  import gradio as gr
2
+ import requests
3
+ from bs4 import BeautifulSoup
 
 
 
 
4
 
5
def scrape_blog(url):
    """Fetch a blog post's title and body text with requests + BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the blog page to scrape.

    Returns
    -------
    dict
        {"title": ..., "content": ..., "debug_logs": [...]}. On any failure
        the title/content fields carry an error string and ``debug_logs``
        records what went wrong; this function never raises to the caller.
    """
    debug_logs = []  # progress/error messages for debugging

    try:
        # Send the HTTP request. An explicit timeout is required: requests
        # has no default timeout, so an unresponsive host would otherwise
        # hang the app indefinitely. A timeout raises requests.Timeout,
        # which is handled by the outer except below.
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        debug_logs.append(f"Request sent to {url}")

        # Proceed only on a successful response.
        if response.status_code == 200:
            debug_logs.append("Successfully fetched the webpage.")
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract the title (assumes SmartEditor markup:
            # div.se-title-text — TODO confirm against target pages).
            try:
                title_element = soup.select_one("div.se-title-text span")
                title = title_element.get_text(strip=True) if title_element else "Title not found"
                debug_logs.append(f"Title extracted: {title}")
            except Exception as e:
                title = "Error extracting title"
                debug_logs.append(f"Error extracting title: {e}")

            # Extract the body text (first span inside the main container).
            try:
                content_element = soup.select_one("div.se-main-container span")
                content = content_element.get_text(strip=True) if content_element else "Content not found"
                debug_logs.append(f"Content extracted: {content}")
            except Exception as e:
                content = "Error extracting content"
                debug_logs.append(f"Error extracting content: {e}")

        else:
            # Non-200 status: report it without raising.
            title = "Error accessing blog"
            content = "Error accessing blog"
            debug_logs.append(f"Error accessing blog: Status code {response.status_code}")

    except Exception as e:
        # Network errors, timeouts, DNS failures, etc.
        title = "Error accessing blog"
        content = "Error accessing blog"
        debug_logs.append(f"Error accessing blog: {e}")

    return {"title": title, "content": content, "debug_logs": debug_logs}
47
 
48
  def interface_function(url):