Spaces:
Sleeping
Sleeping
racisimnatehiggas
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -1,140 +1,2 @@
|
|
1 |
-
import io
import json
import os
import time

from duckduckgo_search import DDGS
from flask import Flask, Response, jsonify, request
from gnews import GNews
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
|
11 |
-
|
12 |
-
# Set the cache directory for webdriver_manager and ensure it exists
os.environ["WDM_CACHE_DIR"] = "/app/.wdm"
cache_dir = os.environ["WDM_CACHE_DIR"]

# Ensure the cache directory exists and is writable.
# NOTE(review): the original indentation was lost; chmod is assumed to apply
# only to a directory this process has just created -- confirm.
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir, exist_ok=True)
    os.chmod(cache_dir, 0o777)  # Give read/write/execute permissions

# Manually set the path to chromedriver
CHROMEDRIVER_PATH = '/app/chromedriver'

# Flask application object; routes below are registered on it.
app = Flask(__name__)
|
25 |
-
|
26 |
-
def summarize_webpage(text):
    """Request a summary of *text* from the DDGS chat backend (``gpt-4o-mini``).

    Args:
        text: Article text to summarize.

    Returns:
        The value returned by ``DDGS().chat`` for the summarization prompt,
        or ``None`` when *text* is empty/blank or any exception is raised.
    """
    try:
        # Nothing to summarize: skip the remote call instead of sending a
        # prompt with no article attached.
        if not text or not text.strip():
            print("Error in summarization: empty text")
            return None

        print(f"Summarizing text: {text[:100]}")  # Log first 100 characters
        results = DDGS().chat(
            f"Summarize the key details of the news article at least under 350 words: {text}",
            model='gpt-4o-mini',
        )
        print(f"Summarized text: {results}")  # Log summarized result
        return results
    except Exception as e:
        # Best-effort: callers treat None as "no summary available".
        print(f"Error in summarization: {e}")
        return None
|
35 |
-
|
36 |
-
def _extract_article_text(driver):
    """Return whitespace-normalized article text from the loaded page, or "" on error."""
    try:
        # First try to find main content containers
        main_content = driver.find_elements(
            By.CSS_SELECTOR,
            "article, .article, .content, main, #main-content, .post-content",
        )

        if main_content:
            # Combine text from all matching elements
            content = ' '.join(elem.text for elem in main_content)
        else:
            # Fallback: strip common non-content areas from <body>.
            body = driver.find_element(By.TAG_NAME, "body")
            # Remove everything in one script call. Removing elements one by
            # one from Python makes nested matches (e.g. a nav inside a
            # header) stale once their ancestor is gone, which aborted the
            # whole extraction.
            driver.execute_script(
                "arguments[0].querySelectorAll(arguments[1])"
                ".forEach(function (node) { node.remove(); });",
                body,
                "nav, header, footer, .sidebar, .menu, .advertisement",
            )
            content = body.text

        # Collapse runs of whitespace into single spaces
        return ' '.join(content.split())
    except Exception as e:
        print(f"Error extracting content: {e}")
        return ""


def scrape_and_summarize_article(index, news, driver, summarized_articles, stream):
    """Scrape one article, summarize it, and emit the result as an SSE frame.

    Args:
        index: Zero-based position of the article; used only in log output.
        news: Mapping that must provide a ``'url'`` entry.
        driver: Selenium WebDriver used to load and inspect the page.
        summarized_articles: List that receives a dict with ``title``,
            ``url`` and ``summary`` keys when summarization succeeds.
        stream: Object with ``write(str)`` and ``flush()``; on success it
            receives one ``data: <json>`` line followed by a blank line.

    Returns:
        None. Every exception is caught and logged so that a single failing
        article does not stop the caller's loop.
    """
    try:
        # Get the URL of the news article
        url = news['url']
        print(f"Scraping Article {index + 1} URL: {url}")

        # Open the URL
        driver.get(url)
        time.sleep(2)  # Allow some time for the page to load

        # Scrape the title of the article
        title = driver.title
        print(f"Article Title: {title}")

        content = _extract_article_text(driver)
        if not content:
            print(f"No content found for article {index + 1}. Skipping.")
            return

        print(f"Extracted content length for article {index + 1}: {len(content)}")

        # Summarize the content
        summarized = summarize_webpage(content)
        if not summarized:
            print(f"No summary available for article {index + 1}. Skipping.")
            return

        print(f"Summarized article {index + 1}: {summarized}")
        article_summary = {
            "title": title,
            "url": url,
            "summary": summarized,
        }
        summarized_articles.append(article_summary)

        # Streaming the result as JSON
        stream.write(f"data: {json.dumps(article_summary)}\n\n")
        stream.flush()  # Ensure the data is sent immediately
    except Exception as e:
        print(f"Error during scraping Article {index + 1}: {e}")
|
100 |
-
|
101 |
-
@app.route('/scrape', methods=['GET'])
def scrape_news():
    """Stream summaries of location-based news as Server-Sent Events.

    Fetches a news list through GNews, opens a headless Chrome session, and
    returns a ``text/event-stream`` response whose body yields one SSE frame
    per successfully summarized article.

    Returns:
        A JSON error payload when no articles are found, otherwise a
        streaming ``Response``.
    """
    # Initialize the GNews object with parameters
    google_news = GNews(country="BD", language="en", period='3', max_results=35)

    # Fetch news articles related to a specific location
    news_list = google_news.get_news_by_location('BD')

    if not news_list:
        return jsonify({"error": "No news articles found."})

    # Set up Chrome options for low-resource usage
    chrome_options = Options()
    for flag in (
        "--headless",               # Run in headless mode
        "--disable-gpu",            # Disable GPU acceleration
        "--no-sandbox",             # Required for Linux
        "--disable-dev-shm-usage",  # Reduce memory usage
        "--disable-extensions",     # Disable extensions
        "--disable-logging",        # Disable logging
        "--window-size=1920,1080",  # Chrome expects "width,height", not "WxH"
    ):
        chrome_options.add_argument(flag)

    # Create a Service object pointing to the manually downloaded ChromeDriver
    service = Service(CHROMEDRIVER_PATH)

    # Initialize the WebDriver with options
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Initialize a list to store summarized articles
    summarized_articles = []

    def generate_stream():
        """Yield one SSE frame per summarized article, then close the browser."""
        try:
            for index, news in enumerate(news_list):
                # The helper writes its frame to a file-like object; capture
                # it in memory and hand it to Flask by yielding.
                frame_buffer = io.StringIO()
                scrape_and_summarize_article(
                    index, news, driver, summarized_articles, frame_buffer
                )
                frame = frame_buffer.getvalue()
                if frame:
                    yield frame
        finally:
            # Runs on normal completion, on errors, and when the generator
            # is closed early, so the Chrome process is always released.
            driver.quit()

    return Response(generate_stream(), content_type='text/event-stream')
|
138 |
-
|
139 |
-
if __name__ == "__main__":
    # The interactive debugger must not be exposed on a publicly bound host,
    # so debug mode is opt-in through the FLASK_DEBUG environment variable.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in {"1", "true", "yes"}
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled, threaded=True, use_reloader=False)
|
|
|
1 |
+
from g4f.api import run_api
|
2 |
+
# Invoke g4f's run_api entry point on port 7860
# (presumably starts the API server and blocks while serving -- confirm).
run_api(port=7860)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|