racisimnatehiggas committed
Commit 4a3f851 · verified · 1 Parent(s): d2d356e

Update app.py

Files changed (1): app.py +2 -140
app.py CHANGED
@@ -1,140 +1,2 @@
- import os
- import json
- from gnews import GNews
- from selenium import webdriver
- from selenium.webdriver.chrome.service import Service
- from selenium.webdriver.common.by import By
- from selenium.webdriver.chrome.options import Options
- import time
- from duckduckgo_search import DDGS
- from flask import Flask, Response, jsonify, request
-
- # Set the cache directory for webdriver_manager and ensure it exists
- os.environ["WDM_CACHE_DIR"] = "/app/.wdm"
- cache_dir = os.environ["WDM_CACHE_DIR"]
-
- # Ensure the cache directory exists and is writable
- if not os.path.exists(cache_dir):
-     os.makedirs(cache_dir, exist_ok=True)
-     os.chmod(cache_dir, 0o777)  # Give read/write/execute permissions
-
- # Manually set the path to chromedriver
- CHROMEDRIVER_PATH = '/app/chromedriver'
-
- app = Flask(__name__)
-
- def summarize_webpage(text):
-     try:
-         print(f"Summarizing text: {text[:100]}")  # Log first 100 characters
-         results = DDGS().chat(f"Summarize the key details of the news article at least under 350 words: {text}", model='gpt-4o-mini')
-         print(f"Summarized text: {results}")  # Log summarized result
-         return results
-     except Exception as e:
-         print(f"Error in summarization: {e}")
-         return None
-
- def scrape_and_summarize_article(index, news, driver, summarized_articles, stream):
-     try:
-         # Get the URL of the news article
-         url = news['url']
-         print(f"Scraping Article {index + 1} URL: {url}")
-
-         # Open the URL
-         driver.get(url)
-         time.sleep(2)  # Allow some time for the page to load
-
-         # Scrape the title of the article
-         title = driver.title
-         print(f"Article Title: {title}")
-
-         # Extract main content with multiple approaches
-         try:
-             # First try to find main content containers
-             main_content = driver.find_elements(By.CSS_SELECTOR,
-                 "article, .article, .content, main, #main-content, .post-content")
-
-             if main_content:
-                 # Combine text from all matching elements
-                 content = ' '.join([elem.text for elem in main_content])
-             else:
-                 # Fallback: Try to exclude common non-content areas
-                 body = driver.find_element(By.TAG_NAME, "body")
-                 # Remove navigation, header, footer, sidebar elements
-                 for elem in body.find_elements(By.CSS_SELECTOR,
-                         "nav, header, footer, .sidebar, .menu, .advertisement"):
-                     driver.execute_script("arguments[0].remove();", elem)
-                 content = body.text
-
-             # Clean the content
-             content = ' '.join(content.split())  # Remove extra whitespace
-         except Exception as e:
-             print(f"Error extracting content: {e}")
-             content = ""
-
-         if content:
-             print(f"Extracted content length for article {index + 1}: {len(content)}")
-
-             # Summarize the content
-             summarized = summarize_webpage(content)
-
-             if summarized:
-                 print(f"Summarized article {index + 1}: {summarized}")
-                 # Create a dictionary for the article
-                 article_summary = {
-                     "title": title,
-                     "url": url,
-                     "summary": summarized
-                 }
-                 summarized_articles.append(article_summary)
-
-                 # Streaming the result as JSON
-                 stream.write(f"data: {json.dumps(article_summary)}\n\n")
-                 stream.flush()  # Ensure the data is sent immediately
-             else:
-                 print(f"No summary available for article {index + 1}. Skipping.")
-         else:
-             print(f"No content found for article {index + 1}. Skipping.")
-
-     except Exception as e:
-         print(f"Error during scraping Article {index + 1}: {e}")
-
- @app.route('/scrape', methods=['GET'])
- def scrape_news():
-     # Initialize the GNews object with parameters
-     google_news = GNews(country="BD", language="en", period='3', max_results=35)
-
-     # Fetch news articles related to a specific location
-     news_list = google_news.get_news_by_location('BD')
-
-     if not news_list:
-         return jsonify({"error": "No news articles found."})
-
-     # Set up Chrome options for low-resource usage
-     chrome_options = Options()
-     chrome_options.add_argument("--headless")  # Run in headless mode
-     chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
-     chrome_options.add_argument("--no-sandbox")  # Required for Linux
-     chrome_options.add_argument("--disable-dev-shm-usage")  # Reduce memory usage
-     chrome_options.add_argument("--disable-extensions")  # Disable extensions
-     chrome_options.add_argument("--disable-logging")  # Disable logging
-     chrome_options.add_argument("--window-size=1920x1080")  # Set default window size
-
-     # Create a Service object pointing to the manually downloaded ChromeDriver
-     service = Service(CHROMEDRIVER_PATH)
-
-     # Initialize the WebDriver with options
-     driver = webdriver.Chrome(service=service, options=chrome_options)
-
-     # Initialize a list to store summarized articles
-     summarized_articles = []
-
-     # Return an HTTP stream for Server-Sent Events (SSE)
-     def generate_stream():
-         for index, news in enumerate(news_list):
-             scrape_and_summarize_article(index, news, driver, summarized_articles, response.stream)
-         driver.quit()  # Close the browser after processing all articles
-
-     return Response(generate_stream(), content_type='text/event-stream')
-
- if __name__ == "__main__":
-     app.run(host="0.0.0.0", port=7860, debug=True, threaded=True, use_reloader=False)
+ from g4f.api import run_api
+ run_api(port=7860)
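
After this change, app.py does nothing but boot g4f's API server on port 7860. Assuming that server exposes the usual OpenAI-compatible /v1/chat/completions route (an assumption about g4f's defaults, not something this commit configures), a quick smoke test might look like:

import requests

# Hypothetical smoke test for the new app.py; assumes g4f serves an
# OpenAI-compatible /v1/chat/completions endpoint on port 7860.
resp = requests.post(
    "http://localhost:7860/v1/chat/completions",
    json={
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": "Say hello."}],
    },
    timeout=60,
)
print(resp.json()["choices"][0]["message"]["content"])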
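
One note on the removed implementation, for anyone reverting to it: generate_stream() passes response.stream to scrape_and_summarize_article, but no response object is ever defined, and the function contains no yield, so calling it runs the whole loop eagerly and returns None instead of a generator. A minimal sketch of the generator-based SSE pattern Flask expects, with illustrative stand-in data rather than the real scraper:

import json
from flask import Flask, Response

app = Flask(__name__)

@app.route('/scrape')
def scrape_news():
    # Stand-in for the scraped article summaries.
    items = [{"title": f"Article {i}", "summary": "..."} for i in range(3)]

    def generate_stream():
        # Yielding each event makes this a generator Flask can stream
        # incrementally, instead of writing to an undefined stream object.
        for item in items:
            yield f"data: {json.dumps(item)}\n\n"

    return Response(generate_stream(), content_type='text/event-stream')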