racisimnatehiggas committed
Commit 4a3f851 · verified · 1 Parent(s): d2d356e

Update app.py

Files changed (1): app.py +2 -140
app.py CHANGED
@@ -1,140 +1,2 @@
- import os
- import json
- from gnews import GNews
- from selenium import webdriver
- from selenium.webdriver.chrome.service import Service
- from selenium.webdriver.common.by import By
- from selenium.webdriver.chrome.options import Options
- import time
- from duckduckgo_search import DDGS
- from flask import Flask, Response, jsonify, request
-
- # Set the cache directory for webdriver_manager and ensure it exists
- os.environ["WDM_CACHE_DIR"] = "/app/.wdm"
- cache_dir = os.environ["WDM_CACHE_DIR"]
-
- # Ensure the cache directory exists and is writable
- if not os.path.exists(cache_dir):
-     os.makedirs(cache_dir, exist_ok=True)
-     os.chmod(cache_dir, 0o777)  # Give read/write/execute permissions
-
- # Manually set the path to chromedriver
- CHROMEDRIVER_PATH = '/app/chromedriver'
-
- app = Flask(__name__)
-
- def summarize_webpage(text):
-     try:
-         print(f"Summarizing text: {text[:100]}")  # Log first 100 characters
-         results = DDGS().chat(f"Summarize the key details of the news article at least under 350 words: {text}", model='gpt-4o-mini')
-         print(f"Summarized text: {results}")  # Log summarized result
-         return results
-     except Exception as e:
-         print(f"Error in summarization: {e}")
-         return None
-
- def scrape_and_summarize_article(index, news, driver, summarized_articles, stream):
-     try:
-         # Get the URL of the news article
-         url = news['url']
-         print(f"Scraping Article {index + 1} URL: {url}")
-
-         # Open the URL
-         driver.get(url)
-         time.sleep(2)  # Allow some time for the page to load
-
-         # Scrape the title of the article
-         title = driver.title
-         print(f"Article Title: {title}")
-
-         # Extract main content with multiple approaches
-         try:
-             # First try to find main content containers
-             main_content = driver.find_elements(By.CSS_SELECTOR,
-                 "article, .article, .content, main, #main-content, .post-content")
-
-             if main_content:
-                 # Combine text from all matching elements
-                 content = ' '.join([elem.text for elem in main_content])
-             else:
-                 # Fallback: Try to exclude common non-content areas
-                 body = driver.find_element(By.TAG_NAME, "body")
-                 # Remove navigation, header, footer, sidebar elements
-                 for elem in body.find_elements(By.CSS_SELECTOR,
-                         "nav, header, footer, .sidebar, .menu, .advertisement"):
-                     driver.execute_script("arguments[0].remove();", elem)
-                 content = body.text
-
-             # Clean the content
-             content = ' '.join(content.split())  # Remove extra whitespace
-         except Exception as e:
-             print(f"Error extracting content: {e}")
-             content = ""
-
-         if content:
-             print(f"Extracted content length for article {index + 1}: {len(content)}")
-
-             # Summarize the content
-             summarized = summarize_webpage(content)
-
-             if summarized:
-                 print(f"Summarized article {index + 1}: {summarized}")
-                 # Create a dictionary for the article
-                 article_summary = {
-                     "title": title,
-                     "url": url,
-                     "summary": summarized
-                 }
-                 summarized_articles.append(article_summary)
-
-                 # Streaming the result as JSON
-                 stream.write(f"data: {json.dumps(article_summary)}\n\n")
-                 stream.flush()  # Ensure the data is sent immediately
-             else:
-                 print(f"No summary available for article {index + 1}. Skipping.")
-         else:
-             print(f"No content found for article {index + 1}. Skipping.")
-
-     except Exception as e:
-         print(f"Error during scraping Article {index + 1}: {e}")
-
- @app.route('/scrape', methods=['GET'])
- def scrape_news():
-     # Initialize the GNews object with parameters
-     google_news = GNews(country="BD", language="en", period='3', max_results=35)
-
-     # Fetch news articles related to a specific location
-     news_list = google_news.get_news_by_location('BD')
-
-     if not news_list:
-         return jsonify({"error": "No news articles found."})
-
-     # Set up Chrome options for low-resource usage
-     chrome_options = Options()
-     chrome_options.add_argument("--headless")  # Run in headless mode
-     chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
-     chrome_options.add_argument("--no-sandbox")  # Required for Linux
-     chrome_options.add_argument("--disable-dev-shm-usage")  # Reduce memory usage
-     chrome_options.add_argument("--disable-extensions")  # Disable extensions
-     chrome_options.add_argument("--disable-logging")  # Disable logging
-     chrome_options.add_argument("--window-size=1920x1080")  # Set default window size
-
-     # Create a Service object pointing to the manually downloaded ChromeDriver
-     service = Service(CHROMEDRIVER_PATH)
-
-     # Initialize the WebDriver with options
-     driver = webdriver.Chrome(service=service, options=chrome_options)
-
-     # Initialize a list to store summarized articles
-     summarized_articles = []
-
-     # Return an HTTP stream for Server-Sent Events (SSE)
-     def generate_stream():
-         for index, news in enumerate(news_list):
-             scrape_and_summarize_article(index, news, driver, summarized_articles, response.stream)
-         driver.quit()  # Close the browser after processing all articles
-
-     return Response(generate_stream(), content_type='text/event-stream')
-
- if __name__ == "__main__":
-     app.run(host="0.0.0.0", port=7860, debug=True, threaded=True, use_reloader=False)
+ from g4f.api import run_api
+ run_api(port=7860)
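
After this change, app.py does nothing but boot g4f's API server on port 7860. Assuming that server exposes the usual OpenAI-compatible /v1/chat/completions route (an assumption about g4f's defaults, not something this commit configures), a quick smoke test might look like:

import requests

# Hypothetical smoke test for the new app.py; assumes g4f serves an
# OpenAI-compatible /v1/chat/completions endpoint on port 7860.
resp = requests.post(
    "http://localhost:7860/v1/chat/completions",
    json={
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": "Say hello."}],
    },
    timeout=60,
)
print(resp.json()["choices"][0]["message"]["content"])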
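
One note on the removed implementation, for anyone reverting to it: generate_stream() passes response.stream to scrape_and_summarize_article, but no response object is ever defined, and the function contains no yield, so calling it runs the whole loop eagerly and returns None instead of a generator. A minimal sketch of the generator-based SSE pattern Flask expects, with illustrative stand-in data rather than the real scraper:

import json
from flask import Flask, Response

app = Flask(__name__)

@app.route('/scrape')
def scrape_news():
    # Stand-in for the scraped article summaries.
    items = [{"title": f"Article {i}", "summary": "..."} for i in range(3)]

    def generate_stream():
        # Yielding each event makes this a generator Flask can stream
        # incrementally, instead of writing to an undefined stream object.
        for item in items:
            yield f"data: {json.dumps(item)}\n\n"

    return Response(generate_stream(), content_type='text/event-stream')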