girishwangikar committed on
Commit
fd1209e
·
verified ·
1 Parent(s): 5e6d4d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -45
app.py CHANGED
@@ -3,17 +3,12 @@ import pandas as pd
3
  from smolagents import CodeAgent, tool
4
  from typing import Union, List, Dict
5
  from duckduckgo_search import DDGS
6
- from newspaper import Article
 
7
  from datetime import datetime, timedelta
8
- import nltk
9
  from groq import Groq
10
  import os
11
-
12
- # Download required NLTK data
13
- nltk.download('punkt')
14
- nltk.download('averaged_perceptron_tagger')
15
- nltk.download('maxent_ne_chunker')
16
- nltk.download('words')
17
 
18
  class GroqLLM:
19
  """Compatible LLM interface for smolagents CodeAgent"""
@@ -68,6 +63,32 @@ class NewsAnalysisAgent(CodeAgent):
68
  """
69
  return super().run(enhanced_prompt)
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  @tool
72
  def search_news(query: str, max_results: int = 5) -> str:
73
  """Search for recent news articles using DuckDuckGo.
@@ -107,37 +128,34 @@ def analyze_article(url: str) -> str:
107
  url: URL of the news article to analyze
108
 
109
  Returns:
110
- str: Analysis of the article including summary, key points, and entities
111
  """
112
  try:
113
- # Download and parse article
114
- article = Article(url)
115
- article.download()
116
- article.parse()
117
- article.nlp()
 
 
 
 
 
 
 
 
 
 
118
 
119
  # Store article data
120
  article_data = {
121
  'url': url,
122
- 'title': article.title,
123
- 'summary': article.summary,
124
- 'keywords': article.keywords,
125
- 'publish_date': article.publish_date
126
  }
127
  tool.agent._articles.append(article_data)
128
 
129
- # Format analysis
130
- analysis = f"""
131
- Title: {article.title}
132
-
133
- Summary: {article.summary}
134
-
135
- Key Points:
136
- {', '.join(article.keywords)}
137
-
138
- Publication Date: {article.publish_date}
139
- """
140
-
141
  return analysis
142
  except Exception as e:
143
  return f"Error analyzing article: {str(e)}"
@@ -157,24 +175,20 @@ def identify_trends(articles: List[Dict] = None) -> str:
157
  if not articles:
158
  return "No articles available for trend analysis"
159
 
160
- # Collect all keywords
161
- all_keywords = []
162
- for article in articles:
163
- all_keywords.extend(article.get('keywords', []))
164
-
165
- # Count keyword frequencies
166
- keyword_freq = pd.Series(all_keywords).value_counts()
167
 
168
- # Format trends analysis
169
- trends = f"""
170
- Common Themes:
171
- {', '.join(keyword_freq.head().index)}
 
172
 
173
- Articles Analyzed: {len(articles)}
174
- Timespan: {min(a['publish_date'] for a in articles if a.get('publish_date'))} to {max(a['publish_date'] for a in articles if a.get('publish_date'))}
175
  """
176
 
177
- return trends
178
 
179
  def main():
180
  st.title("News Analysis Assistant")
@@ -186,7 +200,7 @@ def main():
186
  tools=[search_news, analyze_article, identify_trends],
187
  model=GroqLLM(),
188
  additional_authorized_imports=[
189
- "newspaper", "nltk", "duckduckgo_search", "pandas"
190
  ]
191
  )
192
 
 
3
  from smolagents import CodeAgent, tool
4
  from typing import Union, List, Dict
5
  from duckduckgo_search import DDGS
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
  from datetime import datetime, timedelta
 
9
  from groq import Groq
10
  import os
11
+ import re
 
 
 
 
 
12
 
13
  class GroqLLM:
14
  """Compatible LLM interface for smolagents CodeAgent"""
 
63
  """
64
  return super().run(enhanced_prompt)
65
 
66
def extract_text_from_url(url: str) -> str:
    """Fetch *url* and return the readable article text.

    Scripts, styles, and page chrome (nav/header/footer/aside) are removed,
    then only paragraph (<p>) text is kept and whitespace-normalized.

    Args:
        url: Address of the news article to scrape.

    Returns:
        str: The extracted text, or an ``"Error extracting text: ..."``
        message on any failure (network error, bad HTTP status, parse error).
    """
    try:
        # Present a desktop-browser UA: many news sites reject the default
        # python-requests client.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Parse the raw bytes so BeautifulSoup can honor the document's own
        # charset declaration. response.text would decode via the HTTP header
        # only, and requests falls back to ISO-8859-1 when the header lacks a
        # charset — mojibake for non-ASCII articles.
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove scripts, styles, and navigation elements.
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            element.decompose()

        # Keep only non-empty paragraph text.
        paragraphs = soup.find_all('p')
        text = ' '.join(p.get_text().strip() for p in paragraphs if p.get_text().strip())

        # Collapse all whitespace runs into single spaces.
        text = re.sub(r'\s+', ' ', text)
        return text

    except Exception as e:
        # Best-effort by design: callers feed the returned string (even an
        # error message) onward as article content rather than crashing.
        return f"Error extracting text: {str(e)}"
91
+
92
  @tool
93
  def search_news(query: str, max_results: int = 5) -> str:
94
  """Search for recent news articles using DuckDuckGo.
 
128
  url: URL of the news article to analyze
129
 
130
  Returns:
131
+ str: Analysis of the article including summary and key points
132
  """
133
  try:
134
+ # Extract text content
135
+ content = extract_text_from_url(url)
136
+
137
+ # Use LLM to generate summary and analysis
138
+ analysis_prompt = f"""
139
+ Please analyze this article content and provide:
140
+ 1. A brief summary (2-3 sentences)
141
+ 2. Key points (3-5 main takeaways)
142
+ 3. Main topics/themes discussed
143
+
144
+ Article content:
145
+ {content[:3000]} # Limit content length for token constraints
146
+ """
147
+
148
+ analysis = tool.agent.model(analysis_prompt)
149
 
150
  # Store article data
151
  article_data = {
152
  'url': url,
153
+ 'content': content[:1000], # Store truncated content
154
+ 'analysis': analysis,
155
+ 'date': datetime.now().strftime('%Y-%m-%d')
 
156
  }
157
  tool.agent._articles.append(article_data)
158
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  return analysis
160
  except Exception as e:
161
  return f"Error analyzing article: {str(e)}"
 
175
  if not articles:
176
  return "No articles available for trend analysis"
177
 
178
+ # Combine all analyses for trend identification
179
+ combined_analyses = "\n".join(article['analysis'] for article in articles)
 
 
 
 
 
180
 
181
+ trend_prompt = f"""
182
+ Based on the analyses of {len(articles)} articles, please identify:
183
+ 1. Common themes or topics across articles
184
+ 2. Any notable patterns or trends
185
+ 3. Different perspectives or viewpoints presented
186
 
187
+ Combined analyses:
188
+ {combined_analyses}
189
  """
190
 
191
+ return tool.agent.model(trend_prompt)
192
 
193
  def main():
194
  st.title("News Analysis Assistant")
 
200
  tools=[search_news, analyze_article, identify_trends],
201
  model=GroqLLM(),
202
  additional_authorized_imports=[
203
+ "requests", "bs4", "duckduckgo_search", "pandas"
204
  ]
205
  )
206