gpt-99 committed on
Commit 613e758 · 1 Parent(s): e40a948

initial commit

Files changed (3)
  1. README.md +1 -1
  2. app.py +372 -0
  3. requirements.txt +81 -0
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Reddit Search
-emoji: 📉
+emoji: 🔍
 colorFrom: purple
 colorTo: pink
 sdk: gradio
app.py ADDED
@@ -0,0 +1,372 @@
+import requests
+import pandas as pd
+import time
+from datetime import datetime
+from dotenv import load_dotenv
+import os
+import gradio as gr
+
+load_dotenv()
+
+XAI_API_KEY = os.getenv("XAI_API_KEY")
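+# NOTE: XAI_API_KEY is read but not used yet; call_groq() below relies on the
+# groq client picking up GROQ_API_KEY from the environment.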
+
+# Global variable to store the most recent analysis results
+GLOBAL_ANALYSIS_STORAGE = {
+    'subreddit': None,
+    'data': None
+}
+
+def call_LLM(query):
+    return call_groq(query)
+
+def call_groq(query):
+    from groq import Groq
+    client = Groq()
+    chat_completion = client.chat.completions.create(
+        messages=[
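+            # The entire prompt is passed as a single system message; a "user"
+            # role message would be the more conventional choice here.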
+            {"role": "system", "content": query}
+        ],
+        model="llama3-8b-8192",
+        temperature=0.5,
+        max_tokens=1024,
+        top_p=1,
+        stop=None,
+        stream=False,
+    )
+
+    return chat_completion.choices[0].message.content
+
+def process(row):
+    """
+    Format the post (title + comment body) so the model sees the full post for now
+    """
+    prompt = f"The below is a reddit post. Take a look and tell me if there is a business problem to be solved here ||| title: {row['post_title']} ||| comment: {row['comment_body']}"
+    return call_LLM(prompt)
+
+# Helper functions: extract_comment_data, fetch_top_comments, fetch_subreddits, fetch_top_posts
+
+def extract_comment_data(comment, post_info):
+    """Extract relevant data from a comment"""
+    return {
+        'subreddit': post_info['subreddit'],
+        'post_title': post_info['title'],
+        'post_score': post_info['score'],
+        'post_created_utc': post_info['created_utc'],
+        'comment_id': comment['data'].get('id'),
+        'comment_author': comment['data'].get('author'),
+        'comment_body': comment['data'].get('body'),
+        'comment_score': comment['data'].get('score', 0),
+        'comment_created_utc': datetime.fromtimestamp(comment['data'].get('created_utc', 0)),
+        'post_url': post_info['url'],
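+        # 'permalink' set in fetch_top_posts is already a full URL, so only
+        # the comment id needs to be appended below.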
+        'comment_url': f"{post_info['permalink']}{comment['data'].get('id')}",
+    }
+
+def fetch_top_comments(post_df, num_comments=2):
+    """
+    Fetch top comments for each post in the dataframe, sorted by upvotes
+    """
+    all_comments = []
+    total_posts = len(post_df)
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+
+    print(f"\nFetching top {num_comments} most upvoted comments for {total_posts} posts...")
+
+    for idx, post in post_df.iterrows():
+        print(f"\nProcessing post {idx + 1}/{total_posts}")
+        print(f"Title: {post['title'][:100]}...")
+        print(f"Post Score: {post['score']}, Number of Comments: {post['num_comments']}")
+
+        try:
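+            # Appending .json to a post's permalink returns the post plus its
+            # comment tree from Reddit's public JSON endpoint.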
+            json_url = post['permalink'].replace('https://www.reddit.com', '') + '.json'
+            url = f'https://www.reddit.com{json_url}'
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+            data = response.json()
+
+            if len(data) > 1:
+                comments_data = data[1]['data']['children']
+
+                # Filter out non-comment entries and extract scores
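+                # kind == 't1' marks comments; other kinds (e.g. 'more' stubs) are skipped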
+                valid_comments = [
+                    comment for comment in comments_data
+                    if comment['kind'] == 't1' and comment['data'].get('score') is not None
+                ]
+
+                # Sort comments by score (upvotes) in descending order
+                sorted_comments = sorted(
+                    valid_comments,
+                    key=lambda x: x['data'].get('score', 0),
+                    reverse=True
+                )
+
+                # Take only the top N comments
+                top_comments = sorted_comments[:num_comments]
+
+                # Print comment scores for verification
+                print("\nTop comment scores for this post:")
+                for i, comment in enumerate(top_comments, 1):
+                    score = comment['data'].get('score', 0)
+                    print(f"Comment {i}: {score} upvotes")
+
+                # Add to main list
+                for comment in top_comments:
+                    all_comments.append(extract_comment_data(comment, post))
+
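+            # Pause between posts to stay polite to Reddit's unauthenticated API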
+            time.sleep(2)
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching comments for post {idx + 1}: {e}")
+            continue
+
+    # Create DataFrame from the collected comments
+    comments_df = pd.DataFrame(all_comments)
+
+    if not comments_df.empty:
+        # Verify sorting by showing top comments for each post
+        print("\nVerification of comment sorting:")
+        for post_title in comments_df['post_title'].unique():
+            post_comments = comments_df[comments_df['post_title'] == post_title]
+            print(f"\nPost: {post_title[:100]}...")
+            print("Comment scores:", post_comments['comment_score'].tolist())
+
+    return comments_df
+
+
+def fetch_subreddits(limit=10, min_subscribers=1000):
+    """
+    Fetch subreddits from Reddit
+
+    Args:
+        limit (int): Number of subreddits to fetch
+        min_subscribers (int): Minimum number of subscribers required
+    """
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+    subreddits_data = []
+    after = None
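+    # Reddit paginates listings via an opaque 'after' cursor returned with each page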
+
+    while len(subreddits_data) < limit:
+        try:
+            url = 'https://www.reddit.com/subreddits/popular.json?limit=100'
+            if after:
+                url += f'&after={after}'
+
+            print(f"Fetching subreddits... Current count: {len(subreddits_data)}")
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+            data = response.json()
+
+            for subreddit in data['data']['children']:
+                subreddit_data = subreddit['data']
+
+                if subreddit_data.get('subscribers', 0) >= min_subscribers:
+                    sub_info = {
+                        'display_name': subreddit_data.get('display_name'),
+                        'display_name_prefixed': subreddit_data.get('display_name_prefixed'),
+                        'title': subreddit_data.get('title'),
+                        'subscribers': subreddit_data.get('subscribers', 0),
+                        'active_users': subreddit_data.get('active_user_count', 0),
+                        'created_utc': datetime.fromtimestamp(subreddit_data.get('created_utc', 0)),
+                        'description': subreddit_data.get('description'),
+                        'subreddit_type': subreddit_data.get('subreddit_type'),
+                        'over18': subreddit_data.get('over18', False),
+                        'url': f"https://www.reddit.com/r/{subreddit_data.get('display_name')}/"
+                    }
+                    subreddits_data.append(sub_info)
+
+            after = data['data'].get('after')
+            if not after:
+                print("Reached end of listings")
+                break
+
+            time.sleep(2)
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching data: {e}")
+            break
+
+    # Trim to the requested count: a single 100-item page can overshoot 'limit'
+    return pd.DataFrame(subreddits_data[:limit])
+
+def fetch_top_posts(subreddit, limit=5):
+    """
+    Fetch top posts from a subreddit using Reddit's JSON API
+
+    Args:
+        subreddit (str): Name of the subreddit without the 'r/'
+        limit (int): Maximum number of posts to fetch
+
+    Returns:
+        pd.DataFrame: DataFrame of post records
+    """
+    posts_data = []
+    url = f'https://www.reddit.com/r/{subreddit}/top.json?t=all&limit={limit}'
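+    # t=all requests the top posts of all time (other windows: hour, day, week, month, year)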
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+
+    try:
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()
+        data = response.json()
+
+        for post in data['data']['children']:
+            post_data = post['data']
+            posts_data.append({
+                'subreddit': subreddit,
+                'title': post_data.get('title'),
+                'score': post_data.get('score'),
+                'num_comments': post_data.get('num_comments'),
+                'created_utc': datetime.fromtimestamp(post_data.get('created_utc', 0)),
+                'url': post_data.get('url'),
+                'permalink': 'https://www.reddit.com' + post_data.get('permalink', '')
+            })
+
+        time.sleep(2)
+
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching posts from r/{subreddit}: {e}")
+
+    return pd.DataFrame(posts_data)
+
+
+def show_dataframe(subreddit):
+    # Fetch top posts
+    top_posts = fetch_top_posts(subreddit)
+
+    # Fetch top comments for these posts
+    data_to_analyze = fetch_top_comments(top_posts)
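+    # Each row is one top comment; process() asks the LLM whether it hints at a business problem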
+
+    # Process and analyze each comment
+    responses = []
+    for idx, row in data_to_analyze.iterrows():
+        print(f"{idx} done")
+        responses.append(process(row))
+
+    # Add analysis to the dataframe
+    data_to_analyze['analysis'] = responses
+
+    # Store in global storage for quick access
+    GLOBAL_ANALYSIS_STORAGE['subreddit'] = subreddit
+    GLOBAL_ANALYSIS_STORAGE['data'] = data_to_analyze
+
+    return data_to_analyze
+
+def launch_interface():
+    # Fetch list of subreddits for the user to choose from
+    sub_reddits = fetch_subreddits()
+    subreddit_list = sub_reddits["display_name"].tolist()
+
+    # Create Gradio Blocks for a more flexible interface
+    with gr.Blocks() as demo:
+        # Title and description
+        gr.Markdown("# Reddit Business Problem Analyzer")
+        gr.Markdown("Discover potential business opportunities from Reddit discussions")
+
+        # Subreddit selection
+        subreddit_dropdown = gr.Dropdown(
+            choices=subreddit_list,
+            label="Select Subreddit",
+            info="Choose a subreddit to analyze"
+        )
+
+        # Outputs
+        with gr.Row():
+            with gr.Column():
+                # Overall Analysis Section
+                gr.Markdown("## Overall Analysis")
+                # overall_analysis = gr.Textbox(
+                #     label="Aggregated Business Insights",
+                #     interactive=False,
+                #     lines=5
+                # )
+
+                # Results Table
+                results_table = gr.Dataframe(
+                    label="Analysis Results",
+                    headers=["Index", "Post Title", "Comment", "Analysis"],
+                    interactive=False
+                )
+
+                # Row Selection
+                row_index = gr.Number(
+                    label="Select Row Index for Detailed View",
+                    precision=0
+                )
+
+            with gr.Column():
+                # Detailed Post Analysis
+                gr.Markdown("## Detailed Post Analysis")
+                detailed_analysis = gr.Markdown(
+                    label="Detailed Insights"
+                )
+
+        # Function to update posts when a subreddit is selected
+        def update_posts(subreddit):
+            # Fetch and analyze data
+            data_to_analyze = show_dataframe(subreddit)
+
+            # Prepare table data
+            table_data = data_to_analyze[['post_title', 'comment_body', 'analysis']].reset_index()
+            table_data.columns = ['Index', 'Post Title', 'Comment', 'Analysis']
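+            # The second output maps to the detailed-analysis panel; nothing to show until a row is picked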
+
+            return table_data, None
+
+        # Function to show detailed analysis for a specific row
+        def show_row_details(row_index):
+            # Ensure we have data loaded
+            if GLOBAL_ANALYSIS_STORAGE['data'] is None:
+                return "Please select a subreddit first."
+
+            try:
+                # Convert to integer; the table's Index column mirrors the dataframe's 0-based index
+                row_index = int(row_index)
+
+                # Retrieve the specific row
+                row_data = GLOBAL_ANALYSIS_STORAGE['data'].loc[row_index]
+
+                # Format detailed view
+                detailed_view = f"""
+### Post Details
+**Title:** {row_data.get('post_title', 'N/A')}
+
+**Comment:** {row_data.get('comment_body', 'N/A')}
+
+**Comment Score:** {row_data.get('comment_score', 'N/A')}
+
+**Analysis:** {row_data.get('analysis', 'No analysis available')}
+
+**Post URL:** {row_data.get('post_url', 'N/A')}
+
+**Comment URL:** {row_data.get('comment_url', 'N/A')}
+"""
+
+                return detailed_view
+
+            except (KeyError, ValueError, TypeError) as e:
+                return f"Error retrieving row details: {str(e)}"
+
+        # Event Listeners
+        subreddit_dropdown.change(
+            fn=update_posts,
+            inputs=subreddit_dropdown,
+            outputs=[results_table, detailed_analysis]
+        )
+
+        row_index.change(
+            fn=show_row_details,
+            inputs=row_index,
+            outputs=detailed_analysis
+        )
+
+    return demo
+
+# Launch the interface
+if __name__ == "__main__":
+    interface = launch_interface()
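+    # share=True also exposes a temporary public Gradio link alongside the local server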
+    interface.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,81 @@
+aiofiles==23.2.1
+annotated-types==0.7.0
+anyio==4.6.2.post1
+appnope==0.1.4
+asttokens==2.4.1
+audioop-lts==0.2.1
+certifi==2024.8.30
+charset-normalizer==3.4.0
+click==8.1.7
+comm==0.2.2
+debugpy==1.8.7
+decorator==5.1.1
+distro==1.9.0
+executing==2.1.0
+fastapi==0.115.4
+ffmpy==0.4.0
+filelock==3.16.1
+fsspec==2024.10.0
+gradio==5.4.0
+gradio_client==1.4.2
+groq==0.13.0
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.29.0
+jedi==0.19.1
+Jinja2==3.1.4
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+nest-asyncio==1.6.0
+numpy==2.1.2
+ollama==0.3.3
+orjson==3.10.10
+packaging==24.1
+pandas==2.2.3
+parso==0.8.4
+pexpect==4.9.0
+pillow==11.0.0
+platformdirs==4.3.6
+prompt_toolkit==3.0.48
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.12
+pytz==2024.2
+PyYAML==6.0.2
+pyzmq==26.2.0
+requests==2.32.3
+rich==13.9.4
+ruff==0.7.2
+safehttpx==0.1.1
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+stack-data==0.6.3
+starlette==0.41.2
+tomlkit==0.12.0
+tornado==6.4.1
+tqdm==4.66.6
+traitlets==5.14.3
+typer==0.12.5
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+uvicorn==0.32.0
+wcwidth==0.2.13
+websockets==12.0