gpt-99 committed
Commit bcd2144 · 1 Parent(s): e9ae3a8

fixing user agent + local data

Files changed (5)
  1. .gitignore +4 -1
  2. app.py +7 -192
  3. comments.csv +0 -0
  4. save.py +201 -0
  5. subreddits.csv +0 -0
.gitignore CHANGED
@@ -1 +1,4 @@
-.env
+.env
+__pycache__/*
+.gradio/*
+venv/
app.py CHANGED
@@ -45,202 +45,17 @@ def process(row):
     prompt = f"The below is a reddit post. Take a look and tell me if there is a business problem to be solved here ||| title: {row['post_title']} ||| comment: {row['comment_body']}"
     return call_LLM(prompt)
 
-# ... [Keep previous helper functions like extract_comment_data, fetch_top_comments, fetch_subreddits, fetch_top_posts] ...
-
-def extract_comment_data(comment, post_info):
-    """Extract relevant data from a comment"""
-    return {
-        'subreddit': post_info['subreddit'],
-        'post_title': post_info['title'],
-        'post_score': post_info['score'],
-        'post_created_utc': post_info['created_utc'],
-        'comment_id': comment['data'].get('id'),
-        'comment_author': comment['data'].get('author'),
-        'comment_body': comment['data'].get('body'),
-        'comment_score': comment['data'].get('score', 0),
-        'comment_created_utc': datetime.fromtimestamp(comment['data'].get('created_utc', 0)),
-        'post_url': post_info['url'],
-        'comment_url': f"https://www.reddit.com{post_info['permalink']}{comment['data'].get('id')}",
-    }
-
-def fetch_top_comments(post_df, num_comments=2):
-    """
-    Fetch top comments for each post in the dataframe, sorted by upvotes
-    """
-    all_comments = []
-    total_posts = len(post_df)
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-    }
-
-    print(f"\nFetching top {num_comments} most upvoted comments for {total_posts} posts...")
-
-    for idx, post in post_df.iterrows():
-        print(f"\nProcessing post {idx + 1}/{total_posts}")
-        print(f"Title: {post['title'][:100]}...")
-        print(f"Post Score: {post['score']}, Number of Comments: {post['num_comments']}")
-
-        try:
-            json_url = post['permalink'].replace('https://www.reddit.com', '') + '.json'
-            url = f'https://www.reddit.com{json_url}'
-
-            response = requests.get(url, headers=headers)
-            response.raise_for_status()
-            data = response.json()
-
-            if len(data) > 1:
-                comments_data = data[1]['data']['children']
-
-                # Filter out non-comment entries and extract scores
-                valid_comments = [
-                    comment for comment in comments_data
-                    if comment['kind'] == 't1' and comment['data'].get('score') is not None
-                ]
-
-                # Sort comments by score (upvotes) in descending order
-                sorted_comments = sorted(
-                    valid_comments,
-                    key=lambda x: x['data'].get('score', 0),
-                    reverse=True
-                )
-
-                # Take only the top N comments
-                top_comments = sorted_comments[:num_comments]
-
-                # Print comment scores for verification
-                print("\nTop comment scores for this post:")
-                for i, comment in enumerate(top_comments, 1):
-                    score = comment['data'].get('score', 0)
-                    print(f"Comment {i}: {score} upvotes")
-
-                # Add to main list
-                for comment in top_comments:
-                    all_comments.append(extract_comment_data(comment, post))
-
-            time.sleep(2)
-
-        except requests.exceptions.RequestException as e:
-            print(f"Error fetching comments for post {idx + 1}: {e}")
-            continue
-
-    # Create DataFrame and sort
-    comments_df = pd.DataFrame(all_comments)
-
-    if not comments_df.empty:
-        # Verify sorting by showing top comments for each post
-        print("\nVerification of comment sorting:")
-        for post_title in comments_df['post_title'].unique():
-            post_comments = comments_df[comments_df['post_title'] == post_title]
-            print(f"\nPost: {post_title[:100]}...")
-            print("Comment scores:", post_comments['comment_score'].tolist())
-
-    return comments_df
-
-
-def fetch_subreddits(limit=10, min_subscribers=1000):
-    """
-    Fetch subreddits from Reddit
-
-    Args:
-        limit (int): Number of subreddits to fetch
-        min_subscribers (int): Minimum number of subscribers required
-    """
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-    }
-    subreddits_data = []
-    after = None
-
-    while len(subreddits_data) < limit:
-        try:
-            url = f'https://www.reddit.com/subreddits/popular.json?limit=100'
-            if after:
-                url += f'&after={after}'
-
-            print(f"Fetching subreddits... Current count: {len(subreddits_data)}")
-            response = requests.get(url, headers=headers)
-            response.raise_for_status()
-            data = response.json()
-
-            for subreddit in data['data']['children']:
-                subreddit_data = subreddit['data']
-
-                if subreddit_data.get('subscribers', 0) >= min_subscribers:
-                    sub_info = {
-                        'display_name': subreddit_data.get('display_name'),
-                        'display_name_prefixed': subreddit_data.get('display_name_prefixed'),
-                        'title': subreddit_data.get('title'),
-                        'subscribers': subreddit_data.get('subscribers', 0),
-                        'active_users': subreddit_data.get('active_user_count', 0),
-                        'created_utc': datetime.fromtimestamp(subreddit_data.get('created_utc', 0)),
-                        'description': subreddit_data.get('description'),
-                        'subreddit_type': subreddit_data.get('subreddit_type'),
-                        'over18': subreddit_data.get('over18', False),
-                        'url': f"https://www.reddit.com/r/{subreddit_data.get('display_name')}/"
-                    }
-                    subreddits_data.append(sub_info)
-
-            after = data['data'].get('after')
-            if not after:
-                print("Reached end of listings")
-                break
-
-            time.sleep(2)
-
-        except requests.exceptions.RequestException as e:
-            print(f"Error fetching data: {e}")
-            break
-
-    return pd.DataFrame(subreddits_data)
-
-def fetch_top_posts(subreddit, limit=5):
-    """
-    Fetch top posts from a subreddit using Reddit's JSON API
-
-    Args:
-        subreddit (str): Name of the subreddit without the 'r/'
-        limit (int): Maximum number of posts to fetch
-
-    Returns:
-        list: List of post dictionaries
-    """
-    posts_data = []
-    url = f'https://www.reddit.com/r/{subreddit}/top.json?t=all&limit={limit}'
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-    }
-
-    try:
-        response = requests.get(url, headers=headers)
-        response.raise_for_status()
-        data = response.json()
-
-        for post in data['data']['children']:
-            post_data = post['data']
-            posts_data.append({
-                'subreddit': subreddit,
-                'title': post_data.get('title'),
-                'score': post_data.get('score'),
-                'num_comments': post_data.get('num_comments'),
-                'created_utc': datetime.fromtimestamp(post_data.get('created_utc', 0)),
-                'url': post_data.get('url'),
-                'permalink': 'https://www.reddit.com' + post_data.get('permalink', '')
-            })
-
-        time.sleep(2)
-
-    except requests.exceptions.RequestException as e:
-        print(f"Error fetching posts from r/{subreddit}: {e}")
-
-    return pd.DataFrame(posts_data)
-
+def fetch_top_comments(subreddit):
+    df = pd.read_csv('comments.csv')
+    filtered_df = df[df['subreddit'] == subreddit]
+    return filtered_df
+
+def fetch_subreddits():
+    return pd.read_csv('subreddits.csv')
 
 def show_dataframe(subreddit):
-    # Fetch top posts
-    top_posts = fetch_top_posts(subreddit)
-
     # Fetch top comments for these posts
-    data_to_analyze = fetch_top_comments(top_posts)
+    data_to_analyze = fetch_top_comments(subreddit)
 
     # Process and analyze each comment
     responses = []
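
After this hunk, app.py no longer scrapes Reddit at request time; both lookups are served from the CSVs committed in this change. A minimal usage sketch of the new helpers (not part of the commit; it assumes comments.csv and subreddits.csv sit in the working directory with the columns save.py writes):

```python
# Sketch only: restates the two helpers added above and drives them end to end.
# Assumed: subreddits.csv has a 'display_name' column and comments.csv has a
# 'subreddit' column, as written by save.py.
import pandas as pd

def fetch_subreddits():
    return pd.read_csv('subreddits.csv')

def fetch_top_comments(subreddit):
    df = pd.read_csv('comments.csv')
    return df[df['subreddit'] == subreddit]

subreddits = fetch_subreddits()
first = subreddits['display_name'].iloc[0]
comments = fetch_top_comments(first)
print(f"{len(comments)} cached comments for r/{first}")
```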
comments.csv ADDED
The diff for this file is too large to render. See raw diff
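
Although the raw diff is not rendered, the layout is recoverable from the code: each row is one dict produced by extract_comment_data in save.py (below), plus pandas' default unnamed index column, since to_csv is called without index=False. A reference list of the expected columns:

```python
# Expected comments.csv columns, per extract_comment_data in save.py.
COMMENT_COLUMNS = [
    'subreddit', 'post_title', 'post_score', 'post_created_utc',
    'comment_id', 'comment_author', 'comment_body', 'comment_score',
    'comment_created_utc', 'post_url', 'comment_url',
]
```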
 
save.py ADDED
@@ -0,0 +1,201 @@
+import time
+import requests
+import pandas as pd
+from datetime import datetime
+
+def extract_comment_data(comment, post_info):
+    return {
+        'subreddit': post_info['subreddit'],
+        'post_title': post_info['title'],
+        'post_score': post_info['score'],
+        'post_created_utc': post_info['created_utc'],
+        'comment_id': comment['data'].get('id'),
+        'comment_author': comment['data'].get('author'),
+        'comment_body': comment['data'].get('body'),
+        'comment_score': comment['data'].get('score', 0),
+        'comment_created_utc': datetime.fromtimestamp(comment['data'].get('created_utc', 0)),
+        'post_url': post_info['url'],
+        'comment_url': f"https://www.reddit.com{post_info['permalink']}{comment['data'].get('id')}",
+    }
+
+def fetch_top_comments(post_df, num_comments=2):
+    all_comments = []
+    total_posts = len(post_df)
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1'
+    }
+
+    print(f"\nFetching top {num_comments} most upvoted comments for {total_posts} posts...")
+
+    for idx, post in post_df.iterrows():
+        print(f"\nProcessing post {idx + 1}/{total_posts}")
+        print(f"Title: {post['title'][:100]}...")
+        print(f"Post Score: {post['score']}, Number of Comments: {post['num_comments']}")
+
+        try:
+            json_url = post['permalink'].replace('https://www.reddit.com', '') + '.json'
+            url = f'https://www.reddit.com{json_url}'
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+            data = response.json()
+
+            if len(data) > 1:
+                comments_data = data[1]['data']['children']
+
+                # Filter out non-comment entries and extract scores
+                valid_comments = [
+                    comment for comment in comments_data
+                    if comment['kind'] == 't1' and comment['data'].get('score') is not None
+                ]
+
+                # Sort comments by score (upvotes) in descending order
+                sorted_comments = sorted(
+                    valid_comments,
+                    key=lambda x: x['data'].get('score', 0),
+                    reverse=True
+                )
+
+                # Take only the top N comments
+                top_comments = sorted_comments[:num_comments]
+
+                # Print comment scores for verification
+                print("\nTop comment scores for this post:")
+                for i, comment in enumerate(top_comments, 1):
+                    score = comment['data'].get('score', 0)
+                    print(f"Comment {i}: {score} upvotes")
+
+                # Add to main list
+                for comment in top_comments:
+                    all_comments.append(extract_comment_data(comment, post))
+
+            time.sleep(20)
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching comments for post {idx + 1}: {e}")
+            continue
+
+    # Create DataFrame and sort
+    comments_df = pd.DataFrame(all_comments)
+
+    if not comments_df.empty:
+        # Verify sorting by showing top comments for each post
+        print("\nVerification of comment sorting:")
+        for post_title in comments_df['post_title'].unique():
+            post_comments = comments_df[comments_df['post_title'] == post_title]
+            print(f"\nPost: {post_title[:100]}...")
+            print("Comment scores:", post_comments['comment_score'].tolist())
+
+    return comments_df
+
+
+def fetch_subreddits(limit=10, min_subscribers=1000):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1'
+    }
+    subreddits_data = []
+    after = None
+
+    while len(subreddits_data) < limit:
+        try:
+            url = f'https://www.reddit.com/subreddits/popular.json?limit=100'
+            if after:
+                url += f'&after={after}'
+
+            print(f"Fetching subreddits... Current count: {len(subreddits_data)}")
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+            data = response.json()
+
+            for subreddit in data['data']['children']:
+                subreddit_data = subreddit['data']
+
+                if subreddit_data.get('subscribers', 0) >= min_subscribers:
+                    sub_info = {
+                        'display_name': subreddit_data.get('display_name'),
+                        'display_name_prefixed': subreddit_data.get('display_name_prefixed'),
+                        'title': subreddit_data.get('title'),
+                        'subscribers': subreddit_data.get('subscribers', 0),
+                        'active_users': subreddit_data.get('active_user_count', 0),
+                        'created_utc': datetime.fromtimestamp(subreddit_data.get('created_utc', 0)),
+                        'description': subreddit_data.get('description'),
+                        'subreddit_type': subreddit_data.get('subreddit_type'),
+                        'over18': subreddit_data.get('over18', False),
+                        'url': f"https://www.reddit.com/r/{subreddit_data.get('display_name')}/"
+                    }
+                    subreddits_data.append(sub_info)
+
+            after = data['data'].get('after')
+            if not after:
+                print("Reached end of listings")
+                break
+
+            time.sleep(2)
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching data: {e}")
+            break
+
+    return pd.DataFrame(subreddits_data)
+
+def fetch_top_posts(subreddit, limit=5):
+    posts_data = []
+    url = f'https://www.reddit.com/r/{subreddit}/top.json?t=all&limit={limit}'
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1'
+    }
+
+    try:
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()
+        data = response.json()
+
+        for post in data['data']['children']:
+            post_data = post['data']
+            posts_data.append({
+                'subreddit': subreddit,
+                'title': post_data.get('title'),
+                'score': post_data.get('score'),
+                'num_comments': post_data.get('num_comments'),
+                'created_utc': datetime.fromtimestamp(post_data.get('created_utc', 0)),
+                'url': post_data.get('url'),
+                'permalink': 'https://www.reddit.com' + post_data.get('permalink', '')
+            })
+
+        time.sleep(2)
+
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching posts from r/{subreddit}: {e}")
+
+    return pd.DataFrame(posts_data)
+
+def main():
+    # Step 1: Fetch Subreddits
+    print("Fetching subreddits...")
+    subreddits_df = fetch_subreddits(limit=10, min_subscribers=1000)
+    print(f"Fetched {len(subreddits_df)} subreddits.")
+    subreddits_df.to_csv("subreddits.csv")
+
+    # Step 2: Fetch Top Posts for each subreddit
+    all_posts_data = []
+    for subreddit in subreddits_df['display_name']:
+        print(f"\nFetching top posts for subreddit: {subreddit}...")
+        posts_df = fetch_top_posts(subreddit, limit=5)
+        all_posts_data.append(posts_df)
+
+    # Combine all posts into a single DataFrame
+    posts_df = pd.concat(all_posts_data, ignore_index=True)
+    print(f"Fetched {len(posts_df)} top posts.")
+    posts_df.to_csv("posts.csv")
+
+    posts_df = pd.read_csv("posts.csv")
+
+    # Step 3: Fetch Top Comments for each post
+    all_comments_data = []
+    if not posts_df.empty:
+        all_comments_data = fetch_top_comments(posts_df, num_comments=2)
+        print(f"Fetched {len(all_comments_data)} top comments.")
+        all_comments_data.to_csv("comments.csv")
+
+if __name__ == "__main__":
+    main()
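
save.py is the offline producer for the files app.py now reads. A hedged sketch of regenerating the local data (not part of the commit; it assumes the script is run from the repo root with network access, and note that comments.csv is only written when posts were actually fetched):

```python
# Hypothetical regeneration check, not in the commit: run the scraper, then
# verify the CSVs that app.py depends on were actually written.
import pathlib
import subprocess

subprocess.run(["python", "save.py"], check=True)
for name in ("subreddits.csv", "posts.csv", "comments.csv"):
    assert pathlib.Path(name).exists(), f"missing {name}"
```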
subreddits.csv ADDED
The diff for this file is too large to render. See raw diff