gpt-99 committed
Commit bcd2144 · 1 Parent(s): e9ae3a8

fixing user agent + local data

Files changed (5)
  1. .gitignore +4 -1
  2. app.py +7 -192
  3. comments.csv +0 -0
  4. save.py +201 -0
  5. subreddits.csv +0 -0
.gitignore CHANGED
@@ -1 +1,4 @@
-.env
+.env
+__pycache__/*
+.gradio/*
+venv/
app.py CHANGED
@@ -45,202 +45,17 @@ def process(row):
     prompt = f"The below is a reddit post. Take a look and tell me if there is a business problem to be solved here ||| title: {row['post_title']} ||| comment: {row['comment_body']}"
     return call_LLM(prompt)
 
-# ... [Keep previous helper functions like extract_comment_data, fetch_top_comments, fetch_subreddits, fetch_top_posts] ...
-
-def extract_comment_data(comment, post_info):
-    """Extract relevant data from a comment"""
-    return {
-        'subreddit': post_info['subreddit'],
-        'post_title': post_info['title'],
-        'post_score': post_info['score'],
-        'post_created_utc': post_info['created_utc'],
-        'comment_id': comment['data'].get('id'),
-        'comment_author': comment['data'].get('author'),
-        'comment_body': comment['data'].get('body'),
-        'comment_score': comment['data'].get('score', 0),
-        'comment_created_utc': datetime.fromtimestamp(comment['data'].get('created_utc', 0)),
-        'post_url': post_info['url'],
-        'comment_url': f"https://www.reddit.com{post_info['permalink']}{comment['data'].get('id')}",
-    }
-
-def fetch_top_comments(post_df, num_comments=2):
-    """
-    Fetch top comments for each post in the dataframe, sorted by upvotes
-    """
-    all_comments = []
-    total_posts = len(post_df)
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-    }
-
-    print(f"\nFetching top {num_comments} most upvoted comments for {total_posts} posts...")
-
-    for idx, post in post_df.iterrows():
-        print(f"\nProcessing post {idx + 1}/{total_posts}")
-        print(f"Title: {post['title'][:100]}...")
-        print(f"Post Score: {post['score']}, Number of Comments: {post['num_comments']}")
-
-        try:
-            json_url = post['permalink'].replace('https://www.reddit.com', '') + '.json'
-            url = f'https://www.reddit.com{json_url}'
-
-            response = requests.get(url, headers=headers)
-            response.raise_for_status()
-            data = response.json()
-
-            if len(data) > 1:
-                comments_data = data[1]['data']['children']
-
-                # Filter out non-comment entries and extract scores
-                valid_comments = [
-                    comment for comment in comments_data
-                    if comment['kind'] == 't1' and comment['data'].get('score') is not None
-                ]
-
-                # Sort comments by score (upvotes) in descending order
-                sorted_comments = sorted(
-                    valid_comments,
-                    key=lambda x: x['data'].get('score', 0),
-                    reverse=True
-                )
-
-                # Take only the top N comments
-                top_comments = sorted_comments[:num_comments]
-
-                # Print comment scores for verification
-                print("\nTop comment scores for this post:")
-                for i, comment in enumerate(top_comments, 1):
-                    score = comment['data'].get('score', 0)
-                    print(f"Comment {i}: {score} upvotes")
-
-                # Add to main list
-                for comment in top_comments:
-                    all_comments.append(extract_comment_data(comment, post))
-
-            time.sleep(2)
-
-        except requests.exceptions.RequestException as e:
-            print(f"Error fetching comments for post {idx + 1}: {e}")
-            continue
-
-    # Create DataFrame and sort
-    comments_df = pd.DataFrame(all_comments)
-
-    if not comments_df.empty:
-        # Verify sorting by showing top comments for each post
-        print("\nVerification of comment sorting:")
-        for post_title in comments_df['post_title'].unique():
-            post_comments = comments_df[comments_df['post_title'] == post_title]
-            print(f"\nPost: {post_title[:100]}...")
-            print("Comment scores:", post_comments['comment_score'].tolist())
-
-    return comments_df
-
-
-def fetch_subreddits(limit=10, min_subscribers=1000):
-    """
-    Fetch subreddits from Reddit
-
-    Args:
-        limit (int): Number of subreddits to fetch
-        min_subscribers (int): Minimum number of subscribers required
-    """
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-    }
-    subreddits_data = []
-    after = None
-
-    while len(subreddits_data) < limit:
-        try:
-            url = f'https://www.reddit.com/subreddits/popular.json?limit=100'
-            if after:
-                url += f'&after={after}'
-
-            print(f"Fetching subreddits... Current count: {len(subreddits_data)}")
-            response = requests.get(url, headers=headers)
-            response.raise_for_status()
-            data = response.json()
-
-            for subreddit in data['data']['children']:
-                subreddit_data = subreddit['data']
-
-                if subreddit_data.get('subscribers', 0) >= min_subscribers:
-                    sub_info = {
-                        'display_name': subreddit_data.get('display_name'),
-                        'display_name_prefixed': subreddit_data.get('display_name_prefixed'),
-                        'title': subreddit_data.get('title'),
-                        'subscribers': subreddit_data.get('subscribers', 0),
-                        'active_users': subreddit_data.get('active_user_count', 0),
-                        'created_utc': datetime.fromtimestamp(subreddit_data.get('created_utc', 0)),
-                        'description': subreddit_data.get('description'),
-                        'subreddit_type': subreddit_data.get('subreddit_type'),
-                        'over18': subreddit_data.get('over18', False),
-                        'url': f"https://www.reddit.com/r/{subreddit_data.get('display_name')}/"
-                    }
-                    subreddits_data.append(sub_info)
-
-            after = data['data'].get('after')
-            if not after:
-                print("Reached end of listings")
-                break
-
-            time.sleep(2)
-
-        except requests.exceptions.RequestException as e:
-            print(f"Error fetching data: {e}")
-            break
-
-    return pd.DataFrame(subreddits_data)
-
-def fetch_top_posts(subreddit, limit=5):
-    """
-    Fetch top posts from a subreddit using Reddit's JSON API
-
-    Args:
-        subreddit (str): Name of the subreddit without the 'r/'
-        limit (int): Maximum number of posts to fetch
-
-    Returns:
-        list: List of post dictionaries
-    """
-    posts_data = []
-    url = f'https://www.reddit.com/r/{subreddit}/top.json?t=all&limit={limit}'
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-    }
-
-    try:
-        response = requests.get(url, headers=headers)
-        response.raise_for_status()
-        data = response.json()
-
-        for post in data['data']['children']:
-            post_data = post['data']
-            posts_data.append({
-                'subreddit': subreddit,
-                'title': post_data.get('title'),
-                'score': post_data.get('score'),
-                'num_comments': post_data.get('num_comments'),
-                'created_utc': datetime.fromtimestamp(post_data.get('created_utc', 0)),
-                'url': post_data.get('url'),
-                'permalink': 'https://www.reddit.com' + post_data.get('permalink', '')
-            })
-
-        time.sleep(2)
-
-    except requests.exceptions.RequestException as e:
-        print(f"Error fetching posts from r/{subreddit}: {e}")
-
-    return pd.DataFrame(posts_data)
-
+def fetch_top_comments(subreddit):
+    df = pd.read_csv('comments.csv')
+    filtered_df = df[df['subreddit'] == subreddit]
+    return filtered_df
+
+def fetch_subreddits():
+    return pd.read_csv('subreddits.csv')
 
 def show_dataframe(subreddit):
-    # Fetch top posts
-    top_posts = fetch_top_posts(subreddit)
-
     # Fetch top comments for these posts
-    data_to_analyze = fetch_top_comments(top_posts)
+    data_to_analyze = fetch_top_comments(subreddit)
 
     # Process and analyze each comment
     responses = []
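
After this hunk, app.py no longer scrapes Reddit at request time; both lookups are served from the CSVs committed in this change. A minimal usage sketch of the new helpers (not part of the commit; it assumes comments.csv and subreddits.csv sit in the working directory with the columns save.py writes):

```python
# Sketch only: restates the two helpers added above and drives them end to end.
# Assumed: subreddits.csv has a 'display_name' column and comments.csv has a
# 'subreddit' column, as written by save.py.
import pandas as pd

def fetch_subreddits():
    return pd.read_csv('subreddits.csv')

def fetch_top_comments(subreddit):
    df = pd.read_csv('comments.csv')
    return df[df['subreddit'] == subreddit]

subreddits = fetch_subreddits()
first = subreddits['display_name'].iloc[0]
comments = fetch_top_comments(first)
print(f"{len(comments)} cached comments for r/{first}")
```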
comments.csv ADDED
The diff for this file is too large to render. See raw diff
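
Although the raw diff is not rendered, the layout is recoverable from the code: each row is one dict produced by extract_comment_data in save.py (below), plus pandas' default unnamed index column, since to_csv is called without index=False. A reference list of the expected columns:

```python
# Expected comments.csv columns, per extract_comment_data in save.py.
COMMENT_COLUMNS = [
    'subreddit', 'post_title', 'post_score', 'post_created_utc',
    'comment_id', 'comment_author', 'comment_body', 'comment_score',
    'comment_created_utc', 'post_url', 'comment_url',
]
```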
 
save.py ADDED
@@ -0,0 +1,201 @@
+import time
+import requests
+import pandas as pd
+from datetime import datetime
+
+def extract_comment_data(comment, post_info):
+    return {
+        'subreddit': post_info['subreddit'],
+        'post_title': post_info['title'],
+        'post_score': post_info['score'],
+        'post_created_utc': post_info['created_utc'],
+        'comment_id': comment['data'].get('id'),
+        'comment_author': comment['data'].get('author'),
+        'comment_body': comment['data'].get('body'),
+        'comment_score': comment['data'].get('score', 0),
+        'comment_created_utc': datetime.fromtimestamp(comment['data'].get('created_utc', 0)),
+        'post_url': post_info['url'],
+        'comment_url': f"https://www.reddit.com{post_info['permalink']}{comment['data'].get('id')}",
+    }
+
+def fetch_top_comments(post_df, num_comments=2):
+    all_comments = []
+    total_posts = len(post_df)
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1'
+    }
+
+    print(f"\nFetching top {num_comments} most upvoted comments for {total_posts} posts...")
+
+    for idx, post in post_df.iterrows():
+        print(f"\nProcessing post {idx + 1}/{total_posts}")
+        print(f"Title: {post['title'][:100]}...")
+        print(f"Post Score: {post['score']}, Number of Comments: {post['num_comments']}")
+
+        try:
+            json_url = post['permalink'].replace('https://www.reddit.com', '') + '.json'
+            url = f'https://www.reddit.com{json_url}'
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+            data = response.json()
+
+            if len(data) > 1:
+                comments_data = data[1]['data']['children']
+
+                # Filter out non-comment entries and extract scores
+                valid_comments = [
+                    comment for comment in comments_data
+                    if comment['kind'] == 't1' and comment['data'].get('score') is not None
+                ]
+
+                # Sort comments by score (upvotes) in descending order
+                sorted_comments = sorted(
+                    valid_comments,
+                    key=lambda x: x['data'].get('score', 0),
+                    reverse=True
+                )
+
+                # Take only the top N comments
+                top_comments = sorted_comments[:num_comments]
+
+                # Print comment scores for verification
+                print("\nTop comment scores for this post:")
+                for i, comment in enumerate(top_comments, 1):
+                    score = comment['data'].get('score', 0)
+                    print(f"Comment {i}: {score} upvotes")
+
+                # Add to main list
+                for comment in top_comments:
+                    all_comments.append(extract_comment_data(comment, post))
+
+            time.sleep(20)
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching comments for post {idx + 1}: {e}")
+            continue
+
+    # Create DataFrame and sort
+    comments_df = pd.DataFrame(all_comments)
+
+    if not comments_df.empty:
+        # Verify sorting by showing top comments for each post
+        print("\nVerification of comment sorting:")
+        for post_title in comments_df['post_title'].unique():
+            post_comments = comments_df[comments_df['post_title'] == post_title]
+            print(f"\nPost: {post_title[:100]}...")
+            print("Comment scores:", post_comments['comment_score'].tolist())
+
+    return comments_df
+
+
+def fetch_subreddits(limit=10, min_subscribers=1000):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1'
+    }
+    subreddits_data = []
+    after = None
+
+    while len(subreddits_data) < limit:
+        try:
+            url = f'https://www.reddit.com/subreddits/popular.json?limit=100'
+            if after:
+                url += f'&after={after}'
+
+            print(f"Fetching subreddits... Current count: {len(subreddits_data)}")
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+            data = response.json()
+
+            for subreddit in data['data']['children']:
+                subreddit_data = subreddit['data']
+
+                if subreddit_data.get('subscribers', 0) >= min_subscribers:
+                    sub_info = {
+                        'display_name': subreddit_data.get('display_name'),
+                        'display_name_prefixed': subreddit_data.get('display_name_prefixed'),
+                        'title': subreddit_data.get('title'),
+                        'subscribers': subreddit_data.get('subscribers', 0),
+                        'active_users': subreddit_data.get('active_user_count', 0),
+                        'created_utc': datetime.fromtimestamp(subreddit_data.get('created_utc', 0)),
+                        'description': subreddit_data.get('description'),
+                        'subreddit_type': subreddit_data.get('subreddit_type'),
+                        'over18': subreddit_data.get('over18', False),
+                        'url': f"https://www.reddit.com/r/{subreddit_data.get('display_name')}/"
+                    }
+                    subreddits_data.append(sub_info)
+
+            after = data['data'].get('after')
+            if not after:
+                print("Reached end of listings")
+                break
+
+            time.sleep(2)
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching data: {e}")
+            break
+
+    return pd.DataFrame(subreddits_data)
+
+def fetch_top_posts(subreddit, limit=5):
+    posts_data = []
+    url = f'https://www.reddit.com/r/{subreddit}/top.json?t=all&limit={limit}'
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1'
+    }
+
+    try:
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()
+        data = response.json()
+
+        for post in data['data']['children']:
+            post_data = post['data']
+            posts_data.append({
+                'subreddit': subreddit,
+                'title': post_data.get('title'),
+                'score': post_data.get('score'),
+                'num_comments': post_data.get('num_comments'),
+                'created_utc': datetime.fromtimestamp(post_data.get('created_utc', 0)),
+                'url': post_data.get('url'),
+                'permalink': 'https://www.reddit.com' + post_data.get('permalink', '')
+            })
+
+        time.sleep(2)
+
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching posts from r/{subreddit}: {e}")
+
+    return pd.DataFrame(posts_data)
+
+def main():
+    # Step 1: Fetch Subreddits
+    print("Fetching subreddits...")
+    subreddits_df = fetch_subreddits(limit=10, min_subscribers=1000)
+    print(f"Fetched {len(subreddits_df)} subreddits.")
+    subreddits_df.to_csv("subreddits.csv")
+
+    # Step 2: Fetch Top Posts for each subreddit
+    all_posts_data = []
+    for subreddit in subreddits_df['display_name']:
+        print(f"\nFetching top posts for subreddit: {subreddit}...")
+        posts_df = fetch_top_posts(subreddit, limit=5)
+        all_posts_data.append(posts_df)
+
+    # Combine all posts into a single DataFrame
+    posts_df = pd.concat(all_posts_data, ignore_index=True)
+    print(f"Fetched {len(posts_df)} top posts.")
+    posts_df.to_csv("posts.csv")
+
+    posts_df = pd.read_csv("posts.csv")
+
+    # Step 3: Fetch Top Comments for each post
+    all_comments_data = []
+    if not posts_df.empty:
+        all_comments_data = fetch_top_comments(posts_df, num_comments=2)
+        print(f"Fetched {len(all_comments_data)} top comments.")
+        all_comments_data.to_csv("comments.csv")
+
+if __name__ == "__main__":
+    main()
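
save.py is the offline producer for the files app.py now reads. A hedged sketch of regenerating the local data (not part of the commit; it assumes the script is run from the repo root with network access, and note that comments.csv is only written when posts were actually fetched):

```python
# Hypothetical regeneration check, not in the commit: run the scraper, then
# verify the CSVs that app.py depends on were actually written.
import pathlib
import subprocess

subprocess.run(["python", "save.py"], check=True)
for name in ("subreddits.csv", "posts.csv", "comments.csv"):
    assert pathlib.Path(name).exists(), f"missing {name}"
```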
subreddits.csv ADDED
The diff for this file is too large to render. See raw diff