gpt-99 committed on
Commit 613e758 · 1 Parent(s): e40a948

initial commit

Files changed (3)
  1. README.md +1 -1
  2. app.py +372 -0
  3. requirements.txt +81 -0
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Reddit Search
-emoji: 📉
+emoji: 🔍
 colorFrom: purple
 colorTo: pink
 sdk: gradio
app.py ADDED
@@ -0,0 +1,372 @@
+import requests
+import pandas as pd
+import time
+from datetime import datetime
+from dotenv import load_dotenv
+import os
+import gradio as gr
+
+load_dotenv()
+
+XAI_API_KEY = os.getenv("XAI_API_KEY")
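+# NOTE: XAI_API_KEY is read but not used yet; call_groq() below relies on the
+# groq client picking up GROQ_API_KEY from the environment.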
+
+# Global variable to store the most recent analysis results
+GLOBAL_ANALYSIS_STORAGE = {
+    'subreddit': None,
+    'data': None
+}
+
+def call_LLM(query):
+    return call_groq(query)
+
+def call_groq(query):
+    from groq import Groq
+    client = Groq()
+    chat_completion = client.chat.completions.create(
+        messages=[
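+            # The entire prompt is passed as a single system message; a "user"
+            # role message would be the more conventional choice here.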
+            {"role": "system", "content": query}
+        ],
+        model="llama3-8b-8192",
+        temperature=0.5,
+        max_tokens=1024,
+        top_p=1,
+        stop=None,
+        stream=False,
+    )
+
+    return chat_completion.choices[0].message.content
+
+def process(row):
+    """
+    Format the post (title + comment body) so the model sees the full post for now
+    """
+    prompt = f"The below is a reddit post. Take a look and tell me if there is a business problem to be solved here ||| title: {row['post_title']} ||| comment: {row['comment_body']}"
+    return call_LLM(prompt)
+
+# Helper functions: extract_comment_data, fetch_top_comments, fetch_subreddits, fetch_top_posts
+
+def extract_comment_data(comment, post_info):
+    """Extract relevant data from a comment"""
+    return {
+        'subreddit': post_info['subreddit'],
+        'post_title': post_info['title'],
+        'post_score': post_info['score'],
+        'post_created_utc': post_info['created_utc'],
+        'comment_id': comment['data'].get('id'),
+        'comment_author': comment['data'].get('author'),
+        'comment_body': comment['data'].get('body'),
+        'comment_score': comment['data'].get('score', 0),
+        'comment_created_utc': datetime.fromtimestamp(comment['data'].get('created_utc', 0)),
+        'post_url': post_info['url'],
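+        # 'permalink' set in fetch_top_posts is already a full URL, so only
+        # the comment id needs to be appended below.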
+        'comment_url': f"{post_info['permalink']}{comment['data'].get('id')}",
+    }
+
+def fetch_top_comments(post_df, num_comments=2):
+    """
+    Fetch top comments for each post in the dataframe, sorted by upvotes
+    """
+    all_comments = []
+    total_posts = len(post_df)
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+
+    print(f"\nFetching top {num_comments} most upvoted comments for {total_posts} posts...")
+
+    for idx, post in post_df.iterrows():
+        print(f"\nProcessing post {idx + 1}/{total_posts}")
+        print(f"Title: {post['title'][:100]}...")
+        print(f"Post Score: {post['score']}, Number of Comments: {post['num_comments']}")
+
+        try:
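+            # Appending .json to a post's permalink returns the post plus its
+            # comment tree from Reddit's public JSON endpoint.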
+            json_url = post['permalink'].replace('https://www.reddit.com', '') + '.json'
+            url = f'https://www.reddit.com{json_url}'
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+            data = response.json()
+
+            if len(data) > 1:
+                comments_data = data[1]['data']['children']
+
+                # Filter out non-comment entries and extract scores
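+                # kind == 't1' marks comments; other kinds (e.g. 'more' stubs) are skipped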
+                valid_comments = [
+                    comment for comment in comments_data
+                    if comment['kind'] == 't1' and comment['data'].get('score') is not None
+                ]
+
+                # Sort comments by score (upvotes) in descending order
+                sorted_comments = sorted(
+                    valid_comments,
+                    key=lambda x: x['data'].get('score', 0),
+                    reverse=True
+                )
+
+                # Take only the top N comments
+                top_comments = sorted_comments[:num_comments]
+
+                # Print comment scores for verification
+                print("\nTop comment scores for this post:")
+                for i, comment in enumerate(top_comments, 1):
+                    score = comment['data'].get('score', 0)
+                    print(f"Comment {i}: {score} upvotes")
+
+                # Add to main list
+                for comment in top_comments:
+                    all_comments.append(extract_comment_data(comment, post))
+
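+            # Pause between posts to stay polite to Reddit's unauthenticated API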
+            time.sleep(2)
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching comments for post {idx + 1}: {e}")
+            continue
+
+    # Create DataFrame from the collected comments
+    comments_df = pd.DataFrame(all_comments)
+
+    if not comments_df.empty:
+        # Verify sorting by showing top comments for each post
+        print("\nVerification of comment sorting:")
+        for post_title in comments_df['post_title'].unique():
+            post_comments = comments_df[comments_df['post_title'] == post_title]
+            print(f"\nPost: {post_title[:100]}...")
+            print("Comment scores:", post_comments['comment_score'].tolist())
+
+    return comments_df
+
+
+def fetch_subreddits(limit=10, min_subscribers=1000):
+    """
+    Fetch subreddits from Reddit
+
+    Args:
+        limit (int): Number of subreddits to fetch
+        min_subscribers (int): Minimum number of subscribers required
+    """
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+    subreddits_data = []
+    after = None
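+    # Reddit paginates listings via an opaque 'after' cursor returned with each page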
+
+    while len(subreddits_data) < limit:
+        try:
+            url = 'https://www.reddit.com/subreddits/popular.json?limit=100'
+            if after:
+                url += f'&after={after}'
+
+            print(f"Fetching subreddits... Current count: {len(subreddits_data)}")
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+            data = response.json()
+
+            for subreddit in data['data']['children']:
+                subreddit_data = subreddit['data']
+
+                if subreddit_data.get('subscribers', 0) >= min_subscribers:
+                    sub_info = {
+                        'display_name': subreddit_data.get('display_name'),
+                        'display_name_prefixed': subreddit_data.get('display_name_prefixed'),
+                        'title': subreddit_data.get('title'),
+                        'subscribers': subreddit_data.get('subscribers', 0),
+                        'active_users': subreddit_data.get('active_user_count', 0),
+                        'created_utc': datetime.fromtimestamp(subreddit_data.get('created_utc', 0)),
+                        'description': subreddit_data.get('description'),
+                        'subreddit_type': subreddit_data.get('subreddit_type'),
+                        'over18': subreddit_data.get('over18', False),
+                        'url': f"https://www.reddit.com/r/{subreddit_data.get('display_name')}/"
+                    }
+                    subreddits_data.append(sub_info)
+
+            after = data['data'].get('after')
+            if not after:
+                print("Reached end of listings")
+                break
+
+            time.sleep(2)
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching data: {e}")
+            break
+
+    # Trim to the requested count: a single 100-item page can overshoot 'limit'
+    return pd.DataFrame(subreddits_data[:limit])
+
+def fetch_top_posts(subreddit, limit=5):
+    """
+    Fetch top posts from a subreddit using Reddit's JSON API
+
+    Args:
+        subreddit (str): Name of the subreddit without the 'r/'
+        limit (int): Maximum number of posts to fetch
+
+    Returns:
+        pd.DataFrame: DataFrame of post records
+    """
+    posts_data = []
+    url = f'https://www.reddit.com/r/{subreddit}/top.json?t=all&limit={limit}'
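+    # t=all requests the top posts of all time (other windows: hour, day, week, month, year)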
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+
+    try:
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()
+        data = response.json()
+
+        for post in data['data']['children']:
+            post_data = post['data']
+            posts_data.append({
+                'subreddit': subreddit,
+                'title': post_data.get('title'),
+                'score': post_data.get('score'),
+                'num_comments': post_data.get('num_comments'),
+                'created_utc': datetime.fromtimestamp(post_data.get('created_utc', 0)),
+                'url': post_data.get('url'),
+                'permalink': 'https://www.reddit.com' + post_data.get('permalink', '')
+            })
+
+        time.sleep(2)
+
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching posts from r/{subreddit}: {e}")
+
+    return pd.DataFrame(posts_data)
+
+
+def show_dataframe(subreddit):
+    # Fetch top posts
+    top_posts = fetch_top_posts(subreddit)
+
+    # Fetch top comments for these posts
+    data_to_analyze = fetch_top_comments(top_posts)
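+    # Each row is one top comment; process() asks the LLM whether it hints at a business problem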
+
+    # Process and analyze each comment
+    responses = []
+    for idx, row in data_to_analyze.iterrows():
+        print(f"{idx} done")
+        responses.append(process(row))
+
+    # Add analysis to the dataframe
+    data_to_analyze['analysis'] = responses
+
+    # Store in global storage for quick access
+    GLOBAL_ANALYSIS_STORAGE['subreddit'] = subreddit
+    GLOBAL_ANALYSIS_STORAGE['data'] = data_to_analyze
+
+    return data_to_analyze
+
+def launch_interface():
+    # Fetch list of subreddits for the user to choose from
+    sub_reddits = fetch_subreddits()
+    subreddit_list = sub_reddits["display_name"].tolist()
+
+    # Create Gradio Blocks for a more flexible interface
+    with gr.Blocks() as demo:
+        # Title and description
+        gr.Markdown("# Reddit Business Problem Analyzer")
+        gr.Markdown("Discover potential business opportunities from Reddit discussions")
+
+        # Subreddit selection
+        subreddit_dropdown = gr.Dropdown(
+            choices=subreddit_list,
+            label="Select Subreddit",
+            info="Choose a subreddit to analyze"
+        )
+
+        # Outputs
+        with gr.Row():
+            with gr.Column():
+                # Overall Analysis Section
+                gr.Markdown("## Overall Analysis")
+                # overall_analysis = gr.Textbox(
+                #     label="Aggregated Business Insights",
+                #     interactive=False,
+                #     lines=5
+                # )
+
+                # Results Table
+                results_table = gr.Dataframe(
+                    label="Analysis Results",
+                    headers=["Index", "Post Title", "Comment", "Analysis"],
+                    interactive=False
+                )
+
+                # Row Selection
+                row_index = gr.Number(
+                    label="Select Row Index for Detailed View",
+                    precision=0
+                )
+
+            with gr.Column():
+                # Detailed Post Analysis
+                gr.Markdown("## Detailed Post Analysis")
+                detailed_analysis = gr.Markdown(
+                    label="Detailed Insights"
+                )
+
+        # Function to update posts when a subreddit is selected
+        def update_posts(subreddit):
+            # Fetch and analyze data
+            data_to_analyze = show_dataframe(subreddit)
+
+            # Prepare table data
+            table_data = data_to_analyze[['post_title', 'comment_body', 'analysis']].reset_index()
+            table_data.columns = ['Index', 'Post Title', 'Comment', 'Analysis']
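+            # The second output maps to the detailed-analysis panel; nothing to show until a row is picked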
+
+            return table_data, None
+
+        # Function to show detailed analysis for a specific row
+        def show_row_details(row_index):
+            # Ensure we have data loaded
+            if GLOBAL_ANALYSIS_STORAGE['data'] is None:
+                return "Please select a subreddit first."
+
+            try:
+                # Convert to integer; the table's Index column mirrors the dataframe's 0-based index
+                row_index = int(row_index)
+
+                # Retrieve the specific row
+                row_data = GLOBAL_ANALYSIS_STORAGE['data'].loc[row_index]
+
+                # Format detailed view
+                detailed_view = f"""
+### Post Details
+**Title:** {row_data.get('post_title', 'N/A')}
+
+**Comment:** {row_data.get('comment_body', 'N/A')}
+
+**Comment Score:** {row_data.get('comment_score', 'N/A')}
+
+**Analysis:** {row_data.get('analysis', 'No analysis available')}
+
+**Post URL:** {row_data.get('post_url', 'N/A')}
+
+**Comment URL:** {row_data.get('comment_url', 'N/A')}
+"""
+
+                return detailed_view
+
+            except (KeyError, ValueError, TypeError) as e:
+                return f"Error retrieving row details: {str(e)}"
+
+        # Event Listeners
+        subreddit_dropdown.change(
+            fn=update_posts,
+            inputs=subreddit_dropdown,
+            outputs=[results_table, detailed_analysis]
+        )
+
+        row_index.change(
+            fn=show_row_details,
+            inputs=row_index,
+            outputs=detailed_analysis
+        )
+
+    return demo
+
+# Launch the interface
+if __name__ == "__main__":
+    interface = launch_interface()
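+    # share=True also exposes a temporary public Gradio link alongside the local server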
+    interface.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,81 @@
+aiofiles==23.2.1
+annotated-types==0.7.0
+anyio==4.6.2.post1
+appnope==0.1.4
+asttokens==2.4.1
+audioop-lts==0.2.1
+certifi==2024.8.30
+charset-normalizer==3.4.0
+click==8.1.7
+comm==0.2.2
+debugpy==1.8.7
+decorator==5.1.1
+distro==1.9.0
+executing==2.1.0
+fastapi==0.115.4
+ffmpy==0.4.0
+filelock==3.16.1
+fsspec==2024.10.0
+gradio==5.4.0
+gradio_client==1.4.2
+groq==0.13.0
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.29.0
+jedi==0.19.1
+Jinja2==3.1.4
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+nest-asyncio==1.6.0
+numpy==2.1.2
+ollama==0.3.3
+orjson==3.10.10
+packaging==24.1
+pandas==2.2.3
+parso==0.8.4
+pexpect==4.9.0
+pillow==11.0.0
+platformdirs==4.3.6
+prompt_toolkit==3.0.48
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.12
+pytz==2024.2
+PyYAML==6.0.2
+pyzmq==26.2.0
+requests==2.32.3
+rich==13.9.4
+ruff==0.7.2
+safehttpx==0.1.1
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+stack-data==0.6.3
+starlette==0.41.2
+tomlkit==0.12.0
+tornado==6.4.1
+tqdm==4.66.6
+traitlets==5.14.3
+typer==0.12.5
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+uvicorn==0.32.0
+wcwidth==0.2.13
+websockets==12.0