shresht8 committed on
Commit
bfa43e3
·
verified ·
1 Parent(s): e0b4a17

update sentiment scores

Browse files
Files changed (1) hide show
  1. app.py +161 -125
app.py CHANGED
@@ -11,15 +11,27 @@ model_name = "tabularisai/multilingual-sentiment-analysis"
11
  tokenizer = AutoTokenizer.from_pretrained(model_name)
12
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
13
 
 
 
 
 
 
 
 
 
14
 
15
- def predict_sentiment(texts):
 
16
  """
17
- Predict sentiment for a list of texts
18
  """
19
  inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
20
  with torch.no_grad():
21
  outputs = model(**inputs)
 
22
  probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
 
 
23
  sentiment_map = {
24
  0: "Very Negative",
25
  1: "Negative",
@@ -27,7 +39,17 @@ def predict_sentiment(texts):
27
  3: "Positive",
28
  4: "Very Positive"
29
  }
30
- return [sentiment_map[p] for p in torch.argmax(probabilities, dim=-1).tolist()]
 
 
 
 
 
 
 
 
 
 
31
 
32
 
33
  def process_single_sheet(df, product_name):
@@ -38,23 +60,25 @@ def process_single_sheet(df, product_name):
38
  raise ValueError(f"'Reviews' column not found in sheet/file for {product_name}")
39
 
40
  reviews = df['Reviews'].fillna("")
41
- sentiments = predict_sentiment(reviews.tolist())
 
42
  df['Sentiment'] = sentiments
 
43
 
44
  # Calculate sentiment distribution
45
  sentiment_counts = pd.Series(sentiments).value_counts()
 
46
 
47
- return df, sentiment_counts
48
 
49
 
50
- def create_comparison_charts(sentiment_results):
51
  """
52
- Create investment-focused comparison charts for different products
53
  """
54
  # Prepare data for plotting
55
  plot_data = []
56
  for product, sentiment_counts in sentiment_results.items():
57
- # Convert to dictionary and get sum
58
  sentiment_dict = sentiment_counts.to_dict()
59
  total = sum(sentiment_dict.values())
60
 
@@ -69,8 +93,9 @@ def create_comparison_charts(sentiment_results):
69
 
70
  df = pd.DataFrame(plot_data)
71
 
72
- # Ensure all sentiment columns exist (in case some products don't have all sentiments)
73
- for sentiment in ['Very Negative', 'Negative', 'Neutral', 'Positive', 'Very Positive']:
 
74
  if sentiment not in df.columns:
75
  df[sentiment] = 0
76
 
@@ -83,28 +108,26 @@ def create_comparison_charts(sentiment_results):
83
  'Very Positive': 100
84
  }
85
 
86
- df['Sentiment Score'] = 0
87
- for product in df['Product']:
88
- score = 0
89
- for sentiment, weight in sentiment_weights.items():
90
- if sentiment in df.columns:
91
- score += (df.loc[df['Product'] == product, sentiment].iloc[0] * weight / 100)
92
- df.loc[df['Product'] == product, 'Sentiment Score'] = round(score, 2)
93
 
94
- # Create sentiment score chart
95
- score_fig = go.Figure()
96
- score_fig.add_trace(go.Bar(
97
- x=df['Product'],
98
- y=df['Sentiment Score'],
99
- text=df['Sentiment Score'].round(1),
100
- textposition='auto',
101
- marker_color='rgb(65, 105, 225)'
102
- ))
103
- score_fig.update_layout(
104
- title='Overall Sentiment Score by Product (0-100)',
105
- yaxis_title='Weighted Sentiment Score',
106
- yaxis_range=[0, 100],
107
- showlegend=False
108
  )
109
 
110
  # Calculate Positive-Negative Ratios
@@ -131,24 +154,87 @@ def create_comparison_charts(sentiment_results):
131
  yaxis_title='Percentage (%)'
132
  )
133
 
134
- # Create summary table with investment-relevant metrics
135
- summary_df = pd.DataFrame({
136
- 'Product': df['Product'],
137
- 'Total Reviews': df['Total Reviews'],
138
- 'Sentiment Score (0-100)': df['Sentiment Score'],
139
- 'Positive Ratio (%)': df['Positive Ratio'].round(2),
140
- 'Negative Ratio (%)': df['Negative Ratio'].round(2),
141
- 'Neutral Ratio (%)': df['Neutral'].round(2)
142
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
- # Calculate Confidence Score (avoiding division by zero)
145
- summary_df['Confidence Score'] = ((summary_df['Positive Ratio (%)'] + summary_df['Negative Ratio (%)']) /
146
- summary_df['Neutral Ratio (%)'].replace(0, 0.001)).round(2)
 
 
 
 
 
 
147
 
148
- # Sort by Sentiment Score for easy comparison
149
- summary_df = summary_df.sort_values('Sentiment Score (0-100)', ascending=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
- return score_fig, ratio_fig, summary_df
152
 
153
 
154
  def process_file(file_obj):
@@ -158,109 +244,56 @@ def process_file(file_obj):
158
  try:
159
  file_path = file_obj.name
160
  sentiment_results = defaultdict(pd.Series)
 
161
  all_processed_dfs = {}
162
 
163
  if file_path.endswith('.csv'):
164
  df = pd.read_csv(file_path)
165
  product_name = "Product" # Default name for CSV
166
- processed_df, sentiment_counts = process_single_sheet(df, product_name)
167
  all_processed_dfs[product_name] = processed_df
168
  sentiment_results[product_name] = sentiment_counts
 
169
 
170
  elif file_path.endswith(('.xlsx', '.xls')):
171
  excel_file = pd.ExcelFile(file_path)
172
  for sheet_name in excel_file.sheet_names:
173
  df = pd.read_excel(file_path, sheet_name=sheet_name)
174
- processed_df, sentiment_counts = process_single_sheet(df, sheet_name)
175
  all_processed_dfs[sheet_name] = processed_df
176
  sentiment_results[sheet_name] = sentiment_counts
 
177
  else:
178
  raise ValueError("Unsupported file format. Please upload a CSV or Excel file.")
179
 
180
- # Create visualizations
181
- distribution_plot, summary_table = create_comparison_charts(sentiment_results)
 
 
182
 
183
  # Save results
184
  output_path = "sentiment_analysis_results.xlsx"
185
  with pd.ExcelWriter(output_path) as writer:
186
  for sheet_name, df in all_processed_dfs.items():
187
  df.to_excel(writer, sheet_name=sheet_name, index=False)
188
- summary_table.to_excel(writer, sheet_name='Summary', index=False)
189
-
190
- return (
191
- distribution_plot,
192
- summary_table,
193
- output_path
194
- )
195
-
196
- except Exception as e:
197
- raise gr.Error(str(e))
198
-
199
-
200
- # Create Gradio interface
201
- # In the Gradio interface section
202
- def create_comparison_charts(sentiment_results):
203
- """
204
- Create simplified, investment-focused comparison charts
205
- """
206
- # Prepare data
207
- plot_data = []
208
- for product, sentiment_counts in sentiment_results.items():
209
- sentiment_dict = sentiment_counts.to_dict()
210
- total = sum(sentiment_dict.values())
211
-
212
- row = {
213
- 'Product': product,
214
- 'Total Reviews': total
215
- }
216
- for sentiment, count in sentiment_dict.items():
217
- row[sentiment] = (count / total) * 100
218
- plot_data.append(row)
219
-
220
- df = pd.DataFrame(plot_data)
221
-
222
- # Ensure all sentiment columns exist
223
- for sentiment in ['Very Negative', 'Negative', 'Neutral', 'Positive', 'Very Positive']:
224
- if sentiment not in df.columns:
225
- df[sentiment] = 0
226
 
227
- # 1. Simple Stacked Bar Chart showing sentiment distribution
228
- stack_fig = go.Figure()
229
- sentiments = ['Very Positive', 'Positive', 'Neutral', 'Negative', 'Very Negative']
230
- colors = ['rgb(39, 174, 96)', 'rgb(46, 204, 113)',
231
- 'rgb(241, 196, 15)', 'rgb(231, 76, 60)',
232
- 'rgb(192, 57, 43)']
233
-
234
- for sentiment, color in zip(sentiments, colors):
235
- stack_fig.add_trace(go.Bar(
236
- name=sentiment,
237
- x=df['Product'],
238
- y=df[sentiment],
239
- marker_color=color
240
- ))
241
-
242
- stack_fig.update_layout(
243
- barmode='stack',
244
- title='Sentiment Distribution by Product',
245
- yaxis_title='Percentage (%)'
246
- )
247
-
248
- # 2. Aggregated Sentiment Ratios for Quick Comparison
249
- df['Positive_Total'] = df[['Positive', 'Very Positive']].sum(axis=1)
250
- df['Negative_Total'] = df[['Negative', 'Very Negative']].sum(axis=1)
251
 
252
- summary_df = pd.DataFrame({
253
- 'Product': df['Product'],
254
- 'Total Reviews': df['Total Reviews'],
255
- 'Positive (%)': df['Positive_Total'].round(2),
256
- 'Neutral (%)': df['Neutral'].round(2),
257
- 'Negative (%)': df['Negative_Total'].round(2)
258
- })
259
 
260
- # Sort by Positive percentage for easy comparison
261
- summary_df = summary_df.sort_values('Positive (%)', ascending=False)
262
 
263
- return stack_fig, summary_df
 
264
 
265
 
266
  # Update the Gradio interface
@@ -289,6 +322,9 @@ with gr.Blocks() as interface:
289
  with gr.Row():
290
  analyze_btn = gr.Button("Analyze Sentiments")
291
 
 
 
 
292
  with gr.Row():
293
  distribution_plot = gr.Plot(label="Sentiment Distribution")
294
 
@@ -301,8 +337,8 @@ with gr.Blocks() as interface:
301
  analyze_btn.click(
302
  fn=process_file,
303
  inputs=[file_input],
304
- outputs=[distribution_plot, summary_table, output_file]
305
  )
306
 
307
- # launch interface
308
- interface.launch()
 
11
  tokenizer = AutoTokenizer.from_pretrained(model_name)
12
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
13
 
14
+ # Define sentiment weights for score calculation
15
+ SENTIMENT_WEIGHTS = {
16
+ 0: 0.0, # Very Negative
17
+ 1: 0.25, # Negative
18
+ 2: 0.5, # Neutral
19
+ 3: 0.75, # Positive
20
+ 4: 1.0 # Very Positive
21
+ }
22
 
23
+
24
def predict_sentiment_with_scores(texts, batch_size=32):
    """
    Predict sentiment for a list of texts and return both class labels and sentiment scores.

    Args:
        texts: Review strings to classify.
        batch_size: Number of texts tokenized and sent through the model per
            forward pass. Bounding it keeps tensor sizes independent of how
            many reviews were uploaded.

    Returns:
        A ``(predicted_classes, sentiment_scores)`` tuple of two lists aligned
        with ``texts``: the label of the highest-probability class, and the
        probability-weighted score (using ``SENTIMENT_WEIGHTS``) scaled to
        0-100 and rounded to two decimals. Both lists are empty when ``texts``
        is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(texts)
    if not texts:
        # Nothing to score; skip the tokenizer/model round trip entirely.
        return [], []

    # Index in this tuple == class id produced by the model.
    sentiment_labels = (
        "Very Negative",
        "Negative",
        "Neutral",
        "Positive",
        "Very Positive",
    )

    predicted_classes = []
    sentiment_scores = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)

        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Get predicted classes
        predicted_classes.extend(
            sentiment_labels[class_id]
            for class_id in torch.argmax(probabilities, dim=-1).tolist()
        )

        # Calculate sentiment scores (0-100): weighted sum of the class
        # probabilities. One tolist() per batch replaces a .item() call per
        # element while keeping the arithmetic in plain Python floats.
        for row in probabilities.tolist():
            score = sum(p * SENTIMENT_WEIGHTS[class_id] for class_id, p in enumerate(row))
            sentiment_scores.append(round(score * 100, 2))

    return predicted_classes, sentiment_scores
53
 
54
 
55
  def process_single_sheet(df, product_name):
 
60
  raise ValueError(f"'Reviews' column not found in sheet/file for {product_name}")
61
 
62
  reviews = df['Reviews'].fillna("")
63
+ sentiments, scores = predict_sentiment_with_scores(reviews.tolist())
64
+
65
  df['Sentiment'] = sentiments
66
+ df['Sentiment_Score'] = scores
67
 
68
  # Calculate sentiment distribution
69
  sentiment_counts = pd.Series(sentiments).value_counts()
70
+ avg_sentiment_score = round(sum(scores) / len(scores), 2)
71
 
72
+ return df, sentiment_counts, avg_sentiment_score
73
 
74
 
75
+ def create_comparison_charts(sentiment_results, avg_scores):
76
  """
77
+ Create investment-focused comparison charts including the new sentiment score visualization
78
  """
79
  # Prepare data for plotting
80
  plot_data = []
81
  for product, sentiment_counts in sentiment_results.items():
 
82
  sentiment_dict = sentiment_counts.to_dict()
83
  total = sum(sentiment_dict.values())
84
 
 
93
 
94
  df = pd.DataFrame(plot_data)
95
 
96
+ # Ensure all sentiment columns exist in the correct order
97
+ sentiments = ['Very Positive', 'Positive', 'Neutral', 'Negative', 'Very Negative']
98
+ for sentiment in sentiments:
99
  if sentiment not in df.columns:
100
  df[sentiment] = 0
101
 
 
108
  'Very Positive': 100
109
  }
110
 
111
+ # Create stacked bar chart for sentiment distribution
112
+ distribution_fig = go.Figure()
113
+ sentiments = ['Very Positive', 'Positive', 'Neutral', 'Negative', 'Very Negative']
114
+ colors = ['rgb(39, 174, 96)', 'rgb(46, 204, 113)',
115
+ 'rgb(241, 196, 15)', 'rgb(231, 76, 60)',
116
+ 'rgb(192, 57, 43)']
 
117
 
118
+ for sentiment, color in zip(sentiments, colors):
119
+ distribution_fig.add_trace(go.Bar(
120
+ name=sentiment,
121
+ x=df['Product'],
122
+ y=df[sentiment],
123
+ marker_color=color
124
+ ))
125
+
126
+ distribution_fig.update_layout(
127
+ barmode='stack',
128
+ title='Sentiment Distribution by Product',
129
+ yaxis_title='Percentage (%)',
130
+ showlegend=True
 
131
  )
132
 
133
  # Calculate Positive-Negative Ratios
 
154
  yaxis_title='Percentage (%)'
155
  )
156
 
157
+ # Create summary DataFrame
158
+ summary_data = {
159
+ 'Product': df['Product'].tolist(),
160
+ 'Total Reviews': df['Total Reviews'].tolist(),
161
+ 'Positive Ratio (%)': df['Positive Ratio'].round(2).tolist(),
162
+ 'Negative Ratio (%)': df['Negative Ratio'].round(2).tolist(),
163
+ 'Neutral Ratio (%)': df['Neutral'].round(2).tolist(),
164
+ 'Weighted Sentiment Score': [avg_scores[prod] for prod in df['Product']]
165
+ }
166
+ summary_df = pd.DataFrame(summary_data)
167
+
168
+ # Create sentiment score chart
169
+ score_comparison_fig = go.Figure()
170
+ score_comparison_fig.add_trace(go.Bar(
171
+ x=summary_df['Product'],
172
+ y=summary_df['Weighted Sentiment Score'],
173
+ text=[f"{score:.1f}" for score in summary_df['Weighted Sentiment Score']],
174
+ textposition='auto',
175
+ marker_color='rgb(65, 105, 225)',
176
+ name='Sentiment Score'
177
+ ))
178
+ score_comparison_fig.update_layout(
179
+ title='Weighted Sentiment Scores by Product (0-100)',
180
+ yaxis_title='Sentiment Score',
181
+ yaxis_range=[0, 100],
182
+ showlegend=False,
183
+ bargap=0.3,
184
+ plot_bgcolor='white'
185
+ )
186
+
187
+ return score_comparison_fig, distribution_fig, ratio_fig, summary_df
188
+
189
+ products = list(avg_scores.keys())
190
+ scores = list(avg_scores.values())
191
 
192
+ # Add bars for sentiment scores
193
+ score_comparison_fig.add_trace(go.Bar(
194
+ x=products,
195
+ y=scores,
196
+ text=[f"{score:.1f}" for score in scores],
197
+ textposition='auto',
198
+ marker_color='rgb(65, 105, 225)',
199
+ name='Sentiment Score'
200
+ ))
201
 
202
+ # Update layout with appropriate styling
203
+ score_comparison_fig.update_layout(
204
+ title='Weighted Sentiment Scores by Product (0-100)',
205
+ yaxis_title='Sentiment Score',
206
+ yaxis_range=[0, 100],
207
+ showlegend=False,
208
+ bargap=0.3,
209
+ plot_bgcolor='white'
210
+ )
211
+
212
+ # Add score to summary DataFrame
213
+ summary_df['Weighted Sentiment Score'] = [avg_scores[prod] for prod in summary_df['Product']]
214
+
215
+ # Create sentiment distribution stacked bar chart
216
+ distribution_fig = go.Figure()
217
+ colors = ['rgb(39, 174, 96)', 'rgb(46, 204, 113)',
218
+ 'rgb(241, 196, 15)', 'rgb(231, 76, 60)',
219
+ 'rgb(192, 57, 43)']
220
+
221
+ # Add traces for each sentiment in order
222
+ for sentiment, color in zip(sentiments, colors):
223
+ distribution_fig.add_trace(go.Bar(
224
+ name=sentiment,
225
+ x=df['Product'],
226
+ y=df[sentiment],
227
+ marker_color=color
228
+ ))
229
+
230
+ distribution_fig.update_layout(
231
+ barmode='stack',
232
+ title='Sentiment Distribution by Product',
233
+ yaxis_title='Percentage (%)',
234
+ showlegend=True
235
+ )
236
 
237
+ return score_comparison_fig, distribution_fig, summary_df, output_path
238
 
239
 
240
  def process_file(file_obj):
 
244
  try:
245
  file_path = file_obj.name
246
  sentiment_results = defaultdict(pd.Series)
247
+ avg_sentiment_scores = {}
248
  all_processed_dfs = {}
249
 
250
  if file_path.endswith('.csv'):
251
  df = pd.read_csv(file_path)
252
  product_name = "Product" # Default name for CSV
253
+ processed_df, sentiment_counts, avg_score = process_single_sheet(df, product_name)
254
  all_processed_dfs[product_name] = processed_df
255
  sentiment_results[product_name] = sentiment_counts
256
+ avg_sentiment_scores[product_name] = avg_score
257
 
258
  elif file_path.endswith(('.xlsx', '.xls')):
259
  excel_file = pd.ExcelFile(file_path)
260
  for sheet_name in excel_file.sheet_names:
261
  df = pd.read_excel(file_path, sheet_name=sheet_name)
262
+ processed_df, sentiment_counts, avg_score = process_single_sheet(df, sheet_name)
263
  all_processed_dfs[sheet_name] = processed_df
264
  sentiment_results[sheet_name] = sentiment_counts
265
+ avg_sentiment_scores[sheet_name] = avg_score
266
  else:
267
  raise ValueError("Unsupported file format. Please upload a CSV or Excel file.")
268
 
269
+ # Create visualizations with new sentiment score chart
270
+ score_comparison_fig, distribution_fig, ratio_fig, summary_df = create_comparison_charts(
271
+ sentiment_results, avg_sentiment_scores
272
+ )
273
 
274
  # Save results
275
  output_path = "sentiment_analysis_results.xlsx"
276
  with pd.ExcelWriter(output_path) as writer:
277
  for sheet_name, df in all_processed_dfs.items():
278
  df.to_excel(writer, sheet_name=sheet_name, index=False)
279
+ if isinstance(summary_df, pd.DataFrame): # Safety check
280
+ summary_df.to_excel(writer, sheet_name='Summary', index=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
 
282
+ # Save results
283
+ output_path = "sentiment_analysis_results.xlsx"
284
+ with pd.ExcelWriter(output_path) as writer:
285
+ # Save individual sheet data
286
+ for sheet_name, df in all_processed_dfs.items():
287
+ df.to_excel(writer, sheet_name=sheet_name, index=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
+ # Save summary data
290
+ if isinstance(summary_df, pd.DataFrame): # Ensure it's a DataFrame before saving
291
+ summary_df.to_excel(writer, sheet_name='Summary', index=False)
 
 
 
 
292
 
293
+ return score_comparison_fig, distribution_fig, summary_df, output_path
 
294
 
295
+ except Exception as e:
296
+ raise gr.Error(str(e))
297
 
298
 
299
  # Update the Gradio interface
 
322
  with gr.Row():
323
  analyze_btn = gr.Button("Analyze Sentiments")
324
 
325
+ with gr.Row():
326
+ sentiment_score_plot = gr.Plot(label="Weighted Sentiment Scores")
327
+
328
  with gr.Row():
329
  distribution_plot = gr.Plot(label="Sentiment Distribution")
330
 
 
337
  analyze_btn.click(
338
  fn=process_file,
339
  inputs=[file_input],
340
+ outputs=[sentiment_score_plot, distribution_plot, summary_table, output_file]
341
  )
342
 
343
+ # Launch interface
344
+ interface.launch()