storresbusquets committed
Commit 86a552a
1 Parent(s): 14cf752

Update app.py

Files changed (1)
  1. app.py +40 -23
app.py CHANGED
@@ -28,7 +28,11 @@ class GradioInference:
         self.yt = None
 
         # Initialize summary model for English
-        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn", truncation=True)
+        self.bart_summarizer = pipeline("summarization", model="facebook/bart-large-cnn", truncation=True)
+
+        # Initialize Multilingual summary model
+        self.mt5_tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum", truncation=True)
+        self.mt5_model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
 
         # Initialize VoiceLabT5 model and tokenizer
         self.keyword_model = T5ForConditionalGeneration.from_pretrained(
@@ -41,9 +45,6 @@ class GradioInference:
         # Sentiment Classifier
         self.classifier = pipeline("text-classification", model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", return_all_scores=False)
 
-        # Initialize Multilingual summary model
-        self.tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum", truncation=True)
-        self.model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
 
     def __call__(self, link, lang, size, progress=gr.Progress()):
         """
@@ -57,6 +58,7 @@ class GradioInference:
         - WordCloud: using the wordcloud python library.
         """
         progress(0, desc="Starting analysis")
+
         if self.yt is None:
             self.yt = YouTube(link)
 
@@ -78,14 +80,18 @@ class GradioInference:
         progress(0.40, desc="Summarizing")
 
         # Perform summarization on the transcription
-        transcription_summary = self.summarizer(
-            results["text"], max_length=256, min_length=30, do_sample=False, truncation=True
+        transcription_summary = self.bart_summarizer(
+            results["text"],
+            max_length=256,
+            min_length=30,
+            do_sample=False,
+            truncation=True
         )
 
-        #### Multilingual summary
+        #### Multilingual summary with mT5
         WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
 
-        input_ids_sum = self.tokenizer(
+        input_ids_sum = self.mt5_tokenizer(
             [WHITESPACE_HANDLER(results["text"])],
             return_tensors="pt",
             padding="max_length",
@@ -93,14 +99,14 @@ class GradioInference:
             max_length=512
         )["input_ids"]
 
-        output_ids_sum = self.model.generate(
+        output_ids_sum = self.mt5_model.generate(
             input_ids=input_ids_sum,
-            max_length=130,
+            max_length=256,
             no_repeat_ngram_size=2,
             num_beams=4
         )[0]
 
-        summary = self.tokenizer.decode(
+        summary = self.mt5_tokenizer.decode(
             output_ids_sum,
             skip_special_tokens=True,
             clean_up_tokenization_spaces=False
@@ -112,12 +118,19 @@ class GradioInference:
         # Extract keywords using VoiceLabT5
         task_prefix = "Keywords: "
         input_sequence = task_prefix + results["text"]
+
         input_ids = self.keyword_tokenizer(
-            input_sequence, return_tensors="pt", truncation=False
+            input_sequence,
+            return_tensors="pt",
+            truncation=False
         ).input_ids
+
         output = self.keyword_model.generate(
-            input_ids, no_repeat_ngram_size=3, num_beams=4
+            input_ids,
+            no_repeat_ngram_size=3,
+            num_beams=4
         )
+
         predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
         keywords = [x.strip() for x in predicted.split(",") if x.strip()]
         formatted_keywords = "\n".join([f"• {keyword}" for keyword in keywords])
@@ -201,14 +214,14 @@ class GradioInference:
         progress(0.40, desc="Summarizing")
 
         # Perform summarization on the transcription
-        transcription_summary = self.summarizer(
+        transcription_summary = self.bart_summarizer(
             results["text"], max_length=150, min_length=30, do_sample=False, truncation=True
         )
 
         #### Multilingual summary
         WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
 
-        input_ids_sum = self.tokenizer(
+        input_ids_sum = self.mt5_tokenizer(
             [WHITESPACE_HANDLER(results["text"])],
             return_tensors="pt",
             padding="max_length",
@@ -216,14 +229,14 @@ class GradioInference:
             max_length=512
         )["input_ids"]
 
-        output_ids_sum = self.model.generate(
+        output_ids_sum = self.mt5_model.generate(
             input_ids=input_ids_sum,
             max_length=130,
             no_repeat_ngram_size=2,
             num_beams=4
         )[0]
 
-        summary = self.tokenizer.decode(
+        summary = self.mt5_tokenizer.decode(
             output_ids_sum,
             skip_special_tokens=True,
             clean_up_tokenization_spaces=False
@@ -235,11 +248,17 @@ class GradioInference:
         # Extract keywords using VoiceLabT5
         task_prefix = "Keywords: "
         input_sequence = task_prefix + results["text"]
+
         input_ids = self.keyword_tokenizer(
-            input_sequence, return_tensors="pt", truncation=False
+            input_sequence,
+            return_tensors="pt",
+            truncation=False
        ).input_ids
+
         output = self.keyword_model.generate(
-            input_ids, no_repeat_ngram_size=3, num_beams=4
+            input_ids,
+            no_repeat_ngram_size=3,
+            num_beams=4
         )
         predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
         keywords = [x.strip() for x in predicted.split(",") if x.strip()]
@@ -267,10 +286,9 @@ class GradioInference:
         )
         wordcloud_image = wordcloud.to_image()
 
-        if lang == "english":
+        if lang == "english" or lang == "none":
             return (
                 results["text"],
-                # summ,
                 transcription_summary[0]["summary_text"],
                 formatted_keywords,
                 formatted_sentiment,
@@ -279,7 +297,6 @@ class GradioInference:
         else:
             return (
                 results["text"],
-                # summ,
                 summary,
                 formatted_keywords,
                 formatted_sentiment,
@@ -306,7 +323,7 @@ with block as demo:
         </div>
         """
     )
-    with gr.Group():
+    with gr.Group(spacing_size="md", radius_size="md"):
        with gr.Tab("From YouTube 📹"):
            with gr.Box():
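For reference, the two summarization paths this commit renames and wires together can be exercised standalone. A minimal sketch, assuming transformers and torch are installed: the model names and generation settings come from the hunks above, while the summarize helper, its lang argument, and the explicit truncation=True in the mT5 tokenizer call (that line falls outside the hunk context) are illustrative assumptions.

import re
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# English path: BART summarization pipeline (model name from the diff).
bart_summarizer = pipeline("summarization", model="facebook/bart-large-cnn", truncation=True)

# Multilingual path: mT5 fine-tuned on XL-Sum (model name from the diff).
mt5_tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
mt5_model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")

# Collapse newlines and whitespace runs, as the app does before mT5.
WHITESPACE_HANDLER = lambda k: re.sub(r"\s+", " ", re.sub(r"\n+", " ", k.strip()))

def summarize(text: str, lang: str) -> str:  # hypothetical helper
    if lang == "english":
        # The pipeline returns a list of dicts keyed by "summary_text".
        result = bart_summarizer(
            text, max_length=256, min_length=30, do_sample=False, truncation=True
        )
        return result[0]["summary_text"]
    # Tokenize padded/truncated to 512 tokens, then beam-search decode.
    input_ids = mt5_tokenizer(
        [WHITESPACE_HANDLER(text)],
        return_tensors="pt",
        padding="max_length",
        truncation=True,  # assumed; not visible in the hunk context
        max_length=512,
    )["input_ids"]
    output_ids = mt5_model.generate(
        input_ids=input_ids,
        max_length=256,
        no_repeat_ngram_size=2,
        num_beams=4,
    )[0]
    return mt5_tokenizer.decode(
        output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )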
 
 
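The keyword-extraction step reformatted by this commit follows the same T5 generate/decode pattern in both branches. A minimal sketch under the same assumptions: the generation settings come from the hunks, but the VoiceLabT5 checkpoint name is truncated in the diff, so the model and tokenizer are passed in rather than loaded, and extract_keywords is a hypothetical helper.

from transformers import T5ForConditionalGeneration, T5Tokenizer

def extract_keywords(
    text: str,
    model: T5ForConditionalGeneration,
    tokenizer: T5Tokenizer,
) -> list[str]:
    # Prepend the task prefix used by the app; tokenize without truncation.
    input_ids = tokenizer(
        "Keywords: " + text,
        return_tensors="pt",
        truncation=False,
    ).input_ids
    # Beam search with the settings from the diff.
    output = model.generate(input_ids, no_repeat_ngram_size=3, num_beams=4)
    predicted = tokenizer.decode(output[0], skip_special_tokens=True)
    # The model emits a comma-separated keyword list.
    return [x.strip() for x in predicted.split(",") if x.strip()]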