Update app.py
app.py
CHANGED
@@ -1,10 +1,44 @@
 import gradio as gr
+import cv2
+import easyocr
+import numpy as np
+import requests
+import os
 import whisper
 from transformers import pipeline
 
+API_KEY = os.getenv("API_KEY")
+
+API_URL = "https://api-inference.huggingface.co/models/dima806/facial_emotions_image_detection"
+headers = {"Authorization": "Bearer " + API_KEY}
+
+reader = easyocr.Reader(['en'], gpu=False)
+
 model = whisper.load_model("base")
 sentiment_analysis = pipeline("sentiment-analysis", framework="pt", model="SamLowe/roberta-base-go_emotions")
 
+def query(image):
+    image_data = np.array(image, dtype=np.uint8)
+    _, buffer = cv2.imencode('.jpg', image_data)
+    binary_data = buffer.tobytes()
+
+    response = requests.post(API_URL, headers=headers, data=binary_data)
+    return response.json()
+
+def text_extraction(image):
+    global text_content
+    text_content = ''
+    facial_data = query(image)
+    text_ = reader.readtext(image)
+    threshold = 0.25
+    for t_, t in enumerate(text_):
+        bbox, text, score = t
+        text_content = text_content + ' ' + text
+        if score > threshold:
+            cv2.rectangle(image, tuple(map(int, bbox[0])), tuple(map(int, bbox[2])), (0, 255, 0), 5)
+
+    return image, text_content, facial_data
+
 def analyze_sentiment(text):
     results = sentiment_analysis(text)
     sentiment_results = {result['label']: result['score'] for result in results}
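The query() helper added above JPEG-encodes the image with OpenCV and posts the raw bytes to the Hugging Face Inference API endpoint, returning the parsed JSON. Hosted models can take a moment to spin up, so a slightly more defensive variant is sketched below; it reuses the module-level API_URL and headers from app.py, and the function name, retry count, and timeout are assumptions rather than part of this commit.

import time

def query_with_retry(image, retries=3, wait=5):
    # Encode the image the same way query() does above.
    image_data = np.array(image, dtype=np.uint8)
    _, buffer = cv2.imencode('.jpg', image_data)
    binary_data = buffer.tobytes()

    for _ in range(retries):
        response = requests.post(API_URL, headers=headers, data=binary_data, timeout=30)
        if response.status_code == 503:
            # The hosted model may still be loading; wait and retry (assumed behaviour).
            time.sleep(wait)
            continue
        response.raise_for_status()
        return response.json()
    return {"error": "facial emotion model did not respond in time"}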
@@ -54,7 +88,9 @@ def display_sentiment_results(sentiment_results, option):
         sentiment_text += f"{sentiment} {emoji}: {score}\n"
     return sentiment_text
 
-def inference(audio, sentiment_option):
+def inference(image, text, audio, sentiment_option):
+    extracted_image, extracted_text, extracted_facial_data = text_extraction(image)
+
     audio = whisper.load_audio(audio)
     audio = whisper.pad_or_trim(audio)
 
@@ -66,15 +102,20 @@
     options = whisper.DecodingOptions(fp16=False)
     result = whisper.decode(model, mel, options)
 
-    sentiment_results = analyze_sentiment(result.text)
-    sentiment_output = display_sentiment_results(sentiment_results, sentiment_option)
+    audio_sentiment_results = analyze_sentiment(result.text)     # Ta - text from audio
+    image_sentiment_results = analyze_sentiment(extracted_text)  # Ti - text from image
+    text_sentiment_results = analyze_sentiment(text)             # T  - user-defined text
+
+    audio_sentiment_output = display_sentiment_results(audio_sentiment_results, sentiment_option)
+    image_sentiment_output = display_sentiment_results(image_sentiment_results, sentiment_option)
+    text_sentiment_output = display_sentiment_results(text_sentiment_results, sentiment_option)
 
-    return lang.upper(), result.text, sentiment_output
+    return extracted_image, extracted_facial_data, extracted_text, image_sentiment_output, text_sentiment_output, lang.upper(), result.text, audio_sentiment_output
 
-title = """<h1 align="center"
+title = """<h1 align="center">Cross Model Machine Learning (Sentiment Analysis)</h1>"""
 image_path = "thmbnail.jpg"
 description = """
-💻 This demo showcases a
+💻 This demo showcases a Cross Model Machine Learning tool for Sentiment Analysis.<br><br>
 <br>
 ⚙️ Components of the tool:<br>
 <br>
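With the new signature, inference() fans sentiment analysis out over three text sources (the Whisper transcription, the OCR text pulled from the image, and the user-supplied text) and returns eight values in the order the Gradio outputs below expect. A minimal way to sanity-check it outside the UI is to call it directly; the sample file names here are placeholders, not assets shipped with this commit:

img = cv2.imread("sample_image.jpg")   # any image containing readable text (placeholder path)

(annotated_img, facial_data, ocr_text,
 ocr_sentiment, user_text_sentiment,
 language, transcription, audio_sentiment) = inference(
    img, "I really enjoyed this demo", "sample_audio.wav", "Sentiment Only")

print(language)          # e.g. "EN"
print(transcription)     # Whisper transcript of the audio file
print(ocr_text)          # text found in the image by EasyOCR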
@@ -121,34 +162,32 @@ with block:
         with gr.Column():
             gr.HTML(description)
 
-        with gr.
+    with gr.Blocks():
         with gr.Row():
-            </div>
-        ''')
+            with gr.Column():
+                image = gr.Image()
+
+                image_output = gr.Image()
+                text_output = gr.Textbox(label="Text Content")
+                text_sentiment = gr.Textbox(label="Text Sentiment")
+                facial_output = gr.JSON(label="Facial Data")
+
+            with gr.Column():
+                gr.Textbox(label="Text Content")
+
+                output_text_sentiment = gr.Textbox(label="Text Sentiment")
+
+            with gr.Column():
+                audio = gr.Audio(label="Input Audio", show_label=False, type="filepath")
+                sentiment_option = gr.Radio(choices=["Sentiment Only", "Sentiment + Score"], label="Select an option")
+
+                lang_str = gr.Textbox(label="Language")
+                text = gr.Textbox(label="Transcription")
+                sentiment_output = gr.Textbox(label="Sentiment Analysis Results")
+
+
+        btn = gr.Button("Transcribe")
+
+        btn.click(inference, inputs=[image, text, audio, sentiment_option], outputs=[image_output, facial_output, text_output, text_sentiment, output_text_sentiment, lang_str, text, sentiment_output])
 
 block.launch()
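As committed, app.py now imports cv2, easyocr, numpy, and requests alongside gradio, whisper, and the transformers pipeline, and it reads the Inference API token from the API_KEY environment variable. This commit only touches app.py, so those extra packages (typically opencv-python for cv2 and openai-whisper for the whisper module, plus torch for the sentiment pipeline) and the API_KEY secret have to be provided separately before block.launch() can run.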