Update app.py
app.py CHANGED
@@ -21,7 +21,7 @@ from transformers import pipeline
 # Set an environment variable
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
-
+# Variables
 SAMPLE_RATE = 16000 # Hz
 MAX_AUDIO_SECONDS = 40 # wont try to transcribe if longer than this
 DESCRIPTION = '''
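As context for the new # Variables block: these constants presumably gate transcription length elsewhere in app.py. A minimal sketch of that kind of check, assuming the soundfile library is available; check_duration is a hypothetical helper, not code from this commit:

import soundfile as sf

SAMPLE_RATE = 16000       # Hz, mirrors the constant above
MAX_AUDIO_SECONDS = 40    # refuse clips longer than this

def check_duration(audio_filepath):
    """Hypothetical gate: True if the clip is short enough to transcribe."""
    audio, sr = sf.read(audio_filepath)   # sf.read returns (samples, rate)
    return len(audio) / sr <= MAX_AUDIO_SECONDS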
@@ -35,16 +35,10 @@ DESCRIPTION = '''
 '''
 PLACEHOLDER = """
 <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-    <img src="
+    <img src="https://i.ibb.co/S35q17Q/My-Alexa-Logo.png" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55; ">
     <p style="font-size: 28px; margin-bottom: 2px; opacity: 0.65;">What's on your mind?</p>
 </div>
 """
-# PLACEHOLDER = """
-# <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-#    <img src="https://i.ibb.co/S35q17Q/My-Alexa-Logo.png" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55; ">
-#    <p style="font-size: 28px; margin-bottom: 2px; opacity: 0.65;">What's on your mind?</p>
-# </div>
-# """
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -73,6 +67,8 @@ terminators = [
 ### TTS model
 pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)
 
+
+
 def convert_audio(audio_filepath, tmpdir, utt_id):
     """
     Convert all files to monochannel 16 kHz wav files.
@@ -197,10 +193,15 @@ def voice_player(history):
     Plays the generated response using the VITS-ljs model.
     Returns the audio player with the generated response.
     """
-    _, text = history
+    _, text = history[-1]
     voice = pipe(text)
-    voice = gr.Audio(value = (
-
+    voice = gr.Audio(value = (
+        voice["sampling_rate"],
+        voice["audio"].squeeze()),
+        type="numpy", autoplay=True,
+        label="MyAlexa Response",
+        show_label=True,
+        visible=True)
     return voice
 
 
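The rewritten voice_player relies on the transformers text-to-speech pipeline returning a dict with "audio" (for VITS, a numpy array of shape (1, num_samples)) and "sampling_rate", and on gr.Audio accepting a (rate, 1-D array) tuple as its value. A self-contained sketch of that pattern; the prompt string is a stand-in:

import gradio as gr
from transformers import pipeline

pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

out = pipe("What's on your mind?")   # dict: {"audio": ..., "sampling_rate": ...}
player = gr.Audio(
    value=(out["sampling_rate"], out["audio"].squeeze()),  # squeeze drops the channel dim
    type="numpy",
    autoplay=True,
)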
@@ -226,7 +227,10 @@ with gr.Blocks(
         "<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
     )
 
-    audio_file = gr.Audio(
+    audio_file = gr.Audio(
+        sources=["microphone", "upload"],
+        type="filepath"
+    )
 
 
     with gr.Column():
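With type="filepath", the new audio_file component hands callbacks a path string (or None when empty) rather than raw samples. A sketch under that assumption; the transcribe handler is hypothetical, standing in for the app's ASR step:

import gradio as gr

def transcribe(audio_filepath):   # hypothetical handler, not the app's
    if audio_filepath is None:
        return "No audio provided."
    return f"Would run ASR on: {audio_filepath}"

with gr.Blocks() as demo:
    audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
    text = gr.Textbox(label="Transcribed text:", interactive=False)
    audio_file.change(transcribe, inputs=audio_file, outputs=text)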
@@ -238,7 +242,7 @@ with gr.Blocks(
         variant="primary"
     )
 
-    chat_input = gr.Textbox(
+    chat_input = gr.Textbox( # Shows the transcribed text
         label="Transcribed text:",
         interactive=False,
         placeholder="Transcribed text will appear here.",
@@ -246,12 +250,12 @@ with gr.Blocks(
         visible=True # set to True to see processing time of asr transcription
     )
 
-    out_audio = gr.Audio(
+    out_audio = gr.Audio( # Shows an audio player for the generated response
         value = None,
         label="Response Voice Player",
         show_label=True,
         visible=True # set to True to see processing time of tts audio generation
-    )
+    )
 
     chat_msg = chat_input.change(add_message, [chatbot, chat_input], [chatbot], api_name="add_message_in_chatbot")
     bot_msg = chat_msg.then(bot, [chatbot, chat_input], chatbot, api_name="bot_response_in_chatbot")
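The final two lines chain Gradio events: .change() fires whenever the transcribed text updates, and .then() runs the bot step only after add_message finishes. A minimal sketch of that chaining with stand-in handler bodies (the real add_message and bot live elsewhere in app.py):

import gradio as gr

def add_message(history, message):   # stand-in: append the user turn
    return history + [[message, None]]

def bot(history, message):           # stand-in: fill in the assistant turn
    history[-1][1] = f"Echo: {message}"
    return history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    chat_input = gr.Textbox()
    chat_msg = chat_input.change(add_message, [chatbot, chat_input], [chatbot])
    bot_msg = chat_msg.then(bot, [chatbot, chat_input], chatbot)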