Spaces:

Siddhant
/

Voice_Assistant_Demo

Running on L40S

App Files Community

“siddhu001” committed 26 days ago

Commit

71fd664

1 Parent(s): 6ebbb2b

Add input text box

Browse files

Files changed (1) hide show

app.py +25 -1

app.py CHANGED Viewed

@@ -283,6 +283,7 @@ def start_warmup():
         except Exception:
             print("Removing " + opt + " from ASR options since it cannot be loaded.")
             ASR_options = ASR_options[:opt_count] + ASR_options[(opt_count + 1) :]
             if opt == ASR_name:
                 ASR_name = ASR_options[0]
     for opt_count in range(len(LLM_options)):
@@ -345,6 +346,7 @@ def transcribe(
     ASR_option: str,
     LLM_option: str,
     type_option: str,
 ):
     """
     Processes and transcribes an audio stream in real-time.
@@ -420,6 +422,15 @@ def transcribe(
         audio_output1 = None
     else:
         stream = np.concatenate((stream, y))
     (
         asr_output_str,
         text_str,
@@ -512,6 +523,13 @@ with gr.Blocks(
             (https://github.com/siddhu001/espnet/tree/sds_demo_recipe/egs2/TEMPLATE/sds1#how-to-use).
         """
         )
     with gr.Row():
         with gr.Column(scale=1):
             user_audio = gr.Audio(
@@ -519,6 +537,12 @@ with gr.Blocks(
                 streaming=True,
                 waveform_options=gr.WaveformOptions(sample_rate=16000),
             )
             with gr.Row():
                 type_radio = gr.Radio(
                     choices=["Cascaded", "E2E"],
@@ -686,7 +710,7 @@ with gr.Blocks(
     )
     user_audio.stream(
         transcribe,
-        inputs=[state, user_audio, radio, ASR_radio, LLM_radio, type_radio],
         outputs=[state, output_asr_text, output_text, output_audio, output_audio1],
     ).then(
         lambda *args: callback.flag(list(args)), [user_audio], None, preprocess=False

         except Exception:
             print("Removing " + opt + " from ASR options since it cannot be loaded.")
             ASR_options = ASR_options[:opt_count] + ASR_options[(opt_count + 1) :]
+            opt_count -=1
             if opt == ASR_name:
                 ASR_name = ASR_options[0]
     for opt_count in range(len(LLM_options)):
     ASR_option: str,
     LLM_option: str,
     type_option: str,
+    input_text: str,
 ):
     """
     Processes and transcribes an audio stream in real-time.
         audio_output1 = None
     else:
         stream = np.concatenate((stream, y))
+    # import pdb;pdb.set_trace()
+    dialogue_model.chat.init_chat(
+        {
+            "role": "system",
+            "content": (
+                input_text
+            ),
+        }
+    )
     (
         asr_output_str,
         text_str,
             (https://github.com/siddhu001/espnet/tree/sds_demo_recipe/egs2/TEMPLATE/sds1#how-to-use).
         """
         )
+    default_instruct=(
+        "You are a helpful and friendly AI "
+        "assistant. "
+        "You are polite, respectful, and aim to "
+        "provide concise and complete responses of "
+        "less than 15 words."
+    )
     with gr.Row():
         with gr.Column(scale=1):
             user_audio = gr.Audio(
                 streaming=True,
                 waveform_options=gr.WaveformOptions(sample_rate=16000),
             )
+            input_text=gr.Textbox(
+                label="LLM prompt",
+                visible=True,
+                interactive=True,
+                value=default_instruct
+            )
             with gr.Row():
                 type_radio = gr.Radio(
                     choices=["Cascaded", "E2E"],
     )
     user_audio.stream(
         transcribe,
+        inputs=[state, user_audio, radio, ASR_radio, LLM_radio, type_radio, input_text],
         outputs=[state, output_asr_text, output_text, output_audio, output_audio1],
     ).then(
         lambda *args: callback.flag(list(args)), [user_audio], None, preprocess=False