Spaces · Runtime error
codenamewei committed
Commit d675859
1 Parent(s): bbe5135

Input with mic and file
Files changed:
- .ipynb_checkpoints/app-checkpoint.py +58 -0
- app.py +35 -20
- example3.flac +0 -0
- requirements.txt +1 -1
- temp_file.wav +0 -0
- temp_mic.wav +0 -0
.ipynb_checkpoints/app-checkpoint.py
ADDED
@@ -0,0 +1,58 @@
+import gradio as gr
+from transformers import Wav2Vec2Processor
+from transformers import AutoModelForCTC
+from conversationalnlp.models.wav2vec2 import Wav2Vec2Predict
+from conversationalnlp.models.wav2vec2 import ModelLoader
+from conversationalnlp.utils import *
+import soundfile as sf
+import os
+
+"""
+run gradio with
+>>python app.py
+"""
+
+audiosavepath = os.getcwd()
+
+pretrained_model = "codenamewei/speech-to-text"
+
+processor = Wav2Vec2Processor.from_pretrained(
+    pretrained_model)
+
+model = AutoModelForCTC.from_pretrained(
+    pretrained_model)
+
+modelloader = ModelLoader(model, processor)
+
+predictor = Wav2Vec2Predict(modelloader)
+
+examples = ["example1.flac", "example2.flac", "example3.flac"]
+
+
+def greet(audioarray):
+    """
+    audio array in the following format
+
+    (16000, array([ -5277184,    326400,   -120320, ...,  -5970432, -12745216,
+        -6934528], dtype=int32))
+    <class 'tuple'>
+    """
+    audioabspath = os.path.join(audiosavepath, "temp.wav")
+
+    # WORKAROUND: Save to file and reread to get the array shape needed for prediction
+    sf.write(audioabspath, audioarray[1], audioarray[0])
+
+    print(f"Audio at path {audioabspath}")
+    predictiontexts = predictor.predictfiles([audioabspath])
+    outputtext = predictiontexts["predicted_text"][-1] + \
+        "\n" + predictiontexts["corrected_text"][-1]
+
+    return outputtext
+
+
+demo = gr.Interface(fn=greet, inputs="audio",
+                    outputs="text",
+                    title="Speech-to-Text",
+                    examples=examples)
+
+demo.launch() # share=True)
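greet() above receives the recording as a (sample_rate, int32 array) tuple, as its docstring shows, and the WORKAROUND comment marks the trick: write the array to disk with soundfile so predictfiles() can reread it in a shape the predictor can consume. A minimal sketch of that save-and-reread round trip, assuming only numpy and soundfile and no model download ("temp.wav" mirrors the scratch path above):

import numpy as np
import soundfile as sf

# The tuple shape Gradio's "audio" input delivers: (sample_rate, int32 PCM samples).
samplerate = 16000
audioarray = (samplerate, np.zeros(samplerate, dtype=np.int32))  # one second of silence

# Persist exactly as greet() does: sf.write(path, data, samplerate).
sf.write("temp.wav", audioarray[1], audioarray[0])

# Rereading yields float64 samples in [-1.0, 1.0] with shape (frames,) for mono audio.
data, samplerate_read = sf.read("temp.wav")
print(samplerate_read, data.dtype, data.shape)  # 16000 float64 (16000,)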
app.py
CHANGED
@@ -8,11 +8,13 @@ import soundfile as sf
 import os
 
 """
-run gradio with
+run gradio with
 >>python app.py
 """
 
-audiosavepath = os.getcwd()
+audioheaderpath = os.path.join(
+    os.getcwd(), "temp")
+
 
 pretrained_model = "codenamewei/speech-to-text"
 
@@ -26,33 +28,46 @@ modelloader = ModelLoader(model, processor)
 
 predictor = Wav2Vec2Predict(modelloader)
 
-examples = ["example1.flac", "example2.flac", "example3.flac"]
+audiofileexamples = ["example1.flac", "example2.flac"]
 
+fileextension = ".wav"
+
 
-def greet(audioarray):
+def greet(*args):
     """
-    audio array in the following format
-
-    (16000, array([ -5277184,    326400,   -120320, ...,  -5970432, -12745216,
-        -6934528], dtype=int32))
-    <class 'tuple'>
+    List[tuple, tuple]
+    mic: param[0] (int, np.array)
+    audiofile: param[1] (int, np.array)
     """
-    audioabspath = os.path.join(audiosavepath, "temp.wav")
 
-    # WORKAROUND: Save to file and reread to get the array shape needed for prediction
-    sf.write(audioabspath, audioarray[1], audioarray[0])
+    dictinput = dict(mic=args[0], file=args[1])
+    audiofiles = []
 
-    print(f"Audio at path {audioabspath}")
-    predictiontexts = predictor.predictfiles([audioabspath])
-    outputtext = predictiontexts["predicted_text"][-1] + \
-        "\n" + predictiontexts["corrected_text"][-1]
+    for key, audioarray in dictinput.items():
+
+        if audioarray is not None:
+            # WORKAROUND: Save to file and reread to get the array shape needed for prediction
+
+            audioabspath = audioheaderpath + "_" + key + fileextension
+            print(f"Audio at path {audioabspath}")
+            sf.write(audioabspath,
+                     audioarray[1], audioarray[0])
+            audiofiles.append(audioabspath)
 
-    return outputtext
+    predictiontexts = predictor.predictfiles(audiofiles)
+
+    mictext = predictiontexts["predicted_text"][0] + "\n" + \
+        predictiontexts["corrected_text"][0] if dictinput['mic'] is not None else ""
+    filetext = predictiontexts["predicted_text"][-1] + "\n" + \
+        predictiontexts["corrected_text"][-1] if dictinput['file'] is not None else ""
+
+    return [mictext, filetext]
 
 
-demo = gr.Interface(fn=greet, inputs="audio",
-                    outputs="text",
+demo = gr.Interface(fn=greet,
+                    inputs=["mic", "audio"],
+                    outputs=["text", "text"],
                     title="Speech-to-Text",
-                    examples=examples)
+                    examples=[audiofileexamples])
 
 demo.launch() # share=True)
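Two details in the rewritten greet() are easy to misread. First, predictfiles() is called once on the queued list, with the mic recording (if any) first and the uploaded file (if any) last, so index 0 and index -1 pick out the two transcripts; when only one input is present, 0 and -1 refer to the same single result and the guards blank out the absent side. Second, those guards lean on Python's operator precedence: + binds tighter than the conditional expression, so the whole two-line concatenation forms the if-branch. A standalone illustration with dummy strings, no model involved:

# Parsed as (predicted + "\n" + corrected) if mic is not None else ""
predicted, corrected = "HELO WORLD", "hello world"

mic = None
mictext = predicted + "\n" + corrected if mic is not None else ""
assert mictext == ""  # entire concatenation skipped, not just the final term

mic = (16000, [0, 0, 0])  # stand-in for a real (sample_rate, array) tuple
mictext = predicted + "\n" + corrected if mic is not None else ""
assert mictext == "HELO WORLD\nhello world"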
example3.flac
DELETED
Binary file (239 kB)
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
 gradio==3.0.5
-conversationalnlp==0.0.
+conversationalnlp==0.0.4
 transformers==4.20.1
 SoundFile==0.10.3post1
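The pin is bumped to conversationalnlp==0.0.4, the package supplying the ModelLoader and Wav2Vec2Predict imports used in app.py. A quick standard-library sanity check that the pinned distributions resolve at runtime (expected versions copied from requirements.txt above):

from importlib.metadata import PackageNotFoundError, version

for package, expected in [("gradio", "3.0.5"),
                          ("conversationalnlp", "0.0.4"),
                          ("transformers", "4.20.1"),
                          ("SoundFile", "0.10.3post1")]:
    try:
        print(f"{package}: installed {version(package)}, pinned {expected}")
    except PackageNotFoundError:
        print(f"{package}: not installed (pinned {expected})")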
temp_file.wav
ADDED
Binary file (93.2 kB)
temp_mic.wav
ADDED
Binary file (300 kB)