Spaces · Runtime error
codenamewei committed
Commit d675859
1 Parent(s): bbe5135

Input with mic and file
Files changed:
- .ipynb_checkpoints/app-checkpoint.py +58 -0
- app.py +35 -20
- example3.flac +0 -0
- requirements.txt +1 -1
- temp_file.wav +0 -0
- temp_mic.wav +0 -0
.ipynb_checkpoints/app-checkpoint.py
ADDED
@@ -0,0 +1,58 @@
+import gradio as gr
+from transformers import Wav2Vec2Processor
+from transformers import AutoModelForCTC
+from conversationalnlp.models.wav2vec2 import Wav2Vec2Predict
+from conversationalnlp.models.wav2vec2 import ModelLoader
+from conversationalnlp.utils import *
+import soundfile as sf
+import os
+
+"""
+run gradio with
+>>python app.py
+"""
+
+audiosavepath = os.getcwd()
+
+pretrained_model = "codenamewei/speech-to-text"
+
+processor = Wav2Vec2Processor.from_pretrained(
+    pretrained_model)
+
+model = AutoModelForCTC.from_pretrained(
+    pretrained_model)
+
+modelloader = ModelLoader(model, processor)
+
+predictor = Wav2Vec2Predict(modelloader)
+
+examples = ["example1.flac", "example2.flac", "example3.flac"]
+
+
+def greet(audioarray):
+    """
+    audio array in the following format
+
+    (16000, array([ -5277184,    326400,   -120320, ...,  -5970432, -12745216,
+        -6934528], dtype=int32))
+    <class 'tuple'>
+    """
+    audioabspath = os.path.join(audiosavepath, "temp.wav")
+
+    # WORKAROUND: Save to file and reread to get the array shape needed for prediction
+    sf.write(audioabspath, audioarray[1], audioarray[0])
+
+    print(f"Audio at path {audioabspath}")
+    predictiontexts = predictor.predictfiles([audioabspath])
+    outputtext = predictiontexts["predicted_text"][-1] + \
+        "\n" + predictiontexts["corrected_text"][-1]
+
+    return outputtext
+
+
+demo = gr.Interface(fn=greet, inputs="audio",
+                    outputs="text",
+                    title="Speech-to-Text",
+                    examples=examples)
+
+demo.launch() # share=True)
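greet() above receives the recording as a (sample_rate, int32 array) tuple, as its docstring shows, and the WORKAROUND comment marks the trick: write the array to disk with soundfile so predictfiles() can reread it in a shape the predictor can consume. A minimal sketch of that save-and-reread round trip, assuming only numpy and soundfile and no model download ("temp.wav" mirrors the scratch path above):

import numpy as np
import soundfile as sf

# The tuple shape Gradio's "audio" input delivers: (sample_rate, int32 PCM samples).
samplerate = 16000
audioarray = (samplerate, np.zeros(samplerate, dtype=np.int32))  # one second of silence

# Persist exactly as greet() does: sf.write(path, data, samplerate).
sf.write("temp.wav", audioarray[1], audioarray[0])

# Rereading yields float64 samples in [-1.0, 1.0] with shape (frames,) for mono audio.
data, samplerate_read = sf.read("temp.wav")
print(samplerate_read, data.dtype, data.shape)  # 16000 float64 (16000,)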
app.py
CHANGED
@@ -8,11 +8,13 @@ import soundfile as sf
 import os
 
 """
-run gradio with
+run gradio with
 >>python app.py
 """
 
-audiosavepath = os.getcwd()
+audioheaderpath = os.path.join(
+    os.getcwd(), "temp")
+
 
 pretrained_model = "codenamewei/speech-to-text"
 
@@ -26,33 +28,46 @@ modelloader = ModelLoader(model, processor)
 
 predictor = Wav2Vec2Predict(modelloader)
 
-examples = ["example1.flac", "example2.flac", "example3.flac"]
+audiofileexamples = ["example1.flac", "example2.flac"]
 
+fileextension = ".wav"
+
 
-def greet(audioarray):
+def greet(*args):
     """
-    audio array in the following format
-
-    (16000, array([ -5277184,    326400,   -120320, ...,  -5970432, -12745216,
-        -6934528], dtype=int32))
-    <class 'tuple'>
+    List[tuple, tuple]
+    mic: param[0] (int, np.array)
+    audiofile: param[1] (int, np.array)
     """
-    audioabspath = os.path.join(audiosavepath, "temp.wav")
 
-    # WORKAROUND: Save to file and reread to get the array shape needed for prediction
-    sf.write(audioabspath, audioarray[1], audioarray[0])
+    dictinput = dict(mic=args[0], file=args[1])
+    audiofiles = []
 
-    print(f"Audio at path {audioabspath}")
-    predictiontexts = predictor.predictfiles([audioabspath])
-    outputtext = predictiontexts["predicted_text"][-1] + \
-        "\n" + predictiontexts["corrected_text"][-1]
+    for key, audioarray in dictinput.items():
+
+        if audioarray is not None:
+            # WORKAROUND: Save to file and reread to get the array shape needed for prediction
+
+            audioabspath = audioheaderpath + "_" + key + fileextension
+            print(f"Audio at path {audioabspath}")
+            sf.write(audioabspath,
+                     audioarray[1], audioarray[0])
+            audiofiles.append(audioabspath)
 
-    return outputtext
+    predictiontexts = predictor.predictfiles(audiofiles)
+
+    mictext = predictiontexts["predicted_text"][0] + "\n" + \
+        predictiontexts["corrected_text"][0] if dictinput['mic'] is not None else ""
+    filetext = predictiontexts["predicted_text"][-1] + "\n" + \
+        predictiontexts["corrected_text"][-1] if dictinput['file'] is not None else ""
+
+    return [mictext, filetext]
 
 
-demo = gr.Interface(fn=greet, inputs="audio",
-                    outputs="text",
+demo = gr.Interface(fn=greet,
+                    inputs=["mic", "audio"],
+                    outputs=["text", "text"],
                     title="Speech-to-Text",
-                    examples=examples)
+                    examples=[audiofileexamples])
 
 demo.launch() # share=True)
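Two details in the rewritten greet() are easy to misread. First, predictfiles() is called once on the queued list, with the mic recording (if any) first and the uploaded file (if any) last, so index 0 and index -1 pick out the two transcripts; when only one input is present, 0 and -1 refer to the same single result and the guards blank out the absent side. Second, those guards lean on Python's operator precedence: + binds tighter than the conditional expression, so the whole two-line concatenation forms the if-branch. A standalone illustration with dummy strings, no model involved:

# Parsed as (predicted + "\n" + corrected) if mic is not None else ""
predicted, corrected = "HELO WORLD", "hello world"

mic = None
mictext = predicted + "\n" + corrected if mic is not None else ""
assert mictext == ""  # entire concatenation skipped, not just the final term

mic = (16000, [0, 0, 0])  # stand-in for a real (sample_rate, array) tuple
mictext = predicted + "\n" + corrected if mic is not None else ""
assert mictext == "HELO WORLD\nhello world"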
example3.flac
DELETED
Binary file (239 kB)
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
 gradio==3.0.5
-conversationalnlp==0.0.
+conversationalnlp==0.0.4
 transformers==4.20.1
 SoundFile==0.10.3post1
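The pin is bumped to conversationalnlp==0.0.4, the package supplying the ModelLoader and Wav2Vec2Predict imports used in app.py. A quick standard-library sanity check that the pinned distributions resolve at runtime (expected versions copied from requirements.txt above):

from importlib.metadata import PackageNotFoundError, version

for package, expected in [("gradio", "3.0.5"),
                          ("conversationalnlp", "0.0.4"),
                          ("transformers", "4.20.1"),
                          ("SoundFile", "0.10.3post1")]:
    try:
        print(f"{package}: installed {version(package)}, pinned {expected}")
    except PackageNotFoundError:
        print(f"{package}: not installed (pinned {expected})")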
temp_file.wav
ADDED
Binary file (93.2 kB)
temp_mic.wav
ADDED
Binary file (300 kB)