codenamewei committed on
Commit d675859
1 Parent(s): bbe5135

Input with mic and file

.ipynb_checkpoints/app-checkpoint.py ADDED
@@ -0,0 +1,58 @@
+import gradio as gr
+from transformers import Wav2Vec2Processor
+from transformers import AutoModelForCTC
+from conversationalnlp.models.wav2vec2 import Wav2Vec2Predict
+from conversationalnlp.models.wav2vec2 import ModelLoader
+from conversationalnlp.utils import *
+import soundfile as sf
+import os
+
+"""
+run gradio with
+>>python app.py
+"""
+
+audiosavepath = os.getcwd()
+
+pretrained_model = "codenamewei/speech-to-text"
+
+processor = Wav2Vec2Processor.from_pretrained(
+    pretrained_model)
+
+model = AutoModelForCTC.from_pretrained(
+    pretrained_model)
+
+modelloader = ModelLoader(model, processor)
+
+predictor = Wav2Vec2Predict(modelloader)
+
+examples = ["example1.flac", "example2.flac", "example3.flac"]
+
+
+def greet(audioarray):
+    """
+    audio array in the following format
+
+    (16000, array([ -5277184, 326400, -120320, ..., -5970432, -12745216,
+       -6934528], dtype=int32))
+    <class 'tuple'>
+    """
+    audioabspath = os.path.join(audiosavepath, "temp.wav")
+
+    # WORKAROUND: Save to file and reread to get the array shape needed for prediction
+    sf.write(audioabspath, audioarray[1], audioarray[0])
+
+    print(f"Audio at path {audioabspath}")
+    predictiontexts = predictor.predictfiles([audioabspath])
+    outputtext = predictiontexts["predicted_text"][-1] + \
+        "\n" + predictiontexts["corrected_text"][-1]
+
+    return outputtext
+
+
+demo = gr.Interface(fn=greet, inputs="audio",
+                    outputs="text",
+                    title="Speech-to-Text",
+                    examples=examples)
+
+demo.launch() # share=True)
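
The WORKAROUND comment above is the key step in this single-input version of greet(): Gradio's audio component hands the callback a (sample_rate, numpy array) tuple, but Wav2Vec2Predict.predictfiles expects file paths, so the samples are first written to disk with soundfile. A minimal, self-contained sketch of that round trip (illustration only, not part of the commit; the silent test array and the temp.wav path are placeholders):

import os

import numpy as np
import soundfile as sf

# Gradio's "audio" input arrives as (sample_rate, samples); one second of silence stands in here.
samplerate = 16000
audioarray = (samplerate, np.zeros(samplerate, dtype=np.int32))

# Write the in-memory samples to a WAV file so a file-based recognizer can consume them.
audioabspath = os.path.join(os.getcwd(), "temp.wav")
sf.write(audioabspath, audioarray[1], audioarray[0])  # (path, data, samplerate)

# The saved file can then be passed on, e.g. predictor.predictfiles([audioabspath])
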
app.py CHANGED
@@ -8,11 +8,13 @@ import soundfile as sf
 import os
 
 """
-run gradio with
+run gradio with
 >>python app.py
 """
 
-audiosavepath = os.getcwd()
+audioheaderpath = os.path.join(
+    os.getcwd(), "temp")
+
 
 pretrained_model = "codenamewei/speech-to-text"
 
@@ -26,33 +28,46 @@ modelloader = ModelLoader(model, processor)
 
 predictor = Wav2Vec2Predict(modelloader)
 
-examples = ["example1.flac", "example2.flac", "example3.flac"]
+audiofileexamples = ["example1.flac", "example2.flac"]
 
+fileextension = ".wav"
 
-def greet(audioarray):
-    """
-    audio array in the following format
 
-    (16000, array([ -5277184, 326400, -120320, ..., -5970432, -12745216,
-       -6934528], dtype=int32))
-    <class 'tuple'>
+def greet(*args):
     """
-    audioabspath = os.path.join(audiosavepath, "temp.wav")
+    List[tuple, tuple]
+    mic: param[0] (int, np.array)
+    audiofile: param[1] (int, np.array)
+    """
+
+    dictinput = dict(mic=args[0], file=args[1])
+    audiofiles = []
+
+    for key, audioarray in dictinput.items():
+
+        if audioarray is not None:
+            # WORKAROUND: Save to file and reread to get the array shape needed for prediction
+
+            audioabspath = audioheaderpath + "_" + key + fileextension
+            print(f"Audio at path {audioabspath}")
+            sf.write(audioabspath,
+                     audioarray[1], audioarray[0])
+            audiofiles.append(audioabspath)
 
-    # WORKAROUND: Save to file and reread to get the array shape needed for prediction
-    sf.write(audioabspath, audioarray[1], audioarray[0])
+    predictiontexts = predictor.predictfiles(audiofiles)
 
-    print(f"Audio at path {audioabspath}")
-    predictiontexts = predictor.predictfiles([audioabspath])
-    outputtext = predictiontexts["predicted_text"][-1] + \
-        "\n" + predictiontexts["corrected_text"][-1]
+    mictext = predictiontexts["predicted_text"][0] + "\n" + \
+        predictiontexts["corrected_text"][0] if dictinput['mic'] is not None else ""
+    filetext = predictiontexts["predicted_text"][-1] + "\n" + \
+        predictiontexts["corrected_text"][-1] if dictinput['file'] is not None else ""
 
-    return outputtext
+    return [mictext, filetext]
 
 
-demo = gr.Interface(fn=greet, inputs="audio",
-                    outputs="text",
+demo = gr.Interface(fn=greet,
+                    inputs=["mic", "audio"],
+                    outputs=["text", "text"],
                     title="Speech-to-Text",
-                    examples=examples)
+                    examples=[audiofileexamples])
 
 demo.launch() # share=True)
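
With this change greet() accepts both a microphone recording and an uploaded audio file and returns one transcript per input, leaving an output blank when its input is absent. A rough sketch of the same dispatch logic against a stand-in predictor (DummyPredictor and greet_sketch are illustrative helpers, not part of conversationalnlp or of this commit), handy for checking the None-handling without loading the model:

from typing import List, Optional, Tuple

import numpy as np

AudioInput = Optional[Tuple[int, np.ndarray]]  # (sample_rate, samples), as delivered by Gradio


class DummyPredictor:
    """Stand-in for Wav2Vec2Predict: returns canned text per input file."""

    def predictfiles(self, paths: List[str]) -> dict:
        return {
            "predicted_text": [f"predicted<{p}>" for p in paths],
            "corrected_text": [f"corrected<{p}>" for p in paths],
        }


def greet_sketch(mic: AudioInput, file: AudioInput, predictor=DummyPredictor()):
    dictinput = dict(mic=mic, file=file)
    # Pretend each present input was already saved to "<key>.wav", as app.py does.
    audiofiles = [key + ".wav" for key, arr in dictinput.items() if arr is not None]
    texts = predictor.predictfiles(audiofiles) if audiofiles else {
        "predicted_text": [], "corrected_text": []}
    mictext = texts["predicted_text"][0] + "\n" + \
        texts["corrected_text"][0] if mic is not None else ""
    filetext = texts["predicted_text"][-1] + "\n" + \
        texts["corrected_text"][-1] if file is not None else ""
    return [mictext, filetext]


# Only the file input is populated, so the mic transcript stays empty:
silence = (16000, np.zeros(16000, dtype=np.int32))
print(greet_sketch(mic=None, file=silence))
# ['', 'predicted<file.wav>\ncorrected<file.wav>']
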
example3.flac DELETED
Binary file (239 kB)
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
 gradio==3.0.5
-conversationalnlp==0.0.3post4
+conversationalnlp==0.0.4
 transformers==4.20.1
 SoundFile==0.10.3post1
temp_file.wav ADDED
Binary file (93.2 kB)
 
temp_mic.wav ADDED
Binary file (300 kB)