Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -1,134 +1,86 @@
--- app.py (before)
 import gradio as gr
-import
-from
-
-
-    Wav2Vec2Processor,
-    AutoTokenizer,
-    AutoModelWithLMHead
-)
-import torch
-import re
-import sys
-import soundfile as sf
-
-
-
 
 model_name = "voidful/wav2vec2-xlsr-multilingual-56"
-
-
-
-
-
-
-
-
-
-
-
-
-def
-
-
-    if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    for logit in logits:
-        pred_ids = torch.argmax(logit, dim=-1)
-        mask = pred_ids.ge(1).unsqueeze(-1).expand(logit.size())
-        vocab_size = logit.size()[-1]
-        voice_prob = torch.nn.functional.softmax((torch.masked_select(logit, mask).view(-1,vocab_size)),dim=-1)
-        comb_pred_ids = torch.argmax(voice_prob, dim=-1)
-        decoded_results.append(processor.decode(comb_pred_ids))
-
-    return decoded_results
-
-def predict_lang_specific(data,lang_code):
-    data=load_file_to_data(data,sampling_rate=16_000)
-    features = processor(data["speech"], sampling_rate=data["sampling_rate"], padding=True, return_tensors="pt")
-    input_values = features.input_values.to(device)
-    attention_mask = features.attention_mask.to(device)
-    with torch.no_grad():
-        logits = model(input_values, attention_mask=attention_mask).logits
-    decoded_results = []
-    for logit in logits:
-        pred_ids = torch.argmax(logit, dim=-1)
-        mask = ~pred_ids.eq(processor.tokenizer.pad_token_id).unsqueeze(-1).expand(logit.size())
-        vocab_size = logit.size()[-1]
-        voice_prob = torch.nn.functional.softmax((torch.masked_select(logit, mask).view(-1,vocab_size)),dim=-1)
-        filtered_input = pred_ids[pred_ids!=processor.tokenizer.pad_token_id].view(1,-1).to(device)
-        if len(filtered_input[0]) == 0:
-            decoded_results.append("")
-        else:
-            lang_mask = torch.empty(voice_prob.shape[-1]).fill_(0)
-            lang_index = torch.tensor(sorted(lang_ids[lang_code]))
-            lang_mask.index_fill_(0, lang_index, 1)
-            lang_mask = lang_mask.to(device)
-            comb_pred_ids = torch.argmax(lang_mask*voice_prob, dim=-1)
-            decoded_results.append(processor.decode(comb_pred_ids))
-
-    return decoded_results
-
-'''def recognition(audio_file):
-    print("audio_file", audio_file.name)
-    speech, rate = sp.load_speech_with_file(audio_file.name)
 
-
-
-
-
-
 #predict(load_file_to_data('audio file path',sampling_rate=16_000)) # beware of the audio file sampling rate
 
 #predict_lang_specific(load_file_to_data('audio file path',sampling_rate=16_000),'en') # beware of the audio file sampling rate
 with gr.Blocks() as demo:
     gr.Markdown("multilingual Speech Recognition")
-
-
-
-
-
-
     gr.Markdown("set your speech language")
     inputs_speech1 =[
         gr.Audio(source="upload", type="filepath"),
-        gr.Dropdown(choices=["ar","
-        ,value="fa",label="language code")
     ]
     output_transcribe1 = gr.Textbox(label="output")
     transcribe_audio1= gr.Button("Submit")
-    '''with gr.Tab("Auto1"):
-        gr.Markdown("automatically detects your language")
-        inputs_speech2 = gr.Audio(label="Input Audio", type="file")
-        output_transcribe2 = gr.Textbox()
-        transcribe_audio2= gr.Button("Submit")'''
-    transcribe_audio.click(fn=predict,
-                           inputs=inputs_speech,
-                           outputs=output_transcribe)
 
-
                             inputs=inputs_speech1 ,
                             outputs=output_transcribe1 )
-
-    '''transcribe_audio2.click(fn=recognition,
-                               inputs=inputs_speech2 ,
-                               outputs=output_transcribe2 )'''
 
 
 if __name__ == "__main__":
+++ app.py (after)
 import gradio as gr
+from transformers import pipeline
+from pydub import AudioSegment
+import os
+import speech_recognition as sr
 
 model_name = "voidful/wav2vec2-xlsr-multilingual-56"
+model0 = pipeline(task="automatic-speech-recognition",
+                  model=model_name)
+
+
+model_name = "SLPL/Sharif-wav2vec2"
+model = pipeline(task="automatic-speech-recognition",
+                 model=model_name)
+model_name = "ghofrani/common8"
+model1 = pipeline(task="automatic-speech-recognition",
+                  model=model_name)
+
+import json
+def predict_fa(speech, model_name):
+    # Run the ASR pipeline matching the checkpoint selected in the UI and
+    # return the plain transcription plus the word-timestamped result as JSON.
+    if model_name == "SLPL/Sharif-wav2vec2":
+        text = model(speech, return_timestamps="word")
+    elif model_name == "ghofrani/common8":
+        text = model1(speech, return_timestamps="word")
+    elif model_name == "voidful/wav2vec2-xlsr-multilingual-56":
+        text = model0(speech, return_timestamps="word")
+    return [text['text'], json.dumps(text)]
+
+
+def convert_to_wav(filename):
+    # Use the extension (without the dot) as pydub's input-format hint.
+    filenameObj = os.path.splitext(filename)
+    audio = AudioSegment.from_file(filename, format=filenameObj[1].replace(".", ""))
+    new_filename = filenameObj[0] + ".wav"
+    while os.path.exists(new_filename):
+        new_filename = os.path.splitext(new_filename)[0] + "(1)" + ".wav"
+    print(f"Converting {filename} to {new_filename}...")
+    audio.export(new_filename, format="wav")
+    return new_filename
+def g_rec(audio_File, language):
+    r = sr.Recognizer()
 
+    if not os.path.splitext(audio_File)[1] == ".wav":
+        audio_File = convert_to_wav(audio_File)
+    hellow = sr.AudioFile(audio_File)
+    with hellow as source:
+        audio = r.record(source)
+    try:
+        s = r.recognize_google(audio, language=language)
+        res = "Text: " + s
+    except Exception as e:
+        res = "Exception: " + str(e)
+    return res
+# Export file as .wav
+
 #predict(load_file_to_data('audio file path',sampling_rate=16_000)) # beware of the audio file sampling rate
 
 #predict_lang_specific(load_file_to_data('audio file path',sampling_rate=16_000),'en') # beware of the audio file sampling rate
 with gr.Blocks() as demo:
     gr.Markdown("multilingual Speech Recognition")
+
+    with gr.Tab("Persian models"):
+        inputs_speech_fa = gr.Audio(source="upload", type="filepath", optional=True, label="Upload your audio:")
+        inputs_model_fa = gr.Radio(label="Model", choices=["ghofrani/common8","SLPL/Sharif-wav2vec2","voidful/wav2vec2-xlsr-multilingual-56"])
+        output_transcribe1_fa = gr.Textbox(label="Transcribed text:")
+        output_transcribe1_fa1 = gr.Textbox(label="Transcribed text with timestamps:")
+        transcribe_audio1_fa = gr.Button("Submit")
+    with gr.Tab("google"):
         gr.Markdown("set your speech language")
         inputs_speech1 =[
             gr.Audio(source="upload", type="filepath"),
+            gr.Dropdown(choices=["af-ZA","am-ET","ar-AE","ar-BH","ar-DZ","ar-EG","ar-IL","ar-IQ","ar-JO","ar-KW","ar-LB","ar-MA","ar-MR","ar-OM","ar-PS","ar-QA","ar-SA","ar-TN","ar-YE","az-AZ","bg-BG","bn-BD","bn-IN","bs-BA","ca-ES","cs-CZ","da-DK","de-AT","de-CH","de-DE","el-GR","en-AU","en-CA","en-GB","en-GH","en-HK","en-IE","en-IN","en-KE","en-NG","en-NZ","en-PH","en-PK","en-SG","en-TZ","en-US","en-ZA","es-AR","es-BO","es-CL","es-CO","es-CR","es-DO","es-EC","es-ES","es-GT","es-HN","es-MX","es-NI","es-PA","es-PE","es-PR","es-PY","es-SV","es-US","es-UY","es-VE","et-EE","eu-ES","fa-IR","fi-FI","fil-PH","fr-BE","fr-CA","fr-CH","fr-FR","gl-ES","gu-IN","hi-IN","hr-HR","hu-HU","hy-AM","id-ID","is-IS","it-CH","it-IT","iw-IL","ja-JP","jv-ID","ka-GE","kk-KZ","km-KH","kn-IN","ko-KR","lo-LA","lt-LT","lv-LV","mk-MK","ml-IN","mn-MN","mr-IN","ms-MY","my-MM","ne-NP","nl-BE","nl-NL","no-NO","pa-Guru-IN","pl-PL","pt-BR","pt-PT","ro-RO","ru-RU","si-LK","sk-SK","sl-SI","sq-AL","sr-RS","su-ID","sv-SE","sw-KE","sw-TZ","ta-IN","ta-LK","ta-MY","ta-SG","te-IN","th-TH","tr-TR","uk-UA","ur-IN","ur-PK","uz-UZ","vi-VN","yue-Hant-HK","zh (cmn-Hans-CN)","zh-TW (cmn-Hant-TW)","zu-ZA"]
+            ,value="fa-IR",label="language code")
         ]
         output_transcribe1 = gr.Textbox(label="output")
         transcribe_audio1= gr.Button("Submit")
 
+    transcribe_audio1_fa.click(fn=predict_fa,
+                               inputs=[inputs_speech_fa, inputs_model_fa],
+                               outputs=[output_transcribe1_fa, output_transcribe1_fa1])
+
+    transcribe_audio1.click(fn=g_rec,
                             inputs=inputs_speech1 ,
                             outputs=output_transcribe1 )
 
 
 if __name__ == "__main__":
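
Note: a minimal sketch of the model dispatch used by predict_fa above, written as a lookup table instead of an if/elif chain. It assumes only the three checkpoints loaded in this commit and the standard transformers pipeline API; it is an illustration, not the committed code.

import json
from transformers import pipeline

CHECKPOINTS = [
    "SLPL/Sharif-wav2vec2",
    "ghofrani/common8",
    "voidful/wav2vec2-xlsr-multilingual-56",
]
# One ASR pipeline per checkpoint, keyed by its hub id.
ASR = {name: pipeline(task="automatic-speech-recognition", model=name)
       for name in CHECKPOINTS}

def predict_fa(speech, model_name):
    # Unknown ids raise KeyError instead of leaving `text` unbound.
    text = ASR[model_name](speech, return_timestamps="word")
    return [text["text"], json.dumps(text)]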
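Likewise, a self-contained sketch of the Google path (convert_to_wav plus g_rec): pydub converts a non-WAV upload, then SpeechRecognition sends the audio to the Google Web Speech API. The function name and the sample.mp3 path are hypothetical; pydub needs ffmpeg installed, and recognize_google needs network access.

import os
import speech_recognition as sr
from pydub import AudioSegment

def transcribe_with_google(path, language="fa-IR"):
    # sr.AudioFile reads WAV/AIFF/FLAC, so convert anything else first.
    if os.path.splitext(path)[1].lower() != ".wav":
        wav_path = os.path.splitext(path)[0] + ".wav"
        AudioSegment.from_file(path).export(wav_path, format="wav")
        path = wav_path
    r = sr.Recognizer()
    with sr.AudioFile(path) as source:
        audio = r.record(source)  # read the entire file into memory
    try:
        return "Text: " + r.recognize_google(audio, language=language)
    except sr.UnknownValueError:
        return "Exception: speech was unintelligible"
    except sr.RequestError as e:
        return "Exception: " + str(e)

# Example with a hypothetical file:
# print(transcribe_with_google("sample.mp3", language="fa-IR"))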