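# NOTE: the following page text appears to be a hosting-status banner captured
# together with this file; it is not part of the program.
#   Spaces:
#   Runtime error
#   Runtime error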
import gradio as gr | |
from transformers import pipeline | |
from pydub import AudioSegment | |
import os | |
import speech_recognition as sr | |
# HTML/CSS/JS template for the interactive transcript player.
#
# The final page is assembled as
#     html_seeker + <JSON result> + html_seeker2
# so ``html_seeker`` must end exactly at ``var INLINE_JSON=`` and
# ``html_seeker2`` must begin with the ``;`` that terminates that statement.
#
# The embedded script reads ``INLINE_JSON['text']`` and
# ``INLINE_JSON['chunks']``; every chunk needs a ``text`` and a two-element
# ``timestamp`` (indices 0 and 1), which are compared against the audio
# element's ``currentTime`` to highlight the word being played.  Clicking a
# word seeks the audio to that word's first timestamp.
#
# NOTE(review): the <img src="" onload=...> element re-creates the <script>
# element from its own attributes and text, apparently so that the script runs
# after the markup has been injected as HTML.  Whether ``onload`` fires for an
# empty ``src`` is browser-dependent, and the handler still contains a debug
# ``alert('test')`` — confirm before relying on it.
html_seeker = '''<style>
html, body {
    margin: 0;
    padding: 0;
    min-width: 900px;
}
#header {
    /*position: fixed;*/
    top: 0;
    left: 0;
    height: 50px;
    min-width: 900px;
    line-height: 50px;
    width: 100%;
    background-color: #999;
    box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
    font-family: Helvetica, sans-serif;
}
#header, #header a {
    color: white;
}
.home {
    margin: 0;
    font-weight: bold;
    text-transform: lowercase;
    width: 100px;
}
h4.home {
    margin: 0;
    background: #666;
    padding-left: 25px;
    padding-right: 30px;
    margin-right: 20px;
    float: left;
    text-decoration: none;
}
.home:hover a {
    background: #555;
}
#audio {
    margin-top: 9px;
    width: 500px;
    display: inline-block;
}
#transcript {
    margin: 0 15px;
    margin-bottom: 5em;
    white-space: pre-wrap;
    line-height: 2em;
    max-width: 600px;
    color: #999;
    clear: both;
    margin-top: 75px;
    /*direction: rtl;*/
}
.success {
    color: black;
}
.success:hover {
    text-decoration: underline;
}
.active {
    color: magenta;
    background-color: yellow;
}
#preloader {
    visibility: hidden;
}
</style><div id="header">
<h4 class="home" onload="console.log('loaded00000000000000002')" >Model name</h4>
<audio id="audio" src="17.mp3" controls="true" ></audio>
</div>
</div>
<div id="transcript" dir="auto"></div>
<img src="" onload="alert('test');
var oldScript = document.querySelector('script#huihiuh6');
var newScript = document.createElement('script');
Array.from(oldScript.attributes)
.forEach( attr => newScript.setAttribute(attr.name, attr.value) );
newScript.appendChild(document.createTextNode(oldScript.innerHTML));
oldScript.parentNode.replaceChild(newScript, oldScript);
">
<script id="huihiuh6">
function myFunction543rr(){
    console.log('loaded00000000000000002');
}
var $a = document.getElementById("audio");
$a.src=document.querySelector('audio').src;
console.log($a);
window.onkeydown = function(ev) {
    if(ev.keyCode == 32) {
        ev.preventDefault();
        $a.pause();
    }
}
var $trans = document.getElementById("transcript");
var wds = [];
var cur_wd;
function highlight_word() {
    var t = $a.currentTime;
    // XXX: O(N); use binary search
    var hits = wds.filter(function(x) {
        return (t - x['timestamp']['0']) > 0.01 && (x['timestamp']['1'] - t) > 0.01;
    }, wds);
    var next_wd = hits[hits.length - 1];
    if(cur_wd != next_wd) {
        var active = document.querySelectorAll('.active');
        for(var i = 0; i < active.length; i++) {
            active[i].classList.remove('active');
        }
        if(next_wd && next_wd.$div) {
            next_wd.$div.classList.add('active');
            //render_phones(next_wd);
        }
    }
    cur_wd = next_wd;
    //highlight_phone(t);
    window.requestAnimationFrame(highlight_word);
}
window.requestAnimationFrame(highlight_word);
$trans.innerHTML = "Loading...";
function render(ret) {
    wds = ret['chunks'] || [];
    transcript = ret['text'];
    $trans.innerHTML = '';
    var currentOffset = 0;
    wds.forEach(function(wd) {
        var $wd = document.createElement('span');
        var txt = wd['text'];
        var $wdText = document.createTextNode(txt);
        $wd.appendChild($wdText);
        wd.$div = $wd;
        $wd.className = 'success';
        $wd.onclick = function() {
            console.log(wd['timestamp']['0']);
            $a.currentTime = wd['timestamp']['0'];
            $a.play();
        };
        $trans.appendChild($wd);
        $trans.appendChild(document.createTextNode(' '));
    });
}
function update() {
    if(INLINE_JSON) {
        // We want this to work from file:/// domains, so we provide a
        // mechanism for inlining the alignment data.
        render(INLINE_JSON);
    }
}
var INLINE_JSON='''
html_seeker2 = ''';
update();
</script>'''
# Speech-recognition pipelines (currently DISABLED).
#
# The assignments below are kept as comments rather than inside a bare string
# literal, so it is obvious that ``model0``, ``model1`` and ``model2`` are not
# defined at import time and that no model is loaded when the module starts.
# Uncomment them to load the models.
#
# model_name = "voidful/wav2vec2-xlsr-multilingual-56"
# model0 = pipeline(task="automatic-speech-recognition",
#                   model=model_name)
# model_name = "SLPL/Sharif-wav2vec2"
# model2 = pipeline(task="automatic-speech-recognition",
#                   model=model_name)
# model_name = "ghofrani/common8"
# model1 = pipeline(task="automatic-speech-recognition",
#                   model=model_name)
import json | |
def predict_fa(speech, model):
    """Return the three outputs of the Persian-models transcription handler.

    Real inference is currently disabled: ``speech`` and ``model`` are
    accepted so the signature matches the two inputs wired to this handler,
    but both are ignored and a fixed sample result is returned.

    Args:
        speech: Audio input value (unused while inference is disabled).
        model: Selected model identifier (unused while inference is disabled).

    Returns:
        A three-item list: the plain transcript, the whole result serialised
        as JSON, and the HTML player page built by placing that JSON between
        ``html_seeker`` and ``html_seeker2``.
    """
    # Disabled dispatch to the real pipelines, kept for reference.  It is a
    # comment (not a string literal) so it does not become this function's
    # ``__doc__``.
    #
    # if model == "SLPL/Sharif-wav2vec2":
    #     text = model2(speech, return_timestamps="word")
    # elif model == "ghofrani/common8":
    #     text = model1(speech, return_timestamps="word")
    # elif model == "voidful/wav2vec2-xlsr-multilingual-56":
    #     text = model0(speech, return_timestamps="word")

    # Fixed sample result: a transcript plus per-word chunks with timestamps.
    text = {
        "text": "\u0627\u06cc\u0646\u0627\u0646 \u06a9\u0631\u0627\u0644\u0627\u0644 \u0648 \u06a9\u0648\u0631\u0646\u062f \u0648 \u0644\u0632\u0627 \u0627\u0632 \u06af\u0645\u0631\u0627\u0647\u06cc \u0628\u0647 \u0631\u0627\u0647 \u0628\u0627\u0632 \u0646\u0645\u06cc\u06a9\u0631\u062f\u0646\u062f",
        "chunks": [
            {"text": "\u0627\u06cc\u0646\u0627\u0646", "timestamp": [0.0, 0.72]},
            {"text": "\u06a9\u0631\u0627\u0644\u0627\u0644", "timestamp": [0.92, 1.6]},
            {"text": "\u0648", "timestamp": [1.72, 1.74]},
            {"text": "\u06a9\u0648\u0631\u0646\u062f", "timestamp": [1.9, 2.54]},
            {"text": "\u0648", "timestamp": [2.76, 2.78]},
            {"text": "\u0644\u0632\u0627", "timestamp": [2.88, 3.16]},
            {"text": "\u0627\u0632", "timestamp": [3.4, 3.5]},
            {"text": "\u06af\u0645\u0631\u0627\u0647\u06cc", "timestamp": [3.64, 4.3]},
            {"text": "\u0628\u0647", "timestamp": [4.6, 4.68]},
            {"text": "\u0631\u0627\u0647", "timestamp": [4.78, 5.12]},
            {"text": "\u0628\u0627\u0632", "timestamp": [5.3, 5.58]},
            {"text": "\u0646\u0645\u06cc\u06a9\u0631\u062f\u0646\u062f", "timestamp": [5.68, 7.14]},
        ],
    }

    # Serialise once and reuse the same payload for the textbox and the page.
    payload = json.dumps(text)
    return [text["text"], payload, html_seeker + payload + html_seeker2]
def convert_to_wav(filename):
    """Export ``filename`` as a WAV file and return the path that was written.

    The input format is taken from the file extension.  The output path is
    the input path with its extension replaced by ``.wav``; while that path
    already exists, ``(1)`` is appended to the stem until an unused name is
    found (so an input that is already ``.wav`` is written to a new file).

    Args:
        filename: Path of the audio file to convert.

    Returns:
        Path of the newly exported WAV file.
    """
    stem, extension = os.path.splitext(filename)

    # The format name is the extension with its dot removed.
    segment = AudioSegment.from_file(filename, format=extension.replace(".", ""))

    target = stem + ".wav"
    while os.path.exists(target):
        # Never overwrite an existing file: keep extending the stem.
        target = os.path.splitext(target)[0] + "(1)" + ".wav"

    segment.export(target, format="wav")
    print(f"Converting {filename} to {target}...")
    return target
def g_rec(audio_File, language):
    """Transcribe an audio file with ``Recognizer.recognize_google``.

    Every failure is reported through the returned string instead of being
    raised, including a missing upload and a file that cannot be opened or
    read, so the caller always receives displayable text.

    Args:
        audio_File: Path of the audio file to transcribe.  ``None`` or an
            empty string is treated as "nothing uploaded".
        language: Language code passed to ``recognize_google``.

    Returns:
        ``"Text: <transcript>"`` on success, otherwise
        ``"Exception: <reason>"``.
    """
    # Guard first: without a path there is nothing to open.
    if not audio_File:
        return "Exception: no audio file was provided"

    print(audio_File)
    # NOTE: conversion of non-WAV uploads is disabled:
    # if not os.path.splitext(audio_File)[1] == ".wav":
    #     audio_File = convert_to_wav(audio_File)
    try:
        recognizer = sr.Recognizer()
        # Opening/reading the file is inside the ``try`` so that unreadable
        # or unsupported files are reported the same way as API failures.
        with sr.AudioFile(audio_File) as source:
            audio = recognizer.record(source)
        transcript = recognizer.recognize_google(audio, language=language)
    except Exception as exc:  # deliberate catch-all at the UI boundary
        # Some exceptions carry no message; fall back to the class name so
        # the user never sees a bare "Exception: ".
        return "Exception: " + (str(exc) or type(exc).__name__)
    return "Text: " + transcript
# Export file as .wav | |
#predict(load_file_to_data('audio file path',sampling_rate=16_000)) # beware of the audio file sampling rate | |
#predict_lang_specific(load_file_to_data('audio file path',sampling_rate=16_000),'en') # beware of the audio file sampling rate | |
# Gradio UI with two tabs:
#   * "Persian models" -> predict_fa(audio, model) fills two textboxes and an
#     HTML view.
#   * "google"         -> g_rec(audio, language) fills one textbox.
with gr.Blocks() as demo:
    gr.Markdown("multilingual Speech Recognition")

    with gr.Tab("Persian models"):
        # The deprecated ``optional=True`` flag is dropped.
        inputs_speech_fa = gr.Audio(source="upload", type="filepath",
                                    label="Upload your audio:")
        # ``gr.Radio`` replaces the legacy ``gr.inputs.Radio`` wrapper, and
        # the label now says what the choices are (model ids, not languages).
        inputs_model_fa = gr.Radio(
            label="Model",
            choices=[
                "ghofrani/common8",
                "SLPL/Sharif-wav2vec2",
                "voidful/wav2vec2-xlsr-multilingual-56",
            ],
        )
        output_transcribe1_fa = gr.Textbox(label="Transcribed text:")
        output_transcribe1_fa1 = gr.Textbox(label="Transcribed text with timestamps:")
        output_transcribe1_fa2 = gr.HTML(label="")
        transcribe_audio1_fa = gr.Button("Submit")

    with gr.Tab("google"):
        gr.Markdown("set your speech language")
        # NOTE(review): the selected value is passed verbatim as ``language``
        # to g_rec; entries such as "zh (cmn-Hans-CN)" look like display
        # labels rather than plain codes — confirm they are accepted.
        inputs_speech1 = [
            gr.Audio(source="upload", type="filepath"),
            gr.Dropdown(
                choices=[
                    "af-ZA", "am-ET", "ar-AE", "ar-BH", "ar-DZ", "ar-EG", "ar-IL", "ar-IQ", "ar-JO", "ar-KW",
                    "ar-LB", "ar-MA", "ar-MR", "ar-OM", "ar-PS", "ar-QA", "ar-SA", "ar-TN", "ar-YE", "az-AZ",
                    "bg-BG", "bn-BD", "bn-IN", "bs-BA", "ca-ES", "cs-CZ", "da-DK", "de-AT", "de-CH", "de-DE",
                    "el-GR", "en-AU", "en-CA", "en-GB", "en-GH", "en-HK", "en-IE", "en-IN", "en-KE", "en-NG",
                    "en-NZ", "en-PH", "en-PK", "en-SG", "en-TZ", "en-US", "en-ZA", "es-AR", "es-BO", "es-CL",
                    "es-CO", "es-CR", "es-DO", "es-EC", "es-ES", "es-GT", "es-HN", "es-MX", "es-NI", "es-PA",
                    "es-PE", "es-PR", "es-PY", "es-SV", "es-US", "es-UY", "es-VE", "et-EE", "eu-ES", "fa-IR",
                    "fi-FI", "fil-PH", "fr-BE", "fr-CA", "fr-CH", "fr-FR", "gl-ES", "gu-IN", "hi-IN", "hr-HR",
                    "hu-HU", "hy-AM", "id-ID", "is-IS", "it-CH", "it-IT", "iw-IL", "ja-JP", "jv-ID", "ka-GE",
                    "kk-KZ", "km-KH", "kn-IN", "ko-KR", "lo-LA", "lt-LT", "lv-LV", "mk-MK", "ml-IN", "mn-MN",
                    "mr-IN", "ms-MY", "my-MM", "ne-NP", "nl-BE", "nl-NL", "no-NO", "pa-Guru-IN", "pl-PL", "pt-BR",
                    "pt-PT", "ro-RO", "ru-RU", "si-LK", "sk-SK", "sl-SI", "sq-AL", "sr-RS", "su-ID", "sv-SE",
                    "sw-KE", "sw-TZ", "ta-IN", "ta-LK", "ta-MY", "ta-SG", "te-IN", "th-TH", "tr-TR", "uk-UA",
                    "ur-IN", "ur-PK", "uz-UZ", "vi-VN", "yue-Hant-HK", "zh (cmn-Hans-CN)", "zh-TW (cmn-Hant-TW)",
                    "zu-ZA",
                ],
                value="fa-IR",
                label="language code",
            ),
        ]
        output_transcribe1 = gr.Textbox(label="output")
        transcribe_audio1_go = gr.Button("Submit")

    # Event wiring: one handler per tab's Submit button.
    transcribe_audio1_fa.click(
        fn=predict_fa,
        inputs=[inputs_speech_fa, inputs_model_fa],
        outputs=[output_transcribe1_fa, output_transcribe1_fa1, output_transcribe1_fa2],
    )
    transcribe_audio1_go.click(
        fn=g_rec,
        inputs=inputs_speech1,
        outputs=output_transcribe1,
    )

if __name__ == "__main__":
    demo.launch()