import os
import tempfile

import gradio as gr
import requests
import whisper
from neon_tts_plugin_coqui import CoquiTTS

# Whisper checkpoints: "base" is enough for language detection,
# "medium" gives better transcription and translation quality.
model = whisper.load_model("base")
model_med = whisper.load_model("medium")

# BLOOM is queried through the Hugging Face Inference API; the access
# token is read from the HF_TOKEN environment variable.
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
HF_TOKEN = os.environ["HF_TOKEN"]
headers = {"Authorization": f"Bearer {HF_TOKEN}"}

# Coqui TTS ships voices for a fixed set of languages; list them at startup.
LANGUAGES = list(CoquiTTS.langs.keys())
coquiTTS = CoquiTTS()
print(f"Languages for Coqui are: {LANGUAGES}")


def driver_fun(audio):
    # Full pipeline: speech-to-text -> text generation -> text-to-speech.
    transcribe, translation, lang = whisper_stt(audio)

    # Generate a reply both in the detected language and in English.
    text_generated = lang_model_response(transcribe, lang)
    text_generated_en = lang_model_response(translation, 'en')

    # Speak the reply in Spanish or French when detected, otherwise in English.
    if lang in ['es', 'fr']:
        speech = tts(text_generated, lang)
    else:
        speech = tts(text_generated_en, 'en')
    return transcribe, translation, text_generated, text_generated_en, speech
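
# Quick sanity check without the UI (hypothetical local file path):
# transcribe, translation, gen, gen_en, wav_path = driver_fun("sample.wav")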


def whisper_stt(audio):
    print("Inside Whisper STT")

    # Load the recording and pad/trim it to Whisper's 30-second window.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Compute the log-Mel spectrogram and move it to the model's device.
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language with the faster "base" model.
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)
    print(f"Detected language: {lang}")

    # Transcribe in the detected language and translate to English,
    # both with the "medium" model.
    options_transc = whisper.DecodingOptions(fp16=False, language=lang, task='transcribe')
    options_transl = whisper.DecodingOptions(fp16=False, language='en', task='translate')
    result_transc = whisper.decode(model_med, mel, options_transc)
    result_transl = whisper.decode(model_med, mel, options_transl)

    print(f"Transcript is: {result_transc.text}")
    print(f"Translation is: {result_transl.text}")

    return result_transc.text, result_transl.text, lang
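
# Illustrative return value (hypothetical; depends on the recording):
# whisper_stt("clip.wav") -> ("¿Cómo estás?", "How are you?", "es")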


def lang_model_response(prompt, language):
    print(f"Inside lang_model_response - Prompt is: {prompt}")
    # One-shot prompts that prime BLOOM to answer in each supported language.
    p_en = """Question: How are you doing today?
Answer: I am doing good, thanks.
Question: """
    p_es = """Pregunta: Cómo estás hoy?
Responder: Estoy bien, gracias.
Pregunta: """
    p_fr = """Question: Comment vas-tu aujourd'hui?
Réponse: Je vais bien, merci.
Question: """

    # Fall back to a default question when the transcript is empty.
    if len(prompt) == 0:
        prompt = """Question: Can you help me please?
Answer: Sure, I am here for you.
Question: """

    # Prompts exist only for English, Spanish, and French; default any other
    # detected language to English so `solution` below is always assigned.
    if language not in ['en', 'es', 'fr']:
        language = 'en'

    if language == 'en':
        prompt = p_en + prompt + "\n" + "Answer: "
    elif language == 'es':
        prompt = p_es + prompt + "\n" + "Responder: "
    elif language == 'fr':
        prompt = p_fr + prompt + "\n" + "Réponse: "

    json_ = {
        "inputs": prompt,
        "parameters": {
            "top_p": 0.90,
            "max_new_tokens": 64,
            "temperature": 1.1,
            "return_full_text": False,
            "do_sample": True,
        },
        "options": {
            "use_cache": True,
            "wait_for_model": True,
        },
    }
    response = requests.post(API_URL, headers=headers, json=json_)

    # The API responds with a list of generations: output[0]['generated_text'].
    output = response.json()
    output_tmp = output[0]['generated_text']
    print(f"Bloom API Response is: {output_tmp}")

    # The prompt contains the answer marker twice (the one-shot example plus
    # the trailing "Answer: "), so when the prompt is echoed back the model's
    # new reply is the text after the second marker, up to the next newline.
    if language == 'en':
        solution = output_tmp.split("Answer: ")[2].split("\n")[0]
    elif language == 'es':
        solution = output_tmp.split("Responder: ")[2].split("\n")[0]
    elif language == 'fr':
        solution = output_tmp.split("Réponse: ")[2].split("\n")[0]

    print(f"Final Bloom Response after splits is: {solution}")
    return solution
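
# Example call (hypothetical transcript): lang_model_response("What is AI?", "en")
# would return the single line BLOOM generates after the final "Answer: " marker.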


def tts(text, language):
    print(f"Inside tts - language is: {language}")
    # Languages supported by the Coqui TTS plugin; anything else falls back to English.
    coqui_langs = ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
    if language not in coqui_langs:
        language = 'en'
    # Synthesize into a temporary WAV file and hand its path to Gradio.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(text, fp, speaker={"language": language})
    return fp.name
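
# Example usage (hypothetical text): wav_path = tts("Bonjour tout le monde", "fr")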


demo = gr.Blocks()

with demo:
    gr.Markdown("<h1><center>Talk to Your Multilingual AI Assistant</center></h1>")
    gr.Markdown(
        """Model pipeline consisting of - <br>- [**Whisper**](https://github.com/openai/whisper) for Speech-to-Text, <br>- [**Bloom**](https://huggingface.co/bigscience/bloom) for Text Generation, and <br>- [**CoquiTTS**](https://huggingface.co/coqui) for Text-to-Speech. <br><br> The front end is built using the [**Gradio Blocks API**](https://gradio.app/docs/#blocks).<br>All three models are multilingual; however, only three languages overlap among them - Spanish (es), French (fr), and English (en) - so testing in these languages is suggested for the best results. If the voice input is in English, both textboxes on the left-hand side will show the same transcript. If the input is in _Spanish_ or _French_, the first textbox will show the original-language transcript and the second its English translation.
        """)
    with gr.Row():
        with gr.Column():
            in_audio = gr.Audio(source="microphone", type="filepath", label='Record your voice here in English, Spanish or French for best results -')
            b1 = gr.Button("AI response pipeline (Whisper - Bloom - Coqui pipeline)")
            out_transcript = gr.Textbox(label='English/Spanish/French transcript of your audio using OpenAI Whisper')
            out_translation_en = gr.Textbox(label='English translation of your audio using OpenAI Whisper')
        with gr.Column():
            out_audio = gr.Audio(label='AI response in audio form in your preferred language')
            out_generated_text = gr.Textbox(label='AI response to your query in your preferred language using Bloom!')
            out_generated_text_en = gr.Textbox(label='AI response to your query in English using Bloom!')

    # Wire the button to the full Whisper -> Bloom -> Coqui pipeline.
    b1.click(driver_fun, inputs=[in_audio], outputs=[out_transcript, out_translation_en, out_generated_text, out_generated_text_en, out_audio])

demo.launch(enable_queue=True, debug=True)