File size: 7,209 Bytes
e50e0dc 51c1138 e50e0dc 51c1138 e50e0dc abb4bd0 e50e0dc abb4bd0 e50e0dc abb4bd0 e50e0dc abb4bd0 e50e0dc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
import os
import gradio as gr
import whisper
import requests
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
# Languages common to all three multilingual models: English, Chinese, Spanish, and French.
# So it makes sense to test the app primarily on these four.
# Whisper: Speech-to-text
# Two checkpoints are loaded once at import time:
#   - `model` ("base") is used by whisper_stt() for language detection and to
#     pick the device the mel spectrogram is moved to;
#   - `model_med` ("medium") is used by whisper_stt() for the actual decoding.
model = whisper.load_model("base")
model_med = whisper.load_model("medium")
# Languages covered in Whisper - (exhaustive list) :
#"en": "english", "zh": "chinese", "de": "german", "es": "spanish", "ru": "russian",
#"ko": "korean", "fr": "french", "ja": "japanese", "pt": "portuguese", "tr": "turkish",
#"pl": "polish", "ca": "catalan", "nl": "dutch", "ar": "arabic", "sv": "swedish",
#"it": "italian", "id": "indonesian", "hi": "hindi", "fi": "finnish", "vi": "vietnamese",
#"iw": "hebrew", "uk": "ukrainian", "el": "greek", "ms": "malay", "cs": "czech",
#"ro": "romanian", "da": "danish", "hu": "hungarian", "ta": "tamil", "no": "norwegian",
#"th": "thai", "ur": "urdu", "hr": "croatian", "bg": "bulgarian", "lt": "lithuanian",
#"la": "latin", "mi": "maori", "ml": "malayalam", "cy": "welsh", "sk": "slovak",
#"te": "telugu", "fa": "persian", "lv": "latvian", "bn": "bengali", "sr": "serbian",
#"az": "azerbaijani", "sl": "slovenian", "kn": "kannada", "et": "estonian",
#"mk": "macedonian", "br": "breton", "eu": "basque", "is": "icelandic", "hy": "armenian",
#"ne": "nepali", "mn": "mongolian", "bs": "bosnian", "kk": "kazakh", "sq": "albanian",
#"sw": "swahili", "gl": "galician", "mr": "marathi", "pa": "punjabi", "si": "sinhala",
#"km": "khmer", "sn": "shona", "yo": "yoruba", "so": "somali", "af": "afrikaans",
#"oc": "occitan", "ka": "georgian", "be": "belarusian", "tg": "tajik", "sd": "sindhi",
#"gu": "gujarati", "am": "amharic", "yi": "yiddish", "lo": "lao", "uz": "uzbek",
#"fo": "faroese", "ht": "haitian creole", "ps": "pashto", "tk": "turkmen", "nn": "nynorsk",
#"mt": "maltese", "sa": "sanskrit", "lb": "luxembourgish", "my": "myanmar", "bo": "tibetan",
#"tl": "tagalog", "mg": "malagasy", "as": "assamese", "tt": "tatar", "haw": "hawaiian",
#"ln": "lingala", "ha": "hausa", "ba": "bashkir", "jw": "javanese", "su": "sundanese",
# LLM : Bloom as inference
# Endpoint that lang_model_response() POSTs its prompts to.
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
# The access token must be provided through the HF_TOKEN environment variable;
# the subscript lookup raises KeyError at import time when it is not set.
HF_TOKEN = os.environ["HF_TOKEN"]
# Bearer-token header sent with every request to API_URL.
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
# Main Languages covered in Bloom are (not exhaustive list):
# English, Chinese, French, Spanish, Portuguese, Arabic, Hindi, Vietnamese, Indonesian, Bengali, Tamil, Telugu
# Text-to-Speech
# Language codes exposed by the Coqui plugin (the keys of CoquiTTS.langs).
LANGUAGES = list(CoquiTTS.langs.keys())
# Single synthesizer instance created at import time and reused by tts().
coquiTTS = CoquiTTS()
# Logged once at start-up so the supported codes are visible in the app logs.
print(f"Languages for Coqui are: {LANGUAGES}")
#Languages for Coqui are: ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'el', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
# en - English, es - Spanish, fr - French, de - German, pl - Polish,
# uk - Ukrainian, ro - Romanian, hu - Hungarian, el - Greek, bg - Bulgarian,
# nl - Dutch, fi - Finnish, sl - Slovenian, lv - Latvian, ga - Irish
# Driver function
def driver_fun(audio):
    """Run the speech -> text -> Bloom -> speech pipeline for one recording.

    Args:
        audio: Path of the recorded audio file. ``None`` or an empty string
            means there is nothing to process.

    Returns:
        A 5-tuple ``(transcribe, translation, text_generated,
        text_generated_en, speech)``:

        * ``transcribe`` -- transcript returned by ``whisper_stt``.
        * ``translation`` -- translation returned by ``whisper_stt``.
        * ``text_generated`` -- Bloom reply to the transcript, prompted in
          the detected language.
        * ``text_generated_en`` -- Bloom reply to the translation, prompted
          as English.
        * ``speech`` -- path of the audio file produced by ``tts``.

        When ``audio`` is empty, four empty strings and ``None`` are returned
        and no model is invoked.
    """
    # The interface runs with live=True, so the callback can be invoked with
    # no recording (e.g. after the input is cleared). Bail out early instead
    # of letting whisper.load_audio fail on a missing path.
    if not audio:
        return "", "", "", "", None

    transcribe, translation, lang = whisper_stt(audio)

    # Bloom is queried twice: once with the transcript in the detected
    # language and once with the translation using the English template.
    text_generated = lang_model_response(transcribe, lang)
    text_generated_en = lang_model_response(translation, 'en')

    # Only Spanish and French replies are spoken in their own language;
    # every other detected language falls back to the English reply.
    if lang in ('es', 'fr'):
        speech = tts(text_generated, lang)
    else:
        speech = tts(text_generated_en, 'en')
    return transcribe, translation, text_generated, text_generated_en, speech
# Whisper - speech-to-text
def whisper_stt(audio):
    """Detect the spoken language of ``audio`` and decode it twice with Whisper.

    Args:
        audio: Path of the audio file to process.

    Returns:
        A 3-tuple ``(transcript, translation, lang)``: the text decoded with
        ``task='transcribe'``, the text decoded with ``task='translate'``,
        and the detected language code.
    """
    print("Inside Whisper TTS")

    # Load the waveform and pad/trim it to Whisper's 30-second window, then
    # build the log-Mel spectrogram on the same device as the base model.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio))
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    # Language detection runs on the base model; the most probable code wins.
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # Decode with the medium model, first as a transcript in the detected
    # language, then as a translation.
    # NOTE(review): the translate pass is given language='en' rather than the
    # detected language -- confirm this is intended.
    decoded = {}
    for task_name, task_lang in (('transcribe', lang), ('translate', 'en')):
        task_options = whisper.DecodingOptions(fp16 = False, language=task_lang, task=task_name)
        decoded[task_name] = whisper.decode(model_med, mel, task_options)

    transcript = decoded['transcribe'].text
    translated = decoded['translate'].text
    print(f"transcript is : {transcript}")
    print(f"translation is : {translated}")
    return transcript, translated, lang
# LLM - Bloom Response
def _extract_reply(generated_text, language):
    """Pull the reply out of the raw text returned by the Bloom API."""
    if language == 'en':
        parts = generated_text.split("Answer: ")
        if len(parts) > 2:
            # The English template contributes two "Answer: " markers (the
            # worked example and the trailing cue), so when the prompt is
            # returned along with the continuation, index 2 holds the reply.
            return parts[2].split("\n")[0]
        # Fewer markers than expected: treat the text as the bare
        # continuation and keep its first line instead of raising IndexError.
        return generated_text.lstrip().split("\n")[0]

    # NOTE(review): non-English replies keep the text between the first and
    # second period; with the es/fr templates echoed back this span also
    # contains the question -- confirm whether that is intended.
    parts = generated_text.split(".")
    if len(parts) > 1:
        return parts[1]
    # No period at all: return the text as-is rather than raising IndexError.
    return parts[0]


def lang_model_response(prompt, language):
    """Send ``prompt`` to the hosted Bloom model and return the extracted reply.

    The prompt is wrapped in a one-shot question/answer template for 'en',
    'es' and 'fr'; for any other language code it is sent unchanged.

    Args:
        prompt: The question text. When empty, a canned
            "Can you help me please?" exchange is substituted before the
            template is applied.
        language: Language code selecting the template and the way the reply
            is extracted from the model output.

    Returns:
        The reply text extracted from the model output.

    Raises:
        RuntimeError: If the API response does not contain generated text
            (for example when it is an error payload).
    """
    print(f"Inside lang_model_response - Prompt is :{prompt}")

    # One-shot templates: (few-shot prefix, answer cue appended after the prompt).
    templates = {
        'en': ("Question: How are you doing today?\n"
               "Answer: I am doing good, thanks.\n"
               "Question: ",
               "Answer: "),
        'es': ("Pregunta: Cómo estás hoy?\n"
               "Responder: Estoy bien, gracias.\n"
               "Pregunta: ",
               "Responder: "),
        'fr': ("Question: Comment vas-tu aujourd'hui?\n"
               "Réponse: Je vais bien, merci.\n"
               "Question: ",
               "Réponse: "),
    }

    if not prompt:
        prompt = ("Question: Can you help me please?\n"
                  "Answer: Sure, I am here for you.\n"
                  "Question: ")

    template = templates.get(language)
    if template is not None:
        prefix, answer_cue = template
        prompt = prefix + prompt + "\n" + answer_cue

    json_ = {
        "inputs": prompt,
        "parameters": {
            "top_p": 0.90,  # 0.90 default
            "max_new_tokens": 64,
            "temperature": 1.1,  # 1.1 default
            "return_full_text": False,
            "do_sample": True,
        },
        "options": {
            "use_cache": True,
            "wait_for_model": True,
        },
    }

    # NOTE(review): no timeout is passed, so a stalled connection blocks this
    # call indefinitely -- consider adding one.
    response = requests.post(API_URL, headers=headers, json=json_)
    output = response.json()
    try:
        output_tmp = output[0]['generated_text']
    except (KeyError, IndexError, TypeError) as err:
        # Anything other than a non-empty list of generation dicts ends up
        # here; surface the payload instead of an opaque lookup error.
        raise RuntimeError(f"Unexpected Bloom API response: {output!r}") from err
    print(f"Bloom API Response is : {output_tmp}")

    solution = _extract_reply(output_tmp, language)
    print(f"Final Bloom Response after splits is: {solution}")
    return solution
# Coqui - Text-to-Speech
def tts(text, language):
    """Synthesize ``text`` with the shared Coqui engine and return the WAV path.

    Args:
        text: The text to speak.
        language: Requested language code; codes outside the supported tuple
            below fall back to 'en'.

    Returns:
        The file name of the temporary ``.wav`` file that was written.
    """
    print(f"Inside tts - language is : {language}")

    supported_codes = (
        'en', 'es', 'fr', 'de', 'pl', 'uk', 'ro',
        'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga',
    )
    voice_lang = language if language in supported_codes else 'en'

    # delete=False keeps the file on disk after it is closed, so the caller
    # can still read it through the returned path.
    wav_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    with wav_file:
        coquiTTS.get_tts(text, wav_file, speaker = {"language" : voice_lang})
    return wav_file.name
#demo = gr.Blocks()
#with demo:
# gr.Markdown("<h1><center>Testing</center></h1>")
# Build the UI and start serving: one microphone input (delivered to
# driver_fun as a file path), four text outputs and one audio output.
# live=True re-runs driver_fun whenever the input changes.
gr.Interface(
    fn=driver_fun,
    title='Testing Whisper',
    inputs=[gr.Audio(source="microphone", type="filepath")],
    outputs=["textbox"] * 4 + ["audio"],
    live=True,
).launch()
|