llasa-3b-tts

Runtime error

srinivasbilla committed on 14 days ago

Commit

d86fff3

verified ·

1 Parent(s): a489b73

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import spaces
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
 import soundfile as sf
@@ -33,7 +32,6 @@ whisper_turbo_pipe = pipeline(
     device='cuda',
 )
 def ids_to_speech_tokens(speech_ids):
     speech_tokens_str = []
@@ -55,9 +53,9 @@ def extract_speech_ids(speech_tokens_str):
     return speech_ids
 @spaces.GPU(duration=120)
-def infer(sample_audio_path, target_text):
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
         waveform, sample_rate = torchaudio.load(sample_audio_path)
         if len(waveform[0])/sample_rate > 15:
             gr.Warning("Trimming audio to first 15secs.")
@@ -73,6 +71,7 @@ def infer(sample_audio_path, target_text):
         prompt_wav = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform_mono)
         prompt_text = whisper_turbo_pipe(prompt_wav[0].numpy())['text'].strip()
         input_text = prompt_text + ' ' + target_text
@@ -127,7 +126,9 @@ def infer(sample_audio_path, target_text):
             # if only need the generated part
             gen_wav = gen_wav[:,:,prompt_wav.shape[1]:]
-        return gen_wav[0, 0, :].cpu().numpy()
 with gr.Blocks() as app_tts:
     gr.Markdown("# Zero Shot Voice Clone TTS")

 import spaces
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
 import soundfile as sf
     device='cuda',
 )
 def ids_to_speech_tokens(speech_ids):
     speech_tokens_str = []
     return speech_ids
 @spaces.GPU(duration=120)
+def infer(sample_audio_path, target_text, progress=gr.Progress()):
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+        progress(0, 'Loading and trimming audio...')
         waveform, sample_rate = torchaudio.load(sample_audio_path)
         if len(waveform[0])/sample_rate > 15:
             gr.Warning("Trimming audio to first 15secs.")
         prompt_wav = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform_mono)
         prompt_text = whisper_turbo_pipe(prompt_wav[0].numpy())['text'].strip()
+        progress(0.5, 'Transcribed! Generating speech...')
         input_text = prompt_text + ' ' + target_text
             # if only need the generated part
             gen_wav = gen_wav[:,:,prompt_wav.shape[1]:]
+            progress(1, 'Synthesized!')
+        return (16000, gen_wav[0, 0, :].cpu().numpy())
 with gr.Blocks() as app_tts:
     gr.Markdown("# Zero Shot Voice Clone TTS")