Spaces:
Runtime error
Runtime error
srinivasbilla
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
import spaces
|
2 |
-
|
3 |
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
4 |
import torch
|
5 |
import soundfile as sf
|
@@ -33,7 +32,6 @@ whisper_turbo_pipe = pipeline(
|
|
33 |
device='cuda',
|
34 |
)
|
35 |
|
36 |
-
|
37 |
def ids_to_speech_tokens(speech_ids):
|
38 |
|
39 |
speech_tokens_str = []
|
@@ -55,9 +53,9 @@ def extract_speech_ids(speech_tokens_str):
|
|
55 |
return speech_ids
|
56 |
|
57 |
@spaces.GPU(duration=120)
|
58 |
-
def infer(sample_audio_path, target_text):
|
59 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
60 |
-
|
61 |
waveform, sample_rate = torchaudio.load(sample_audio_path)
|
62 |
if len(waveform[0])/sample_rate > 15:
|
63 |
gr.Warning("Trimming audio to first 15secs.")
|
@@ -73,6 +71,7 @@ def infer(sample_audio_path, target_text):
|
|
73 |
|
74 |
prompt_wav = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform_mono)
|
75 |
prompt_text = whisper_turbo_pipe(prompt_wav[0].numpy())['text'].strip()
|
|
|
76 |
|
77 |
input_text = prompt_text + ' ' + target_text
|
78 |
|
@@ -127,7 +126,9 @@ def infer(sample_audio_path, target_text):
|
|
127 |
# if only need the generated part
|
128 |
gen_wav = gen_wav[:,:,prompt_wav.shape[1]:]
|
129 |
|
130 |
-
|
|
|
|
|
131 |
|
132 |
with gr.Blocks() as app_tts:
|
133 |
gr.Markdown("# Zero Shot Voice Clone TTS")
|
|
|
1 |
import spaces
|
|
|
2 |
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
3 |
import torch
|
4 |
import soundfile as sf
|
|
|
32 |
device='cuda',
|
33 |
)
|
34 |
|
|
|
35 |
def ids_to_speech_tokens(speech_ids):
|
36 |
|
37 |
speech_tokens_str = []
|
|
|
53 |
return speech_ids
|
54 |
|
55 |
@spaces.GPU(duration=120)
|
56 |
+
def infer(sample_audio_path, target_text, progress=gr.Progress()):
|
57 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
58 |
+
progress(0, 'Loading and trimming audio...')
|
59 |
waveform, sample_rate = torchaudio.load(sample_audio_path)
|
60 |
if len(waveform[0])/sample_rate > 15:
|
61 |
gr.Warning("Trimming audio to first 15secs.")
|
|
|
71 |
|
72 |
prompt_wav = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform_mono)
|
73 |
prompt_text = whisper_turbo_pipe(prompt_wav[0].numpy())['text'].strip()
|
74 |
+
progress(0.5, 'Transcribed! Generating speech...')
|
75 |
|
76 |
input_text = prompt_text + ' ' + target_text
|
77 |
|
|
|
126 |
# if only need the generated part
|
127 |
gen_wav = gen_wav[:,:,prompt_wav.shape[1]:]
|
128 |
|
129 |
+
progress(1, 'Synthesized!')
|
130 |
+
|
131 |
+
return (16000, gen_wav[0, 0, :].cpu().numpy())
|
132 |
|
133 |
with gr.Blocks() as app_tts:
|
134 |
gr.Markdown("# Zero Shot Voice Clone TTS")
|