srinivasbilla committed on
Commit
d86fff3
·
verified ·
1 Parent(s): a489b73

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -5
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import spaces
2
-
3
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
4
  import torch
5
  import soundfile as sf
@@ -33,7 +32,6 @@ whisper_turbo_pipe = pipeline(
33
  device='cuda',
34
  )
35
 
36
-
37
  def ids_to_speech_tokens(speech_ids):
38
 
39
  speech_tokens_str = []
@@ -55,9 +53,9 @@ def extract_speech_ids(speech_tokens_str):
55
  return speech_ids
56
 
57
  @spaces.GPU(duration=120)
58
- def infer(sample_audio_path, target_text):
59
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
60
-
61
  waveform, sample_rate = torchaudio.load(sample_audio_path)
62
  if len(waveform[0])/sample_rate > 15:
63
  gr.Warning("Trimming audio to first 15secs.")
@@ -73,6 +71,7 @@ def infer(sample_audio_path, target_text):
73
 
74
  prompt_wav = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform_mono)
75
  prompt_text = whisper_turbo_pipe(prompt_wav[0].numpy())['text'].strip()
 
76
 
77
  input_text = prompt_text + ' ' + target_text
78
 
@@ -127,7 +126,9 @@ def infer(sample_audio_path, target_text):
127
  # if only need the generated part
128
  gen_wav = gen_wav[:,:,prompt_wav.shape[1]:]
129
 
130
- return gen_wav[0, 0, :].cpu().numpy()
 
 
131
 
132
  with gr.Blocks() as app_tts:
133
  gr.Markdown("# Zero Shot Voice Clone TTS")
 
1
  import spaces
 
2
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
3
  import torch
4
  import soundfile as sf
 
32
  device='cuda',
33
  )
34
 
 
35
  def ids_to_speech_tokens(speech_ids):
36
 
37
  speech_tokens_str = []
 
53
  return speech_ids
54
 
55
  @spaces.GPU(duration=120)
56
+ def infer(sample_audio_path, target_text, progress=gr.Progress()):
57
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
58
+ progress(0, 'Loading and trimming audio...')
59
  waveform, sample_rate = torchaudio.load(sample_audio_path)
60
  if len(waveform[0])/sample_rate > 15:
61
  gr.Warning("Trimming audio to first 15secs.")
 
71
 
72
  prompt_wav = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform_mono)
73
  prompt_text = whisper_turbo_pipe(prompt_wav[0].numpy())['text'].strip()
74
+ progress(0.5, 'Transcribed! Generating speech...')
75
 
76
  input_text = prompt_text + ' ' + target_text
77
 
 
126
  # if only need the generated part
127
  gen_wav = gen_wav[:,:,prompt_wav.shape[1]:]
128
 
129
+ progress(1, 'Synthesized!')
130
+
131
+ return (16000, gen_wav[0, 0, :].cpu().numpy())
132
 
133
  with gr.Blocks() as app_tts:
134
  gr.Markdown("# Zero Shot Voice Clone TTS")