hexgrad committed on
Commit
2acb6ca
·
verified ·
1 Parent(s): 76f722d

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -26
app.py CHANGED
@@ -157,7 +157,7 @@ def forward(tokens, voice, speed):
157
  asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
158
  return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
159
 
160
- def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000):
161
  ps = ps or phonemize(text, voice)
162
  tokens = tokenize(ps)
163
  if not tokens:
@@ -172,18 +172,24 @@ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000
172
  return (None, '')
173
  if reduce_noise > 0:
174
  out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
175
- opening_cut = max(0, int(opening_cut / speed))
176
  if opening_cut > 0:
177
- out[:opening_cut] = 0
178
- closing_cut = max(0, int(closing_cut / speed))
179
  if closing_cut > 0:
180
- out[-closing_cut:] = 0
181
- ease_in = min(int(ease_in / speed), len(out)//2 - opening_cut)
182
  for i in range(ease_in):
183
- out[i+opening_cut] *= s_curve(i / ease_in)
184
- ease_out = min(int(ease_out / speed), len(out)//2 - closing_cut)
185
  for i in range(ease_out):
186
- out[-i-1-closing_cut] *= s_curve(i / ease_out)
 
 
 
 
 
 
187
  return ((SAMPLE_RATE, out), ps)
188
 
189
  with gr.Blocks() as basic_tts:
@@ -212,15 +218,20 @@ with gr.Blocks() as basic_tts:
212
  speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
213
  with gr.Row():
214
  with gr.Column():
215
- opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️ Zero out this many samples at the start.')
216
  with gr.Column():
217
- closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️ Zero out this many samples at the end.')
218
  with gr.Row():
219
  with gr.Column():
220
  ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
221
  with gr.Column():
222
  ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
223
- generate_btn.click(generate, inputs=[text, voice, in_ps, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out], outputs=[audio, out_ps])
 
 
 
 
 
224
 
225
  @spaces.GPU
226
  @torch.no_grad()
@@ -303,12 +314,13 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
303
  segments = [row for t in texts for row in recursive_split(t, voice)]
304
  return [(i, *row) for i, row in enumerate(segments)]
305
 
306
- def lf_generate(segments, voice, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad=5000):
307
  token_lists = list(map(tokenize, segments['Tokens']))
308
  wavs = []
309
  opening_cut = max(0, int(opening_cut / speed))
310
  closing_cut = max(0, int(closing_cut / speed))
311
- pad = max(0, int(pad / speed))
 
312
  batch_size = 100
313
  for i in range(0, len(token_lists), batch_size):
314
  try:
@@ -323,18 +335,20 @@ def lf_generate(segments, voice, speed=1.0, reduce_noise=0.5, opening_cut=4000,
323
  if reduce_noise > 0:
324
  out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
325
  if opening_cut > 0:
326
- out[:opening_cut] = 0
327
  if closing_cut > 0:
328
- out[-closing_cut:] = 0
329
- ease_in = min(int(ease_in / speed), len(out)//2 - opening_cut)
330
  for i in range(ease_in):
331
- out[i+opening_cut] *= s_curve(i / ease_in)
332
- ease_out = min(int(ease_out / speed), len(out)//2 - closing_cut)
333
  for i in range(ease_out):
334
- out[-i-1-closing_cut] *= s_curve(i / ease_out)
335
- if wavs and pad > 0:
336
- wavs.append(np.zeros(pad))
337
  wavs.append(out)
 
 
338
  return (SAMPLE_RATE, np.concatenate(wavs)) if wavs else None
339
 
340
  def did_change_segments(segments):
@@ -376,21 +390,24 @@ with gr.Blocks() as lf_tts:
376
  speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
377
  with gr.Row():
378
  with gr.Column():
379
- opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️ Zero out this many samples at the start.')
380
  with gr.Column():
381
- closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️ Zero out this many samples at the end.')
382
  with gr.Row():
383
  with gr.Column():
384
  ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
385
  with gr.Column():
386
  ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
387
  with gr.Row():
388
- pad = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad', info='🔇 How many samples of silence to insert between segments.')
 
 
 
389
  with gr.Row():
390
  segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
391
  segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
392
  segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
393
- generate_btn.click(lf_generate, inputs=[segments, voice, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out, pad], outputs=[audio])
394
 
395
  with gr.Blocks() as app:
396
  gr.TabbedInterface(
 
157
  asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
158
  return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
159
 
160
+ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=5000, pad_after=5000):
161
  ps = ps or phonemize(text, voice)
162
  tokens = tokenize(ps)
163
  if not tokens:
 
172
  return (None, '')
173
  if reduce_noise > 0:
174
  out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
175
+ opening_cut = int(opening_cut / speed)
176
  if opening_cut > 0:
177
+ out = out[opening_cut:]
178
+ closing_cut = int(closing_cut / speed)
179
  if closing_cut > 0:
180
+ out = out[:-closing_cut]
181
+ ease_in = min(int(ease_in / speed), len(out)//2)
182
  for i in range(ease_in):
183
+ out[i] *= s_curve(i / ease_in)
184
+ ease_out = min(int(ease_out / speed), len(out)//2)
185
  for i in range(ease_out):
186
+ out[-i-1] *= s_curve(i / ease_out)
187
+ pad_before = max(0, int(pad_before / speed))
188
+ if pad_before > 0:
189
+ out = np.concatenate([np.zeros(pad_before), out])
190
+ pad_after = max(0, int(pad_after / speed))
191
+ if pad_after > 0:
192
+ out = np.concatenate([out, np.zeros(pad_after)])
193
  return ((SAMPLE_RATE, out), ps)
194
 
195
  with gr.Blocks() as basic_tts:
 
218
  speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
219
  with gr.Row():
220
  with gr.Column():
221
+ opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️ Cut this many samples from the start.')
222
  with gr.Column():
223
+ closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️ Cut this many samples from the end.')
224
  with gr.Row():
225
  with gr.Column():
226
  ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
227
  with gr.Column():
228
  ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
229
+ with gr.Row():
230
+ with gr.Column():
231
+ pad_before = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad Before', info='🔇 How many samples of silence to insert before the start.')
232
+ with gr.Column():
233
+ pad_after = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad After', info='🔇 How many samples of silence to append after the end.')
234
+ generate_btn.click(generate, inputs=[text, voice, in_ps, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
235
 
236
  @spaces.GPU
237
  @torch.no_grad()
 
314
  segments = [row for t in texts for row in recursive_split(t, voice)]
315
  return [(i, *row) for i, row in enumerate(segments)]
316
 
317
+ def lf_generate(segments, voice, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=5000, pad_after=5000):
318
  token_lists = list(map(tokenize, segments['Tokens']))
319
  wavs = []
320
  opening_cut = max(0, int(opening_cut / speed))
321
  closing_cut = max(0, int(closing_cut / speed))
322
+ pad_before = max(0, int(pad_before / speed))
323
+ pad_after = max(0, int(pad_after / speed))
324
  batch_size = 100
325
  for i in range(0, len(token_lists), batch_size):
326
  try:
 
335
  if reduce_noise > 0:
336
  out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
337
  if opening_cut > 0:
338
+ out = out[opening_cut:]
339
  if closing_cut > 0:
340
+ out = out[:-closing_cut]
341
+ ease_in = min(int(ease_in / speed), len(out)//2)
342
  for i in range(ease_in):
343
+ out[i] *= s_curve(i / ease_in)
344
+ ease_out = min(int(ease_out / speed), len(out)//2)
345
  for i in range(ease_out):
346
+ out[-i-1] *= s_curve(i / ease_out)
347
+ if pad_before > 0:
348
+ wavs.append(np.zeros(pad_before))
349
  wavs.append(out)
350
+ if pad_after > 0:
351
+ wavs.append(np.zeros(pad_after))
352
  return (SAMPLE_RATE, np.concatenate(wavs)) if wavs else None
353
 
354
  def did_change_segments(segments):
 
390
  speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
391
  with gr.Row():
392
  with gr.Column():
393
+ opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️ Cut this many samples from the start.')
394
  with gr.Column():
395
+ closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️ Cut this many samples from the end.')
396
  with gr.Row():
397
  with gr.Column():
398
  ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
399
  with gr.Column():
400
  ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
401
  with gr.Row():
402
+ with gr.Column():
403
+ pad_before = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad Before', info='🔇 How many samples of silence to insert before each segment.')
404
+ with gr.Column():
405
+ pad_after = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad After', info='🔇 How many samples of silence to append after each segment.')
406
  with gr.Row():
407
  segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
408
  segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
409
  segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
410
+ generate_btn.click(lf_generate, inputs=[segments, voice, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio])
411
 
412
  with gr.Blocks() as app:
413
  gr.TabbedInterface(