hexgrad committed on
Commit
6ee662e
·
verified ·
1 Parent(s): 0983170

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -16
app.py CHANGED
@@ -204,7 +204,8 @@ def tokenize(ps):
204
  SAMPLE_RATE = 24000
205
 
206
  @torch.no_grad()
207
- def forward(tokens, voices, speed, device='cpu'):
 
208
  ref_s = torch.mean(torch.stack([VOICES[device][v][len(tokens)] for v in voices]), dim=0)
209
  tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
210
  input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
@@ -229,8 +230,8 @@ def forward(tokens, voices, speed, device='cpu'):
229
  return models[device].decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
230
 
231
  @spaces.GPU(duration=10)
232
- def forward_gpu(tokens, voices, speed):
233
- return forward(tokens, voices, speed, device='cuda')
234
 
235
  def clamp_speed(speed):
236
  if not isinstance(speed, float) and not isinstance(speed, int):
@@ -257,18 +258,18 @@ def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto', sk=N
257
  ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
258
  use_gpu = len(ps) > 99 if use_gpu == 'auto' else use_gpu
259
  if sk != os.environ['SK']:
260
- print('❌', datetime.now(), text, voices, ps, use_gpu)
261
  return (None, '')
262
  try:
263
  if use_gpu:
264
- out = forward_gpu(tokens, voices, speed)
265
  else:
266
- out = forward(tokens, voices, speed)
267
  except gr.exceptions.Error as e:
268
  if use_gpu:
269
  gr.Warning(str(e))
270
  gr.Info('Switching to CPU')
271
- out = forward(tokens, voices, speed)
272
  else:
273
  raise gr.Error(e)
274
  print('🔥', datetime.now(), len(ps), use_gpu, repr(e))
@@ -342,7 +343,8 @@ with gr.Blocks() as basic_tts:
342
  generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu, sk], outputs=[audio, out_ps])
343
 
344
  @torch.no_grad()
345
- def lf_forward(token_lists, voices, speed, device='cpu'):
 
346
  voicepack = torch.mean(torch.stack([VOICES[device][v] for v in voices]), dim=0)
347
  outs = []
348
  for tokens in token_lists:
@@ -371,8 +373,8 @@ def lf_forward(token_lists, voices, speed, device='cpu'):
371
  return outs
372
 
373
  @spaces.GPU
374
- def lf_forward_gpu(token_lists, voices, speed):
375
- return lf_forward(token_lists, voices, speed, device='cuda')
376
 
377
  def resplit_strings(arr):
378
  # Handle edge cases
@@ -426,7 +428,7 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
426
  segments = [row for t in texts for row in recursive_split(t, voice)]
427
  return [(i, *row) for i, row in enumerate(segments)]
428
 
429
- def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True):
430
  token_lists = list(map(tokenize, segments['Tokens']))
431
  voices = resolve_voices(voice)
432
  speed = clamp_speed(speed)
@@ -435,20 +437,23 @@ def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True):
435
  use_gpu = True
436
  batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
437
  i = 0
 
 
 
438
  while i < len(token_lists):
439
  bs = batch_sizes.pop() if batch_sizes else 100
440
  tokens = token_lists[i:i+bs]
441
  print('📖', datetime.now(), len(tokens), voices, use_gpu)
442
  try:
443
  if use_gpu:
444
- outs = lf_forward_gpu(tokens, voices, speed)
445
  else:
446
- outs = lf_forward(tokens, voices, speed)
447
  except gr.exceptions.Error as e:
448
  if use_gpu:
449
  gr.Warning(str(e))
450
  gr.Info('Switching to CPU')
451
- outs = lf_forward(tokens, voices, speed)
452
  use_gpu = False
453
  elif outs:
454
  gr.Warning(repr(e))
@@ -513,8 +518,11 @@ with gr.Blocks() as lf_tts:
513
  with gr.Row():
514
  segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
515
  segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
 
 
 
516
  segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
517
- generate_event = generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu], outputs=[audio_stream])
518
  stop_btn.click(fn=None, cancels=generate_event)
519
 
520
  with gr.Blocks() as about:
@@ -539,7 +547,8 @@ Vast was chosen over other compute providers due to its competitive on-demand ho
539
  The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
540
 
541
  ### Gradio API
542
- This Space can be used via API. The following code block can be copied and run in one Google Colab cell.
 
543
  ```
544
  # 1️⃣ Install the Gradio Python client
545
  !pip install -q gradio_client
@@ -569,6 +578,7 @@ Random Japanese texts: CC0 public domain from [Common Voice](https://github.com/
569
  with gr.Blocks() as changelog:
570
  gr.Markdown('''
571
  **28 Nov 2024**<br/>
 
572
  🌊 Long Form streaming and stop button
573
 
574
  **25 Nov 2024**<br/>
 
204
  SAMPLE_RATE = 24000
205
 
206
  @torch.no_grad()
207
+ def forward(tokens, voices, speed, sk, device='cpu'):
208
+ assert sk == os.environ['SK'], sk
209
  ref_s = torch.mean(torch.stack([VOICES[device][v][len(tokens)] for v in voices]), dim=0)
210
  tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
211
  input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
 
230
  return models[device].decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
231
 
232
  @spaces.GPU(duration=10)
233
+ def forward_gpu(tokens, voices, speed, sk):
234
+ return forward(tokens, voices, speed, sk, device='cuda')
235
 
236
  def clamp_speed(speed):
237
  if not isinstance(speed, float) and not isinstance(speed, int):
 
258
  ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
259
  use_gpu = len(ps) > 99 if use_gpu == 'auto' else use_gpu
260
  if sk != os.environ['SK']:
261
+ print('❌', datetime.now(), text, voices, ps, sk)
262
  return (None, '')
263
  try:
264
  if use_gpu:
265
+ out = forward_gpu(tokens, voices, speed, sk)
266
  else:
267
+ out = forward(tokens, voices, speed, sk)
268
  except gr.exceptions.Error as e:
269
  if use_gpu:
270
  gr.Warning(str(e))
271
  gr.Info('Switching to CPU')
272
+ out = forward(tokens, voices, speed, sk)
273
  else:
274
  raise gr.Error(e)
275
  print('🔥', datetime.now(), len(ps), use_gpu, repr(e))
 
343
  generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu, sk], outputs=[audio, out_ps])
344
 
345
  @torch.no_grad()
346
+ def lf_forward(token_lists, voices, speed, sk, device='cpu'):
347
+ assert sk == os.environ['SK'], sk
348
  voicepack = torch.mean(torch.stack([VOICES[device][v] for v in voices]), dim=0)
349
  outs = []
350
  for tokens in token_lists:
 
373
  return outs
374
 
375
  @spaces.GPU
376
+ def lf_forward_gpu(token_lists, voices, speed, sk):
377
+ return lf_forward(token_lists, voices, speed, sk, device='cuda')
378
 
379
  def resplit_strings(arr):
380
  # Handle edge cases
 
428
  segments = [row for t in texts for row in recursive_split(t, voice)]
429
  return [(i, *row) for i, row in enumerate(segments)]
430
 
431
+ def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, sk=None):
432
  token_lists = list(map(tokenize, segments['Tokens']))
433
  voices = resolve_voices(voice)
434
  speed = clamp_speed(speed)
 
437
  use_gpu = True
438
  batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
439
  i = 0
440
+ if sk != os.environ['SK']:
441
+ print('❌', datetime.now(), len(segments), voices, sk)
442
+ return
443
  while i < len(token_lists):
444
  bs = batch_sizes.pop() if batch_sizes else 100
445
  tokens = token_lists[i:i+bs]
446
  print('📖', datetime.now(), len(tokens), voices, use_gpu)
447
  try:
448
  if use_gpu:
449
+ outs = lf_forward_gpu(tokens, voices, speed, sk)
450
  else:
451
+ outs = lf_forward(tokens, voices, speed, sk)
452
  except gr.exceptions.Error as e:
453
  if use_gpu:
454
  gr.Warning(str(e))
455
  gr.Info('Switching to CPU')
456
+ outs = lf_forward(tokens, voices, speed, sk)
457
  use_gpu = False
458
  elif outs:
459
  gr.Warning(repr(e))
 
518
  with gr.Row():
519
  segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
520
  segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
521
+ with gr.Row():
522
+ sk = gr.Textbox(visible=False)
523
+ segments.change(lambda: os.environ['SK'], outputs=[sk])
524
  segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
525
+ generate_event = generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu, sk], outputs=[audio_stream])
526
  stop_btn.click(fn=None, cancels=generate_event)
527
 
528
  with gr.Blocks() as about:
 
547
  The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
548
 
549
  ### Gradio API
550
+ **The API has been restricted due to high request volume degrading the demo experience.**
551
+ ~~This Space can be used via API. The following code block can be copied and run in one Google Colab cell.~~
552
  ```
553
  # 1️⃣ Install the Gradio Python client
554
  !pip install -q gradio_client
 
578
  with gr.Blocks() as changelog:
579
  gr.Markdown('''
580
  **28 Nov 2024**<br/>
581
+ 🥈 CPU fallback<br/>
582
  🌊 Long Form streaming and stop button
583
 
584
  **25 Nov 2024**<br/>