Spaces:
Running
on
Zero
Running
on
Zero
Upload app.py
Browse files
app.py
CHANGED
@@ -204,7 +204,8 @@ def tokenize(ps):
|
|
204 |
SAMPLE_RATE = 24000
|
205 |
|
206 |
@torch.no_grad()
|
207 |
-
def forward(tokens, voices, speed, device='cpu'):
|
|
|
208 |
ref_s = torch.mean(torch.stack([VOICES[device][v][len(tokens)] for v in voices]), dim=0)
|
209 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
|
210 |
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
|
@@ -229,8 +230,8 @@ def forward(tokens, voices, speed, device='cpu'):
|
|
229 |
return models[device].decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
|
230 |
|
231 |
@spaces.GPU(duration=10)
|
232 |
-
def forward_gpu(tokens, voices, speed):
|
233 |
-
return forward(tokens, voices, speed, device='cuda')
|
234 |
|
235 |
def clamp_speed(speed):
|
236 |
if not isinstance(speed, float) and not isinstance(speed, int):
|
@@ -257,18 +258,18 @@ def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto', sk=N
|
|
257 |
ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
|
258 |
use_gpu = len(ps) > 99 if use_gpu == 'auto' else use_gpu
|
259 |
if sk != os.environ['SK']:
|
260 |
-
print('❌', datetime.now(), text, voices, ps,
|
261 |
return (None, '')
|
262 |
try:
|
263 |
if use_gpu:
|
264 |
-
out = forward_gpu(tokens, voices, speed)
|
265 |
else:
|
266 |
-
out = forward(tokens, voices, speed)
|
267 |
except gr.exceptions.Error as e:
|
268 |
if use_gpu:
|
269 |
gr.Warning(str(e))
|
270 |
gr.Info('Switching to CPU')
|
271 |
-
out = forward(tokens, voices, speed)
|
272 |
else:
|
273 |
raise gr.Error(e)
|
274 |
print('🔥', datetime.now(), len(ps), use_gpu, repr(e))
|
@@ -342,7 +343,8 @@ with gr.Blocks() as basic_tts:
|
|
342 |
generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu, sk], outputs=[audio, out_ps])
|
343 |
|
344 |
@torch.no_grad()
|
345 |
-
def lf_forward(token_lists, voices, speed, device='cpu'):
|
|
|
346 |
voicepack = torch.mean(torch.stack([VOICES[device][v] for v in voices]), dim=0)
|
347 |
outs = []
|
348 |
for tokens in token_lists:
|
@@ -371,8 +373,8 @@ def lf_forward(token_lists, voices, speed, device='cpu'):
|
|
371 |
return outs
|
372 |
|
373 |
@spaces.GPU
|
374 |
-
def lf_forward_gpu(token_lists, voices, speed):
|
375 |
-
return lf_forward(token_lists, voices, speed, device='cuda')
|
376 |
|
377 |
def resplit_strings(arr):
|
378 |
# Handle edge cases
|
@@ -426,7 +428,7 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
|
|
426 |
segments = [row for t in texts for row in recursive_split(t, voice)]
|
427 |
return [(i, *row) for i, row in enumerate(segments)]
|
428 |
|
429 |
-
def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True):
|
430 |
token_lists = list(map(tokenize, segments['Tokens']))
|
431 |
voices = resolve_voices(voice)
|
432 |
speed = clamp_speed(speed)
|
@@ -435,20 +437,23 @@ def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True):
|
|
435 |
use_gpu = True
|
436 |
batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
|
437 |
i = 0
|
|
|
|
|
|
|
438 |
while i < len(token_lists):
|
439 |
bs = batch_sizes.pop() if batch_sizes else 100
|
440 |
tokens = token_lists[i:i+bs]
|
441 |
print('📖', datetime.now(), len(tokens), voices, use_gpu)
|
442 |
try:
|
443 |
if use_gpu:
|
444 |
-
outs = lf_forward_gpu(tokens, voices, speed)
|
445 |
else:
|
446 |
-
outs = lf_forward(tokens, voices, speed)
|
447 |
except gr.exceptions.Error as e:
|
448 |
if use_gpu:
|
449 |
gr.Warning(str(e))
|
450 |
gr.Info('Switching to CPU')
|
451 |
-
outs = lf_forward(tokens, voices, speed)
|
452 |
use_gpu = False
|
453 |
elif outs:
|
454 |
gr.Warning(repr(e))
|
@@ -513,8 +518,11 @@ with gr.Blocks() as lf_tts:
|
|
513 |
with gr.Row():
|
514 |
segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
|
515 |
segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
|
|
|
|
|
|
|
516 |
segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
|
517 |
-
generate_event = generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu], outputs=[audio_stream])
|
518 |
stop_btn.click(fn=None, cancels=generate_event)
|
519 |
|
520 |
with gr.Blocks() as about:
|
@@ -539,7 +547,8 @@ Vast was chosen over other compute providers due to its competitive on-demand ho
|
|
539 |
The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
|
540 |
|
541 |
### Gradio API
|
542 |
-
|
|
|
543 |
```
|
544 |
# 1️⃣ Install the Gradio Python client
|
545 |
!pip install -q gradio_client
|
@@ -569,6 +578,7 @@ Random Japanese texts: CC0 public domain from [Common Voice](https://github.com/
|
|
569 |
with gr.Blocks() as changelog:
|
570 |
gr.Markdown('''
|
571 |
**28 Nov 2024**<br/>
|
|
|
572 |
🌊 Long Form streaming and stop button
|
573 |
|
574 |
**25 Nov 2024**<br/>
|
|
|
204 |
SAMPLE_RATE = 24000
|
205 |
|
206 |
@torch.no_grad()
|
207 |
+
def forward(tokens, voices, speed, sk, device='cpu'):
|
208 |
+
assert sk == os.environ['SK'], sk
|
209 |
ref_s = torch.mean(torch.stack([VOICES[device][v][len(tokens)] for v in voices]), dim=0)
|
210 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
|
211 |
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
|
|
|
230 |
return models[device].decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
|
231 |
|
232 |
@spaces.GPU(duration=10)
|
233 |
+
def forward_gpu(tokens, voices, speed, sk):
|
234 |
+
return forward(tokens, voices, speed, sk, device='cuda')
|
235 |
|
236 |
def clamp_speed(speed):
|
237 |
if not isinstance(speed, float) and not isinstance(speed, int):
|
|
|
258 |
ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
|
259 |
use_gpu = len(ps) > 99 if use_gpu == 'auto' else use_gpu
|
260 |
if sk != os.environ['SK']:
|
261 |
+
print('❌', datetime.now(), text, voices, ps, sk)
|
262 |
return (None, '')
|
263 |
try:
|
264 |
if use_gpu:
|
265 |
+
out = forward_gpu(tokens, voices, speed, sk)
|
266 |
else:
|
267 |
+
out = forward(tokens, voices, speed, sk)
|
268 |
except gr.exceptions.Error as e:
|
269 |
if use_gpu:
|
270 |
gr.Warning(str(e))
|
271 |
gr.Info('Switching to CPU')
|
272 |
+
out = forward(tokens, voices, speed, sk)
|
273 |
else:
|
274 |
raise gr.Error(e)
|
275 |
print('🔥', datetime.now(), len(ps), use_gpu, repr(e))
|
|
|
343 |
generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu, sk], outputs=[audio, out_ps])
|
344 |
|
345 |
@torch.no_grad()
|
346 |
+
def lf_forward(token_lists, voices, speed, sk, device='cpu'):
|
347 |
+
assert sk == os.environ['SK'], sk
|
348 |
voicepack = torch.mean(torch.stack([VOICES[device][v] for v in voices]), dim=0)
|
349 |
outs = []
|
350 |
for tokens in token_lists:
|
|
|
373 |
return outs
|
374 |
|
375 |
@spaces.GPU
|
376 |
+
def lf_forward_gpu(token_lists, voices, speed, sk):
|
377 |
+
return lf_forward(token_lists, voices, speed, sk, device='cuda')
|
378 |
|
379 |
def resplit_strings(arr):
|
380 |
# Handle edge cases
|
|
|
428 |
segments = [row for t in texts for row in recursive_split(t, voice)]
|
429 |
return [(i, *row) for i, row in enumerate(segments)]
|
430 |
|
431 |
+
def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True, sk=None):
|
432 |
token_lists = list(map(tokenize, segments['Tokens']))
|
433 |
voices = resolve_voices(voice)
|
434 |
speed = clamp_speed(speed)
|
|
|
437 |
use_gpu = True
|
438 |
batch_sizes = [89, 55, 34, 21, 13, 8, 5, 3, 2, 1, 1]
|
439 |
i = 0
|
440 |
+
if sk != os.environ['SK']:
|
441 |
+
print('❌', datetime.now(), len(segments), voices, sk)
|
442 |
+
return
|
443 |
while i < len(token_lists):
|
444 |
bs = batch_sizes.pop() if batch_sizes else 100
|
445 |
tokens = token_lists[i:i+bs]
|
446 |
print('📖', datetime.now(), len(tokens), voices, use_gpu)
|
447 |
try:
|
448 |
if use_gpu:
|
449 |
+
outs = lf_forward_gpu(tokens, voices, speed, sk)
|
450 |
else:
|
451 |
+
outs = lf_forward(tokens, voices, speed, sk)
|
452 |
except gr.exceptions.Error as e:
|
453 |
if use_gpu:
|
454 |
gr.Warning(str(e))
|
455 |
gr.Info('Switching to CPU')
|
456 |
+
outs = lf_forward(tokens, voices, speed, sk)
|
457 |
use_gpu = False
|
458 |
elif outs:
|
459 |
gr.Warning(repr(e))
|
|
|
518 |
with gr.Row():
|
519 |
segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
|
520 |
segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
|
521 |
+
with gr.Row():
|
522 |
+
sk = gr.Textbox(visible=False)
|
523 |
+
segments.change(lambda: os.environ['SK'], outputs=[sk])
|
524 |
segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
|
525 |
+
generate_event = generate_btn.click(lf_generate, inputs=[segments, voice, speed, trim, pad_between, use_gpu, sk], outputs=[audio_stream])
|
526 |
stop_btn.click(fn=None, cancels=generate_event)
|
527 |
|
528 |
with gr.Blocks() as about:
|
|
|
547 |
The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
|
548 |
|
549 |
### Gradio API
|
550 |
+
**The API has been restricted due to high request volume degrading the demo experience.**
|
551 |
+
~~This Space can be used via API. The following code block can be copied and run in one Google Colab cell.~~
|
552 |
```
|
553 |
# 1️⃣ Install the Gradio Python client
|
554 |
!pip install -q gradio_client
|
|
|
578 |
with gr.Blocks() as changelog:
|
579 |
gr.Markdown('''
|
580 |
**28 Nov 2024**<br/>
|
581 |
+
🥈 CPU fallback<br/>
|
582 |
🌊 Long Form streaming and stop button
|
583 |
|
584 |
**25 Nov 2024**<br/>
|