Spaces:
Running
on
Zero
Running
on
Zero
Upload app.py
Browse files
app.py
CHANGED
@@ -142,17 +142,31 @@ VOCAB = get_vocab()
|
|
142 |
def tokenize(ps):
|
143 |
return [i for i in map(VOCAB.get, ps) if i is not None]
|
144 |
|
145 |
-
#
|
146 |
CHOICES = {
|
147 |
'🇺🇸 🚺 American Female ⭐': 'af',
|
148 |
-
'🇺🇸 🚺
|
149 |
-
'🇺🇸 🚺
|
150 |
-
'🇺🇸 🚺
|
151 |
-
'🇺🇸 🚺
|
152 |
-
'🇺🇸
|
153 |
-
'🇺🇸
|
154 |
-
'
|
155 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
}
|
157 |
VOICES = {device: {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt'), weights_only=True).to(device) for k in CHOICES.values()} for device in models}
|
158 |
|
@@ -233,7 +247,7 @@ with gr.Blocks() as basic_tts:
|
|
233 |
with gr.Column():
|
234 |
text = gr.Textbox(label='Input Text', info='Generate speech for one segment of text using Kokoro, a TTS model with 80 million parameters')
|
235 |
with gr.Row():
|
236 |
-
voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice', info='
|
237 |
use_gpu = gr.Dropdown(
|
238 |
USE_GPU_CHOICES,
|
239 |
value='auto' if CUDA_AVAILABLE else False,
|
@@ -401,7 +415,7 @@ with gr.Blocks() as lf_tts:
|
|
401 |
text = gr.Textbox(label='Input Text', info='Generate speech in batches of 100 text segments and automatically join them together')
|
402 |
file_input.upload(fn=extract_text, inputs=[file_input], outputs=[text])
|
403 |
with gr.Row():
|
404 |
-
voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice', info='
|
405 |
use_gpu = gr.Dropdown(
|
406 |
[('ZeroGPU 🚀', True), ('CPU 🐌', False)],
|
407 |
value=CUDA_AVAILABLE,
|
@@ -435,7 +449,7 @@ Kokoro is a frontier TTS model for its size. It has 80 million parameters,<sup>[
|
|
435 |
#### Will this be open sourced?
|
436 |
There currently isn't a release date scheduled for the weights. The inference code in this space is MIT licensed. The architecture was already published by Li et al, with MIT licensed code and pretrained weights.<sup>[2]</sup>
|
437 |
|
438 |
-
#### What is
|
439 |
An unstable voice is more likely to stumble or produce unnatural artifacts, especially on short or strange texts.
|
440 |
|
441 |
#### How can CPU be faster than ZeroGPU?
|
|
|
142 |
def tokenize(ps):
|
143 |
return [i for i in map(VOCAB.get, ps) if i is not None]
|
144 |
|
145 |
+
# Starred voices are more stable
|
146 |
CHOICES = {
|
147 |
'🇺🇸 🚺 American Female ⭐': 'af',
|
148 |
+
'🇺🇸 🚺 Alloy': 'af_alloy',
|
149 |
+
'🇺🇸 🚺 Bella ⭐': 'af_bella',
|
150 |
+
'🇺🇸 🚺 Jessica': 'af_jessica',
|
151 |
+
'🇺🇸 🚺 Nicole ⭐': 'af_nicole',
|
152 |
+
'🇺🇸 🚺 Nova': 'af_nova',
|
153 |
+
'🇺🇸 🚺 River': 'af_river',
|
154 |
+
'🇺🇸 🚺 Sarah ⭐': 'af_sarah',
|
155 |
+
'🇺🇸 🚺 Shimmer': 'af_shimmer',
|
156 |
+
'🇺🇸 🚺 Sky': 'af_sky',
|
157 |
+
'🇺🇸 🚹 Adam': 'am_adam',
|
158 |
+
'🇺🇸 🚹 Echo': 'am_echo',
|
159 |
+
'🇺🇸 🚹 Eric': 'am_eric',
|
160 |
+
'🇺🇸 🚹 Liam': 'am_liam',
|
161 |
+
'🇺🇸 🚹 Michael ⭐': 'am_michael',
|
162 |
+
'🇺🇸 🚹 Onyx': 'am_onyx',
|
163 |
+
'🇬🇧 🚺 Alice': 'bf_alice',
|
164 |
+
'🇬🇧 🚺 Lily': 'bf_lily',
|
165 |
+
'🇬🇧 🚹 Daniel': 'bm_daniel',
|
166 |
+
'🇬🇧 🚹 Fable': 'bm_fable',
|
167 |
+
'🇬🇧 🚹 George': 'bm_george',
|
168 |
+
'🇬🇧 🚹 Lewis': 'bm_lewis',
|
169 |
+
'🇯🇵 🚺 Japanese Female ⭐': 'jf_0',
|
170 |
}
|
171 |
VOICES = {device: {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt'), weights_only=True).to(device) for k in CHOICES.values()} for device in models}
|
172 |
|
|
|
247 |
with gr.Column():
|
248 |
text = gr.Textbox(label='Input Text', info='Generate speech for one segment of text using Kokoro, a TTS model with 80 million parameters')
|
249 |
with gr.Row():
|
250 |
+
voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice', info='Starred voices are more stable')
|
251 |
use_gpu = gr.Dropdown(
|
252 |
USE_GPU_CHOICES,
|
253 |
value='auto' if CUDA_AVAILABLE else False,
|
|
|
415 |
text = gr.Textbox(label='Input Text', info='Generate speech in batches of 100 text segments and automatically join them together')
|
416 |
file_input.upload(fn=extract_text, inputs=[file_input], outputs=[text])
|
417 |
with gr.Row():
|
418 |
+
voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice', info='Starred voices are more stable')
|
419 |
use_gpu = gr.Dropdown(
|
420 |
[('ZeroGPU 🚀', True), ('CPU 🐌', False)],
|
421 |
value=CUDA_AVAILABLE,
|
|
|
449 |
#### Will this be open sourced?
|
450 |
There currently isn't a release date scheduled for the weights. The inference code in this space is MIT licensed. The architecture was already published by Li et al, with MIT licensed code and pretrained weights.<sup>[2]</sup>
|
451 |
|
452 |
+
#### What is the difference between stable and unstable voices?
|
453 |
An unstable voice is more likely to stumble or produce unnatural artifacts, especially on short or strange texts.
|
454 |
|
455 |
#### How can CPU be faster than ZeroGPU?
|