hexgrad committed on
Commit
8fdc8fc
·
verified ·
1 Parent(s): bbacd9b

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -12
app.py CHANGED
@@ -142,17 +142,31 @@ VOCAB = get_vocab()
142
  def tokenize(ps):
143
  return [i for i in map(VOCAB.get, ps) if i is not None]
144
 
145
- # ⭐ voices are stable, 🧪 voices are unstable
146
  CHOICES = {
147
  '🇺🇸 🚺 American Female ⭐': 'af',
148
- '🇺🇸 🚺 Bella': 'af_bella',
149
- '🇺🇸 🚺 Nicole': 'af_nicole',
150
- '🇺🇸 🚺 Sarah': 'af_sarah',
151
- '🇺🇸 🚺 Sky 🧪': 'af_sky',
152
- '🇺🇸 🚹 Adam 🧪': 'am_adam',
153
- '🇺🇸 🚹 Michael': 'am_michael',
154
- '🇬🇧 🚹 Lewis 🧪': 'bm_lewis',
155
- '🇯🇵 🚺 Japanese Female': 'jf_0',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  }
157
  VOICES = {device: {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt'), weights_only=True).to(device) for k in CHOICES.values()} for device in models}
158
 
@@ -233,7 +247,7 @@ with gr.Blocks() as basic_tts:
233
  with gr.Column():
234
  text = gr.Textbox(label='Input Text', info='Generate speech for one segment of text using Kokoro, a TTS model with 80 million parameters')
235
  with gr.Row():
236
- voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice', info='⭐ voices are stable, 🧪 voices are unstable')
237
  use_gpu = gr.Dropdown(
238
  USE_GPU_CHOICES,
239
  value='auto' if CUDA_AVAILABLE else False,
@@ -401,7 +415,7 @@ with gr.Blocks() as lf_tts:
401
  text = gr.Textbox(label='Input Text', info='Generate speech in batches of 100 text segments and automatically join them together')
402
  file_input.upload(fn=extract_text, inputs=[file_input], outputs=[text])
403
  with gr.Row():
404
- voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice', info='⭐ voices are stable, 🧪 voices are unstable')
405
  use_gpu = gr.Dropdown(
406
  [('ZeroGPU 🚀', True), ('CPU 🐌', False)],
407
  value=CUDA_AVAILABLE,
@@ -435,7 +449,7 @@ Kokoro is a frontier TTS model for its size. It has 80 million parameters,<sup>[
435
  #### Will this be open sourced?
436
  There currently isn't a release date scheduled for the weights. The inference code in this space is MIT licensed. The architecture was already published by Li et al, with MIT licensed code and pretrained weights.<sup>[2]</sup>
437
 
438
- #### What is an unstable voice?
439
  An unstable voice is more likely to stumble or produce unnatural artifacts, especially on short or strange texts.
440
 
441
  #### How can CPU be faster than ZeroGPU?
 
142
  def tokenize(ps):
143
  return [i for i in map(VOCAB.get, ps) if i is not None]
144
 
145
+ # Starred voices are more stable
146
  CHOICES = {
147
  '🇺🇸 🚺 American Female ⭐': 'af',
148
+ '🇺🇸 🚺 Alloy': 'af_alloy',
149
+ '🇺🇸 🚺 Bella ⭐': 'af_bella',
150
+ '🇺🇸 🚺 Jessica': 'af_jessica',
151
+ '🇺🇸 🚺 Nicole ': 'af_nicole',
152
+ '🇺🇸 🚺 Nova': 'af_nova',
153
+ '🇺🇸 🚺 River': 'af_river',
154
+ '🇺🇸 🚺 Sarah ': 'af_sarah',
155
+ '🇺🇸 🚺 Shimmer': 'af_shimmer',
156
+ '🇺🇸 🚺 Sky': 'af_sky',
157
+ '🇺🇸 🚹 Adam': 'am_adam',
158
+ '🇺🇸 🚹 Echo': 'am_echo',
159
+ '🇺🇸 🚹 Eric': 'am_eric',
160
+ '🇺🇸 🚹 Liam': 'am_liam',
161
+ '🇺🇸 🚹 Michael ⭐': 'am_michael',
162
+ '🇺🇸 🚹 Onyx': 'am_onyx',
163
+ '🇬🇧 🚺 Alice': 'bf_alice',
164
+ '🇬🇧 🚺 Lily': 'bf_lily',
165
+ '🇬🇧 🚹 Daniel': 'bm_daniel',
166
+ '🇬🇧 🚹 Fable': 'bm_fable',
167
+ '🇬🇧 🚹 George': 'bm_george',
168
+ '🇬🇧 🚹 Lewis': 'bm_lewis',
169
+ '🇯🇵 🚺 Japanese Female ⭐': 'jf_0',
170
  }
171
  VOICES = {device: {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt'), weights_only=True).to(device) for k in CHOICES.values()} for device in models}
172
 
 
247
  with gr.Column():
248
  text = gr.Textbox(label='Input Text', info='Generate speech for one segment of text using Kokoro, a TTS model with 80 million parameters')
249
  with gr.Row():
250
+ voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice', info='Starred voices are more stable')
251
  use_gpu = gr.Dropdown(
252
  USE_GPU_CHOICES,
253
  value='auto' if CUDA_AVAILABLE else False,
 
415
  text = gr.Textbox(label='Input Text', info='Generate speech in batches of 100 text segments and automatically join them together')
416
  file_input.upload(fn=extract_text, inputs=[file_input], outputs=[text])
417
  with gr.Row():
418
+ voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice', info='Starred voices are more stable')
419
  use_gpu = gr.Dropdown(
420
  [('ZeroGPU 🚀', True), ('CPU 🐌', False)],
421
  value=CUDA_AVAILABLE,
 
449
  #### Will this be open sourced?
450
  There currently isn't a release date scheduled for the weights. The inference code in this space is MIT licensed. The architecture was already published by Li et al, with MIT licensed code and pretrained weights.<sup>[2]</sup>
451
 
452
+ #### What is the difference between stable and unstable voices?
453
  An unstable voice is more likely to stumble or produce unnatural artifacts, especially on short or strange texts.
454
 
455
  #### How can CPU be faster than ZeroGPU?