Flux9665 committed on
Commit
43f2732
·
verified ·
1 Parent(s): 0083a15

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -16
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
  import torch.cuda
 
3
 
4
  from InferenceInterfaces.ControllableInterface import ControllableInterface
5
  from Utility.utils import float2pcm
@@ -8,13 +9,31 @@ from Utility.utils import load_json_from_path
8
 
9
  class TTSWebUI:
10
 
11
- def __init__(self, gpu_id="cpu", title="ToucanTTS in 7000 Languages", article="Check out the toolkit at https://github.com/DigitalPhonetics/IMS-Toucan", available_artificial_voices=1000, path_to_iso_list="Preprocessing/multilinguality/iso_to_fullname.json"):
 
 
 
 
 
 
 
 
 
12
  iso_to_name = load_json_from_path(path_to_iso_list)
13
  text_selection = [f"{iso_to_name[iso_code]} ({iso_code})" for iso_code in iso_to_name]
14
  # accent_selection = [f"{iso_to_name[iso_code]} Accent ({iso_code})" for iso_code in iso_to_name]
 
 
 
 
 
 
15
 
16
  self.controllable_ui = ControllableInterface(gpu_id=gpu_id,
17
- available_artificial_voices=available_artificial_voices)
 
 
 
18
  self.iface = gr.Interface(fn=self.read,
19
  inputs=[gr.Textbox(lines=2,
20
  placeholder="write what you want the synthesis to read here...",
@@ -24,16 +43,14 @@ class TTSWebUI:
24
  type="value",
25
  value='English (eng)',
26
  label="Select the Language of the Text (type on your keyboard to find it quickly)"),
27
- gr.Audio(type="filepath", show_label=True, container=True, label="Voice to Clone (if left empty, will use an artificial voice instead)"),
28
- gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
29
- value=279,
30
- label="Random Seed for the artificial Voice"),
31
- gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.1, label="Prosody Creativity"),
32
- gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
33
  # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
34
  # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
35
- gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
36
- gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
37
  ],
38
  outputs=[gr.Audio(type="numpy", label="Speech"),
39
  gr.Image(label="Visualization")],
@@ -46,14 +63,14 @@ class TTSWebUI:
46
  def read(self,
47
  prompt,
48
  language,
49
- reference_audio,
50
- voice_seed,
51
  prosody_creativity,
52
  duration_scaling_factor,
 
 
 
53
  # pitch_variance_scale,
54
  # energy_variance_scale,
55
- emb1,
56
- emb2
57
  ):
58
  sr, wav, fig = self.controllable_ui.read(prompt,
59
  reference_audio,
@@ -66,12 +83,12 @@ class TTSWebUI:
66
  1.0,
67
  1.0,
68
  emb1,
69
- emb2,
70
  0.,
71
  0.,
72
  0.,
73
  0.,
74
- -18.)
 
75
  return (sr, float2pcm(wav)), fig
76
 
77
 
 
1
  import gradio as gr
2
  import torch.cuda
3
+ from huggingface_hub import hf_hub_download
4
 
5
  from InferenceInterfaces.ControllableInterface import ControllableInterface
6
  from Utility.utils import float2pcm
 
9
 
10
  class TTSWebUI:
11
 
12
+ def __init__(self,
13
+ gpu_id="cpu",
14
+ title="Controllable Text-to-Speech for over 7000 Languages",
15
+ article="",
16
+ tts_model_path=None,
17
+ vocoder_model_path=None,
18
+ embedding_gan_path=None,
19
+ available_artificial_voices=50 # be careful with this, if you want too many, it might lead to an endless loop
20
+ ):
21
+ path_to_iso_list = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="iso_to_fullname.json")
22
  iso_to_name = load_json_from_path(path_to_iso_list)
23
  text_selection = [f"{iso_to_name[iso_code]} ({iso_code})" for iso_code in iso_to_name]
24
  # accent_selection = [f"{iso_to_name[iso_code]} Accent ({iso_code})" for iso_code in iso_to_name]
25
+ if tts_model_path is None:
26
+ tts_model_path = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="ToucanTTS.pt")
27
+ if vocoder_model_path is None:
28
+ vocoder_model_path = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="Vocoder.pt")
29
+ if embedding_gan_path is None:
30
+ embedding_gan_path = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="embedding_gan.pt")
31
 
32
  self.controllable_ui = ControllableInterface(gpu_id=gpu_id,
33
+ available_artificial_voices=available_artificial_voices,
34
+ tts_model_path=tts_model_path,
35
+ vocoder_model_path=vocoder_model_path,
36
+ embedding_gan_path=embedding_gan_path)
37
  self.iface = gr.Interface(fn=self.read,
38
  inputs=[gr.Textbox(lines=2,
39
  placeholder="write what you want the synthesis to read here...",
 
43
  type="value",
44
  value='English (eng)',
45
  label="Select the Language of the Text (type on your keyboard to find it quickly)"),
46
+ gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.5, label="Prosody Creativity"),
47
+ gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Faster - Slower"),
48
+ gr.Slider(minimum=0, maximum=available_artificial_voices, step=1, value=27, label="Random Seed for the artificial Voice"),
49
+ gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Gender of artificial Voice"),
50
+ gr.Audio(type="filepath", show_label=True, container=True, label="[OPTIONAL] Voice to Clone (if left empty, will use an artificial voice instead)"),
 
51
  # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
52
  # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
53
+ # gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
 
54
  ],
55
  outputs=[gr.Audio(type="numpy", label="Speech"),
56
  gr.Image(label="Visualization")],
 
63
  def read(self,
64
  prompt,
65
  language,
 
 
66
  prosody_creativity,
67
  duration_scaling_factor,
68
+ voice_seed,
69
+ emb1,
70
+ reference_audio,
71
  # pitch_variance_scale,
72
  # energy_variance_scale,
73
+ # emb2
 
74
  ):
75
  sr, wav, fig = self.controllable_ui.read(prompt,
76
  reference_audio,
 
83
  1.0,
84
  1.0,
85
  emb1,
 
86
  0.,
87
  0.,
88
  0.,
89
  0.,
90
+ 0.,
91
+ -12.)
92
  return (sr, float2pcm(wav)), fig
93
 
94