zyingt committed on
Commit
b07c30e
·
verified ·
1 Parent(s): 8f90878

support timbre confusion

Browse files
Files changed (1) hide show
  1. app.py +118 -28
app.py CHANGED
@@ -6,13 +6,7 @@
6
  import subprocess
7
 
8
  command_to_run = "cd ./modules/monotonic_align;mkdir -p monotonic_align;python setup.py build_ext --inplace;cd /home/user/app"
9
-
10
- try:
11
- result = subprocess.check_output(command_to_run, shell=True, text=True)
12
- print("Command output:")
13
- print(result)
14
- except subprocess.CalledProcessError as e:
15
- print(f"Command failed with return code {e.returncode}")
16
 
17
  import gradio as gr
18
  import os
@@ -28,22 +22,25 @@ SUPPORTED_SPEAKERS = {
28
  "Helen Taylor":"hifitts_9136",
29
  "Sylviamb":"hifitts_11614",
30
  "Celine Major":"hifitts_11697",
31
- "LikeManyWaters":"hifitts_12787"
32
  }
33
 
34
 
35
  def tts_inference(
36
  input_text,
37
- target_speaker
 
38
  ):
39
  ### Target Speaker ###
40
  target_speaker = SUPPORTED_SPEAKERS[target_speaker]
41
 
42
  args_list = ["--config", "./egs/tts/vits_hifitts/exp_config.json"]
43
  args_list += ["--checkpoint_path", "./expdir/checkpoint/latest-checkpoint"]
44
- args_list += ["--speaker_name", target_speaker]
 
45
  args_list += ["--text", input_text]
46
  args_list += ["--mode","single"]
 
47
  args_list += ["--output_dir", "result"]
48
  args_list += ["--log_level", "debug"]
49
 
@@ -56,17 +53,49 @@ def tts_inference(
56
  )
57
  return result_file
58
 
59
- gr.Markdown(
60
- """
61
- # Amphion Text to Speech (TTS)
62
- [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/usercenter/Amphion)
63
- This demo provides an Amphion TTS pretrained model (VITS) for you to play.
64
- """
65
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
- demo_inputs = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  gr.Textbox(
69
- label="Input text",
70
  type="text",
71
  placeholder="Type something here.."
72
  ),
@@ -74,19 +103,80 @@ demo_inputs = [
74
  choices=list(SUPPORTED_SPEAKERS.keys()),
75
  label="Target Speaker",
76
  value="Cori Samuel"
 
 
 
 
 
 
 
 
77
  )
78
  ]
79
 
80
- demo_output = gr.Audio(label="")
81
 
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
- demo = gr.Interface(
85
- fn=tts_inference,
86
- inputs=demo_inputs,
87
- outputs=demo_output,
88
- title="Amphion Text-to-Speech",
89
- )
90
 
91
- if __name__ == "__main__":
92
- demo.launch(share=True)
 
6
  import subprocess
7
 
8
  command_to_run = "cd ./modules/monotonic_align;mkdir -p monotonic_align;python setup.py build_ext --inplace;cd /home/user/app"
9
+ subprocess.check_output(command_to_run, shell=True, text=True)
 
 
 
 
 
 
10
 
11
  import gradio as gr
12
  import os
 
22
  "Helen Taylor":"hifitts_9136",
23
  "Sylviamb":"hifitts_11614",
24
  "Celine Major":"hifitts_11697",
25
+ "LikeManyWaters":"hifitts_12787"
26
  }
27
 
28
 
29
  def tts_inference(
30
  input_text,
31
+ target_speaker,
32
+ duration
33
  ):
34
  ### Target Speaker ###
35
  target_speaker = SUPPORTED_SPEAKERS[target_speaker]
36
 
37
  args_list = ["--config", "./egs/tts/vits_hifitts/exp_config.json"]
38
  args_list += ["--checkpoint_path", "./expdir/checkpoint/latest-checkpoint"]
39
+ args_list += ["--speaker_name_1", target_speaker]
40
+ args_list += ["--speaker_name_2", None]
41
  args_list += ["--text", input_text]
42
  args_list += ["--mode","single"]
43
+ args_list += ["--duration_control",str(float(duration))]
44
  args_list += ["--output_dir", "result"]
45
  args_list += ["--log_level", "debug"]
46
 
 
53
  )
54
  return result_file
55
 
56
+ def tc_inference(
57
+ input_text,
58
+ target_speaker_1,
59
+ target_speaker_2,
60
+ confusion_degree,
61
+ duration
62
+ ):
63
+ ### Target Speaker ###
64
+ target_speaker_1 = SUPPORTED_SPEAKERS[target_speaker_1]
65
+ if target_speaker_2 is not None:
66
+ target_speaker_2 = SUPPORTED_SPEAKERS[target_speaker_2]
67
+
68
+ args_list = ["--config", "./egs/tts/vits_hifitts/exp_config.json"]
69
+ args_list += ["--checkpoint_path", "./expdir/checkpoint/latest-checkpoint"]
70
+ args_list += ["--speaker_name_1", target_speaker_1]
71
+ args_list += ["--speaker_name_2", target_speaker_2]
72
+ args_list += ["--alpha", str(float(confusion_degree))]
73
+ args_list += ["--text", input_text]
74
+ args_list += ["--mode","single"]
75
+ args_list += ["--duration_control",str(float(duration))]
76
+ args_list += ["--output_dir", "result"]
77
+ args_list += ["--log_level", "debug"]
78
 
79
+ os.environ["WORK_DIR"] = "./"
80
+ inference.main(args_list)
81
+
82
+ ### Display ###
83
+ source_speaker_1 = os.path.join(
84
+ "result/single/s1.wav"
85
+ )
86
+ source_speaker_2 = os.path.join(
87
+ "result/single/s2.wav"
88
+ )
89
+ result_file = os.path.join(
90
+ "result/single/test_pred.wav"
91
+ )
92
+
93
+ return source_speaker_1, source_speaker_2, result_file
94
+
95
+ # Section 1: TTS
96
+ tts_demo_inputs = [
97
  gr.Textbox(
98
+ label="Input Text",
99
  type="text",
100
  placeholder="Type something here.."
101
  ),
 
103
  choices=list(SUPPORTED_SPEAKERS.keys()),
104
  label="Target Speaker",
105
  value="Cori Samuel"
106
+ ),
107
+ gr.Slider(
108
+ 1,
109
+ 5,
110
+ value=1,
111
+ step=0.25,
112
+ label="Speaking Rate",
113
+ info="As the step number increases, the speaking rate will be slower.",
114
  )
115
  ]
116
 
117
+ tts_demo_output = gr.Audio(label="Generated Speech")
118
 
119
 
120
+ # Section 2: Timbre confusion
121
+ tc_demo_inputs = [
122
+ gr.Textbox(
123
+ label="Input Text",
124
+ type="text",
125
+ placeholder="Type something here.."
126
+ ),
127
+ gr.Radio(
128
+ choices=list(SUPPORTED_SPEAKERS.keys()),
129
+ label="Target Speaker 1",
130
+ value="Cori Samuel"
131
+ ),
132
+ gr.Radio(
133
+ choices=list(SUPPORTED_SPEAKERS.keys()),
134
+ label="Target Speaker 2",
135
+ value="Phil Benson"
136
+ ),
137
+ gr.Slider(
138
+ 0,
139
+ 1,
140
+ value=0.5,
141
+ step=0.1,
142
+ label="Confusion Degree",
143
+ info="As the step number increases, the generated voice will be more similar to speaker 2.",
144
+ ),
145
+ gr.Slider(
146
+ 1,
147
+ 5,
148
+ value=1,
149
+ step=0.25,
150
+ label="Speaking Rate",
151
+ info="As the step number increases, the speaking rate will be slower.",
152
+ )
153
+ ]
154
+
155
+ tc_demo_outputs = [
156
+ gr.Audio(label="Target Speaker 1"),
157
+ gr.Audio(label="Target Speaker 2"),
158
+ gr.Audio(label="Interpolated Speech")
159
+ ]
160
+
161
+
162
+
163
+ with gr.Blocks() as demo:
164
+ gr.Interface(
165
+ fn=tts_inference,
166
+ inputs=tts_demo_inputs,
167
+ outputs=tts_demo_output,
168
+ title="Amphion Text-to-Speech",
169
+ )
170
+
171
+ gr.Interface(
172
+ fn=tc_inference,
173
+ inputs=tc_demo_inputs,
174
+ outputs=tc_demo_outputs,
175
+ title="Timbre Confusion",
176
+ )
177
 
178
+ demo.queue()
179
+ demo.launch()
 
 
 
 
180
 
181
+ # if __name__ == "__main__":
182
+ # demo.launch(share=True)