Tonic committed on
Commit
f4158c0
·
verified ·
1 Parent(s): ac9a77d

fixes 500 error for some users

Browse files

![image.png](https://cdn-uploads.huggingface.co/production/uploads/62a3bb1cd0d8c2c2169f0b88/Vtx9Pq0tfbEaNam9xU5ra.png)

Files changed (1) hide show
  1. app.py +447 -446
app.py CHANGED
@@ -1,446 +1,447 @@
1
- import gradio as gr
2
- import base64
3
- import tempfile
4
- import json
5
- import os
6
- from os.path import abspath
7
- import zipfile
8
- import random
9
- import xtts
10
- import re
11
-
12
- DO_CHECK = os.getenv('DO_CHECK', '1')
13
- OUTPUT = "./demo_outputs"
14
- cloned_speakers = {}
15
-
16
- print("Preparing file structure...")
17
- if not os.path.exists(OUTPUT):
18
- os.mkdir(OUTPUT)
19
- os.mkdir(os.path.join(OUTPUT, "cloned_speakers"))
20
- os.mkdir(os.path.join(OUTPUT, "generated_audios"))
21
- elif os.path.exists(os.path.join(OUTPUT, "cloned_speakers")):
22
- print("Loading existing cloned speakers...")
23
- for file in os.listdir(os.path.join(OUTPUT, "cloned_speakers")):
24
- if file.endswith(".json"):
25
- with open(os.path.join(OUTPUT, "cloned_speakers", file), "r") as fp:
26
- cloned_speakers[file[:-5]] = json.load(fp)
27
- print("Available cloned speakers:", ", ".join(cloned_speakers.keys()))
28
-
29
- AUDIOS_DIR = os.path.join("demo_outputs", "generated_audios");
30
- ZIP_DIR = os.path.join("zip_outputs");
31
-
32
- print("Checking zip at", ZIP_DIR)
33
- if not os.path.exists(ZIP_DIR):
34
- os.mkdir(ZIP_DIR)
35
-
36
-
37
- try:
38
- print("Getting metadata from server ...")
39
- LANUGAGES = xtts.get_languages()
40
- print("Available languages:", ", ".join(LANUGAGES))
41
- STUDIO_SPEAKERS = xtts.get_speakers()
42
- print("Available studio speakers:", ", ".join(STUDIO_SPEAKERS.keys()))
43
- except:
44
- raise Exception("Please make sure the server is running first.")
45
-
46
-
47
- def ExtractVars(input_string):
48
- # Split the string into lines
49
- lines = input_string.split('\n')
50
-
51
- # Initialize an empty dictionary to store key-value pairs
52
- result_dict = {
53
- 'prefix': None,
54
- 'name': '',
55
- 'speaker': None,
56
- 'num': None,
57
- }
58
-
59
- # List to hold lines that do not start with '!'
60
- filtered_lines = []
61
-
62
- # Iterate through each line
63
- for line in lines:
64
- # Check if the line starts with '!'
65
- if line.strip().startswith('!'):
66
-
67
- # Try to split the line into key and value parts
68
- try:
69
- # Split on '=' and strip whitespace from key and value
70
- key, value = line.strip()[1:].split('=')
71
- key = key.strip()
72
- value = value.strip()
73
- # Add to dictionary
74
- result_dict[key] = value
75
- except ValueError:
76
- # Handle the case where there is no '=' or improper format
77
- continue
78
- elif len(line.strip()) > 0:
79
- # Add the line to filtered_lines if it doesn't start with '!'
80
- filtered_lines.append(line)
81
-
82
- # Join the filtered lines back into a single string
83
- filtered_string = '\n'.join(filtered_lines)
84
- return result_dict, filtered_string
85
-
86
-
87
- def ParsePronucs(PronuncStr):
88
- # Split the string into lines
89
- lines = PronuncStr.split('\n')
90
-
91
- # Initialize an empty dictionary to store key-value pairs
92
- PronuncWords = []
93
-
94
- # Iterate through each line
95
- for line in lines:
96
- if len(line.strip()) > 0:
97
- word,*text = line.strip().split('=',1)
98
- word = word.strip()
99
- text,*opts = text[0].split("|",1);
100
- text = text.strip();
101
-
102
- if len(opts) > 0:
103
- opts = opts[0].strip().split(",");
104
- else:
105
- opts = [];
106
-
107
-
108
- PronuncWords.append({'word':word, 'text':text, 'opts':opts})
109
-
110
- return PronuncWords
111
-
112
-
113
- def FindSpeakerByName(name, speakerType):
114
-
115
- srcItems = STUDIO_SPEAKERS if speakerType == "Studio" else cloned_speakers;
116
-
117
- for key, value in srcItems.items():
118
-
119
- if key == name:
120
- return key,value
121
-
122
- if key.split(" ")[0] == name:
123
- return key,value;
124
-
125
-
126
- def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names):
127
- embeddings = xtts.predict_speaker(upload_file)
128
- with open(os.path.join(OUTPUT, "cloned_speakers", clone_speaker_name + ".json"), "w") as fp:
129
- json.dump(embeddings, fp)
130
- cloned_speakers[clone_speaker_name] = embeddings
131
- cloned_speaker_names.append(clone_speaker_name)
132
- return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown(choices=cloned_speaker_names)
133
-
134
- def tts(text, pronunc, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
135
- ,speed,top_p,top_k, AllFileList,progress=gr.Progress()
136
- ):
137
- embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom]
138
-
139
-
140
- # break at line!
141
- lines = text.split("---");
142
- totalLines = len(lines);
143
- print("Total parts:", len(lines))
144
-
145
- audioNum = 0;
146
-
147
- DefaultPrefix = next(tempfile._get_candidate_names());
148
-
149
- CurrentPrefix = DefaultPrefix
150
-
151
-
152
- # break pronuc
153
- Pronuncs = ParsePronucs(pronunc)
154
-
155
-
156
-
157
-
158
- AudioList = [];
159
- for line in progress.tqdm(lines, desc="Gerando fala..."):
160
- audioNum += 1;
161
-
162
- textVars,cleanLine = ExtractVars(line)
163
-
164
- if textVars['prefix']:
165
- CurrentPrefix = textVars['prefix']
166
-
167
- audioName = textVars['name'];
168
-
169
- if audioName:
170
- audioName = '_'+audioName
171
-
172
- num = textVars['num'];
173
-
174
- if not num:
175
- num = audioNum;
176
-
177
- path = CurrentPrefix +"_n_" + str(num)+audioName+".wav"
178
-
179
- print("Generating audio for line", num, 'sequence', audioNum);
180
-
181
- speaker = textVars['speaker'];
182
-
183
- if not speaker:
184
- speaker = speaker_name_studio if speaker_type == 'Studio' else speaker_name_custom
185
-
186
- speakerName,embeddings = FindSpeakerByName(speaker, speaker_type)
187
-
188
- if not speakerName:
189
- raise ValueError("InvalidSpeaker: "+speakerName)
190
-
191
-
192
-
193
- FixedText = cleanLine;
194
-
195
- for pronunc in Pronuncs:
196
- word = pronunc['word']
197
- text = pronunc['text']
198
- opts = pronunc['opts'];
199
-
200
- flg = re.IGNORECASE
201
-
202
- if 'cs' in opts:
203
- flg = 0;
204
-
205
- FixedText = re.sub(f'\\b{word}\\b', text, FixedText, flags=flg)
206
-
207
- ipts = xtts.TTSInputs(
208
- speaker_embedding=embeddings["speaker_embedding"],
209
- gpt_cond_latent=embeddings["gpt_cond_latent"],
210
- text=FixedText,
211
- language=lang,
212
- temperature=temperature,
213
- speed=speed,
214
- top_k=top_k,
215
- top_p=top_p
216
- )
217
-
218
- generated_audio = xtts.predict_speech(ipts)
219
-
220
- print("Audio generated.. Saving to", path);
221
- generated_audio_path = os.path.join(AUDIOS_DIR, path)
222
- with open(generated_audio_path, "wb") as fp:
223
- fp.write(base64.b64decode(generated_audio))
224
- AudioList.append(fp.name);
225
-
226
- AllFileList.clear();
227
- AllFileList.extend(AudioList);
228
-
229
- return gr.Dropdown(
230
- label="Generated Audios",
231
- choices=list(AudioList),
232
- value=AudioList[0]
233
- )
234
-
235
- def get_file_content(f):
236
- if len(f) > 0:
237
- return f[0];
238
-
239
- return None;
240
-
241
-
242
- def UpdateFileList(DirListState):
243
- DirListState.clear();
244
- DirListState.extend( os.listdir(AUDIOS_DIR) )
245
-
246
- def audio_list_update(d):
247
- fullPath = abspath(d)
248
- return fullPath
249
-
250
- def ZipAndDownload(files):
251
- allFiles = files
252
-
253
- DefaultPrefix = next(tempfile._get_candidate_names());
254
-
255
- zipFile = abspath( os.path.join(ZIP_DIR, DefaultPrefix + ".zip") );
256
-
257
-
258
- with zipfile.ZipFile(zipFile, 'w') as zipMe:
259
- for file in allFiles:
260
- print("Zipping", file);
261
- zipMe.write(abspath(file), os.path.basename(file), compress_type=zipfile.ZIP_DEFLATED)
262
-
263
- print("Pronto", zipFile);
264
-
265
- return '<a href="/file='+zipFile+'">If donwload dont starts, click here</a>';
266
-
267
-
268
- js = """
269
- function DetectDownloadLink(){
270
- console.log('Configuring AutoDonwloadObservr...');
271
- let hiddenLink = document.getElementById("DonwloadLink");
272
- let onChange= function(mutations){
273
-
274
- for (const mutation of mutations) {
275
- if (mutation.type !== 'childList')
276
- continue;
277
-
278
- for (const addedNode of mutation.addedNodes) {
279
- if (addedNode.nodeName === 'A') {
280
- location.href = addedNode.href;
281
- }
282
- }
283
-
284
- }
285
- }
286
-
287
- let config = { attributes: true, childList: true, subtree: true, attributeFilter: ["href"] }
288
- let obs = new MutationObserver(onChange);
289
- obs.observe(hiddenLink, config);
290
- }
291
- """
292
-
293
- with gr.Blocks(js=js) as demo:
294
- defaultSpeaker = "Dionisio Schuyler"
295
- cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
296
- AllFileList = gr.State(list([]))
297
-
298
- gr.Markdown("By using any functionality of this space, you agree to the terms of this license: https://coqui.ai/cpml")
299
-
300
- with gr.Tab("TTS"):
301
- with gr.Column() as row4:
302
- with gr.Row() as col4:
303
- speaker_type = gr.Dropdown(label="Speaker type", choices=["Studio", "Cloned"], value="Studio")
304
- speaker_name_studio = gr.Dropdown(
305
- label="Studio speaker",
306
- choices=STUDIO_SPEAKERS.keys(),
307
- value=defaultSpeaker if defaultSpeaker in STUDIO_SPEAKERS.keys() else None,
308
- )
309
- speaker_name_custom = gr.Dropdown(
310
- label="Cloned speaker",
311
- choices=cloned_speaker_names.value,
312
- value=cloned_speaker_names.value[0] if len(cloned_speaker_names.value) != 0 else None,
313
- )
314
- with gr.Accordion("Advanced options", open=False):
315
- with gr.Row() as rowAdvanced:
316
- temperature = gr.Slider(0.00, 1.00, 0.5, step=0.05, label="Temperature", info="Choose between 0 and 1")
317
- top_p = gr.Slider(0.00, 1.00, 0.8, step=0.05, label="TOP P", info="Choose between 0 and 1")
318
- top_k = gr.Number(label="TOP K",value=50)
319
- speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
320
- with gr.Column() as col2:
321
- with gr.Row():
322
- text = gr.Textbox(label="Text", info="Generate multiple audios separating lines with ---",lines=4, value="Customizado por IA Talking, o maior blog de Inteligência Artificial do Brasil!")
323
- pronunc = gr.Textbox(label="Pronunciation Fix", info="Fix words pronuncation using WORD = SPEAK",lines=4)
324
- with gr.Row():
325
- lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
326
- tts_button = gr.Button(value="TTS")
327
- with gr.Column() as col3:
328
- # FileList = gr.FileExplorer(
329
- # glob="*.wav",
330
- # # value=["themes/utils"],
331
- # ignore_glob="**/__init__.py",
332
- # root_dir=AUDIOS_DIR,
333
- # interactive = True,
334
- # value=DirectoryList.value
335
- # )
336
-
337
- AudioList = gr.Dropdown(
338
- label="Generated Audios",
339
- choices=[]
340
- ,interactive=True
341
- )
342
-
343
- generated_audio = gr.Audio(label="Audio Play", autoplay=True)
344
- AudioList.change(fn=audio_list_update, inputs=[AudioList], outputs=[generated_audio])
345
-
346
- dummyHtml = gr.HTML(elem_id = "DonwloadLink", render = False);
347
- downloadAll = gr.DownloadButton("Download All Files")
348
- downloadAll.click(ZipAndDownload, inputs=[AllFileList], outputs=[dummyHtml]);
349
- dummyHtml.render();
350
-
351
-
352
- with gr.Tab("Clone a new speaker"):
353
- with gr.Column() as col1:
354
- upload_file = gr.Audio(label="Upload reference audio", type="filepath")
355
- clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
356
- clone_button = gr.Button(value="Clone speaker")
357
-
358
-
359
- with gr.Tab("Help"):
360
- gr.Markdown("""
361
- Welcome to the XTTS WebUI version customized by the IA Talking blog (https://iatalk.ing).
362
-
363
- The main goal of this space is to share more scenarios on how XTTS can be used, as well as serve as a study resource to learn more about the TTS process and AI.
364
-
365
- In this version, we have some customizations that are quite useful.
366
-
367
- # Multiple audios
368
- You can generate multiple audios at once by separating the text with three dashes. For example:
369
-
370
- ```
371
- Text 1
372
- ---
373
- Text 2, line 1
374
- Text 2, line 2
375
- ```
376
-
377
- In the above example, 2 audio files will be generated! This is very useful when you want to generate a lot of audio but don't want to generate it all at once due to the context lost in XTTS.
378
- You can also specify variables that modify certain aspects.
379
-
380
- For example, `!speaker = Dionisio` forces the speaker to be Dionisio only for that specific audio.
381
-
382
- List of variables:
383
- - `speaker` = name of the speaker
384
- - `num` = file number (by default, it's the sequential number)
385
- - `prefix` = file name prefix
386
-
387
- # Pronunciation adjustment
388
-
389
- If you have a text that you cannot or do not want to change the content of, you can use the Pronunciation field to map words with different pronunciations.
390
-
391
- Simply separate them by each line. Example:
392
-
393
- ```
394
- API = A,P,I
395
- SomeFunctionCode = Function Code
396
- ```
397
-
398
- This is useful for mapping foreign words, abbreviations, acronyms, code, etc.
399
- """)
400
-
401
- clone_button.click(
402
- fn=clone_speaker,
403
- inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
404
- outputs=[upload_file, clone_speaker_name, cloned_speaker_names, speaker_name_custom],
405
- )
406
-
407
- tts_button.click(
408
- fn=tts,
409
- inputs=[text, pronunc,speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
410
- ,speed,top_p,top_k,AllFileList
411
- ],
412
- outputs=[AudioList],
413
- )
414
-
415
- if __name__ == "__main__" and DO_CHECK == "1":
416
- print("Warming up server... Checking server healthy...")
417
-
418
- speakerName, embs = random.choice(list(STUDIO_SPEAKERS.items()));
419
-
420
- print("Testing with", speakerName);
421
-
422
- ipts = xtts.TTSInputs(
423
- speaker_embedding=embs["speaker_embedding"],
424
- gpt_cond_latent=embs["gpt_cond_latent"],
425
- text="This is a warmup request.",
426
- language="en",
427
- temperature=0.5,
428
- speed=1.0,
429
- top_k=50,
430
- top_p=0.8
431
- )
432
-
433
- resp = xtts.predict_speech(ipts)
434
-
435
- print(" TEST OK")
436
-
437
-
438
- if __name__ == "__main__":
439
- print("STARTING...")
440
- demo.launch(
441
- share=False,
442
- debug=False,
443
- server_port=7860,
444
- server_name="0.0.0.0",
445
- allowed_paths=[ZIP_DIR]
446
- )
 
 
1
+ import gradio as gr
2
+ import base64
3
+ import tempfile
4
+ import json
5
+ import os
6
+ from os.path import abspath
7
+ import zipfile
8
+ import random
9
+ import xtts
10
+ import re
11
+
12
# Set DO_CHECK to anything other than "1" to skip the warm-up request that
# is otherwise sent when this file is run as a script.
DO_CHECK = os.getenv('DO_CHECK', '1')
OUTPUT = "./demo_outputs"
# Maps a cloned speaker name to the embeddings loaded from / saved to
# OUTPUT/cloned_speakers/<name>.json.
cloned_speakers = {}

AUDIOS_DIR = os.path.join("demo_outputs", "generated_audios")
ZIP_DIR = os.path.join("zip_outputs")

print("Preparing file structure...")
_cloned_speakers_dir = os.path.join(OUTPUT, "cloned_speakers")
# Create each directory independently: a partially created tree (OUTPUT
# present but a sub-directory missing) is repaired instead of failing later
# when the first file is written.
for _directory in (_cloned_speakers_dir, AUDIOS_DIR):
    os.makedirs(_directory, exist_ok=True)

print("Loading existing cloned speakers...")
for _file_name in os.listdir(_cloned_speakers_dir):
    if not _file_name.endswith(".json"):
        continue
    _file_path = os.path.join(_cloned_speakers_dir, _file_name)
    try:
        with open(_file_path, "r", encoding="utf-8") as fp:
            # The speaker name is the file name without the ".json" suffix.
            cloned_speakers[_file_name[:-5]] = json.load(fp)
    except (OSError, json.JSONDecodeError) as err:
        # One unreadable file must not prevent the application from starting.
        print("Skipping unreadable cloned speaker file", _file_path, "-", err)
print("Available cloned speakers:", ", ".join(cloned_speakers.keys()))

print("Checking zip at", ZIP_DIR)
os.makedirs(ZIP_DIR, exist_ok=True)


try:
    print("Getting metadata from server ...")
    LANUGAGES = xtts.get_languages()
    print("Available languages:", ", ".join(LANUGAGES))
    STUDIO_SPEAKERS = xtts.get_speakers()
    print("Available studio speakers:", ", ".join(STUDIO_SPEAKERS.keys()))
except Exception as err:
    # Keep the original failure attached so the real cause stays visible.
    raise Exception("Please make sure the server is running first.") from err
45
+
46
+
47
def ExtractVars(input_string):
    """Separate ``!key = value`` directive lines from the remaining text.

    Lines whose stripped form starts with ``!`` are treated as directives and
    stored in the returned dictionary; blank lines are dropped; every other
    line is kept unchanged.

    Args:
        input_string: The raw text of one part, possibly spanning several lines.

    Returns:
        A ``(variables, text)`` tuple. ``variables`` always contains the keys
        ``prefix``, ``name``, ``speaker`` and ``num`` (with ``None``/``''``
        defaults) plus any other key found in the input; ``text`` is the
        non-directive lines joined with ``\\n``.
    """
    result_dict = {
        'prefix': None,
        'name': '',
        'speaker': None,
        'num': None,
    }
    filtered_lines = []

    for line in input_string.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue

        if not stripped.startswith('!'):
            filtered_lines.append(line)
            continue

        # Split on the FIRST '=' only, so a value may itself contain '='.
        key, separator, value = stripped[1:].partition('=')
        if not separator:
            # A directive without '=' is malformed and is ignored.
            continue
        result_dict[key.strip()] = value.strip()

    return result_dict, '\n'.join(filtered_lines)
85
+
86
+
87
def ParsePronucs(PronuncStr):
    """Parse the pronunciation-fix text into a list of replacement rules.

    Each non-empty line has the form ``WORD = SPOKEN TEXT`` optionally
    followed by ``| opt1,opt2``. Lines without ``=`` or with an empty word
    are ignored instead of raising.

    Args:
        PronuncStr: The raw multi-line text (``None`` is treated as empty).

    Returns:
        A list of dictionaries with the keys ``word``, ``text`` and ``opts``
        (a list of trimmed, non-empty option strings), in input order.
    """
    PronuncWords = []

    for line in (PronuncStr or '').split('\n'):
        word, separator, remainder = line.strip().partition('=')
        word = word.strip()
        if not separator or not word:
            # Nothing to map: blank line, missing '=' or missing word.
            continue

        # Everything after the first '|' is a comma-separated option list.
        text, _, raw_opts = remainder.partition('|')
        opts = [opt.strip() for opt in raw_opts.split(',') if opt.strip()]

        PronuncWords.append({'word': word, 'text': text.strip(), 'opts': opts})

    return PronuncWords
111
+
112
+
113
def FindSpeakerByName(name, speakerType):
    """Look up a speaker by full name or by the first word of its name.

    Args:
        name: Full speaker name, or only its first space-separated word.
        speakerType: ``"Studio"`` searches ``STUDIO_SPEAKERS``; any other
            value searches ``cloned_speakers``.

    Returns:
        A ``(key, value)`` tuple for the matching entry, or ``(None, None)``
        when nothing matches, so callers can always unpack the result.
    """
    srcItems = STUDIO_SPEAKERS if speakerType == "Studio" else cloned_speakers

    # An exact match always wins, regardless of dictionary order.
    if name in srcItems:
        return name, srcItems[name]

    # Otherwise accept the first entry whose first word equals the name.
    for key, value in srcItems.items():
        if key.split(" ")[0] == name:
            return key, value

    return None, None
124
+
125
+
126
def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names):
    """Create a cloned speaker from a reference audio and persist it.

    The result of ``xtts.predict_speaker`` is written as JSON to
    ``OUTPUT/cloned_speakers/<name>.json`` and registered in the
    module-level ``cloned_speakers`` dictionary.

    Args:
        upload_file: Value of the reference-audio component.
        clone_speaker_name: Name under which the speaker is stored.
        cloned_speaker_names: Session list of known cloned speaker names;
            updated in place.

    Returns:
        ``(upload_file, clone_speaker_name, cloned_speaker_names, dropdown)``
        where ``dropdown`` is a ``gr.Dropdown`` listing the updated names.

    Raises:
        gr.Error: If no audio was provided or the name is empty or contains
            a path separator.
    """
    if not upload_file:
        raise gr.Error("Please upload a reference audio first.")

    # The name becomes part of a file path, so it must be a plain file name.
    if not clone_speaker_name or os.path.basename(clone_speaker_name) != clone_speaker_name:
        raise gr.Error("Invalid speaker name.")

    embeddings = xtts.predict_speaker(upload_file)

    target_path = os.path.join(OUTPUT, "cloned_speakers", clone_speaker_name + ".json")
    with open(target_path, "w", encoding="utf-8") as fp:
        json.dump(embeddings, fp)

    cloned_speakers[clone_speaker_name] = embeddings
    # Re-cloning an existing name replaces it; do not list it twice.
    if clone_speaker_name not in cloned_speaker_names:
        cloned_speaker_names.append(clone_speaker_name)

    return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown(choices=cloned_speaker_names)
133
+
134
def tts(text, pronunc, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature,
        speed, top_p, top_k, AllFileList, progress=gr.Progress()):
    """Generate one audio file per ``---``-separated part of *text*.

    Each part may contain ``!key = value`` directive lines (parsed by
    ``ExtractVars``): ``prefix`` replaces the file-name prefix for this and
    all following parts, ``name`` adds a file-name suffix, ``num`` overrides
    the sequence number and ``speaker`` overrides the selected speaker for
    that part only. Pronunciation rules parsed from *pronunc* are applied to
    the text before it is passed to ``xtts.predict_speech``.

    Args:
        text: Full input text; parts are separated by ``---``.
        pronunc: Pronunciation rules, one ``WORD = TEXT`` per line.
        speaker_type: ``"Studio"`` or any other value for cloned speakers.
        speaker_name_studio: Selected studio speaker name.
        speaker_name_custom: Selected cloned speaker name.
        lang: Language passed through to ``xtts.TTSInputs``.
        temperature, speed, top_p, top_k: Passed through to ``xtts.TTSInputs``.
        AllFileList: Session list replaced in place with the generated paths.
        progress: Gradio progress tracker.

    Returns:
        A ``gr.Dropdown`` listing the generated files with the first one
        selected.

    Raises:
        ValueError: If a requested speaker cannot be found.
        gr.Error: If no part contains any text.
    """
    import uuid  # local import: only this function needs it

    # break at line!
    parts = text.split("---")
    print("Total parts:", len(parts))

    # Random default prefix so files of different runs do not collide.
    CurrentPrefix = uuid.uuid4().hex[:8]

    # break pronuc
    Pronuncs = ParsePronucs(pronunc)

    selected_speaker = speaker_name_studio if speaker_type == 'Studio' else speaker_name_custom

    AudioList = []
    audioNum = 0
    for part in progress.tqdm(parts, desc="Gerando fala..."):
        audioNum += 1

        textVars, cleanLine = ExtractVars(part)

        # A prefix directive stays active for all following parts.
        if textVars['prefix']:
            CurrentPrefix = textVars['prefix']

        if not cleanLine.strip():
            # Nothing left to synthesize once directives and blank lines are removed.
            print("Skipping empty part", audioNum)
            continue

        audioName = textVars['name']
        if audioName:
            audioName = '_' + audioName

        num = textVars['num'] or audioNum

        # Directive values are user input: keep them from escaping AUDIOS_DIR.
        path = re.sub(r'[\\/]+', '_', CurrentPrefix + "_n_" + str(num) + audioName + ".wav")

        print("Generating audio for line", num, 'sequence', audioNum)

        speaker = textVars['speaker'] or selected_speaker

        # Tolerate both a (None, None) and a bare None "not found" result.
        found = FindSpeakerByName(speaker, speaker_type)
        speakerName, embeddings = found if found else (None, None)
        if not speakerName:
            raise ValueError("InvalidSpeaker: " + str(speaker))

        FixedText = cleanLine
        for rule in Pronuncs:
            if not rule['word']:
                continue
            # 'cs' makes the rule case-sensitive; the default ignores case.
            flg = 0 if 'cs' in rule['opts'] else re.IGNORECASE
            # Escape the word and use a function replacement so neither side
            # is interpreted as regular-expression syntax.
            FixedText = re.sub(
                r'\b' + re.escape(rule['word']) + r'\b',
                lambda _match: rule['text'],
                FixedText,
                flags=flg,
            )

        ipts = xtts.TTSInputs(
            speaker_embedding=embeddings["speaker_embedding"],
            gpt_cond_latent=embeddings["gpt_cond_latent"],
            text=FixedText,
            language=lang,
            temperature=temperature,
            speed=speed,
            top_k=top_k,
            top_p=top_p
        )

        generated_audio = xtts.predict_speech(ipts)

        print("Audio generated.. Saving to", path)
        generated_audio_path = os.path.join(AUDIOS_DIR, path)
        # The result of predict_speech is base64-decoded before being written.
        with open(generated_audio_path, "wb") as fp:
            fp.write(base64.b64decode(generated_audio))
        AudioList.append(generated_audio_path)

    if not AudioList:
        raise gr.Error("There is no text to generate audio from.")

    # Mutate the session list in place so "Download All Files" sees this run.
    AllFileList.clear()
    AllFileList.extend(AudioList)

    return gr.Dropdown(
        label="Generated Audios",
        choices=list(AudioList),
        value=AudioList[0]
    )
234
+
235
def get_file_content(f):
    """Return the first element of *f*, or ``None`` when *f* is empty or ``None``."""
    return f[0] if f else None
240
+
241
+
242
def UpdateFileList(DirListState):
    """Replace the contents of *DirListState* with the entry names in ``AUDIOS_DIR``.

    The list is modified in place and nothing is returned. Names are sorted
    because ``os.listdir`` returns them in arbitrary order.
    """
    DirListState[:] = sorted(os.listdir(AUDIOS_DIR))
245
+
246
def audio_list_update(d):
    """Return the absolute path of the selected audio file.

    Args:
        d: Selected dropdown value (a file path), or ``None``/``''`` when
            nothing is selected.

    Returns:
        The absolute path, or ``None`` when nothing is selected.
    """
    if not d:
        # Nothing selected: do not hand a bogus path to the audio player.
        return None
    return abspath(d)
249
+
250
def ZipAndDownload(files):
    """Bundle the given files into a new zip archive inside ``ZIP_DIR``.

    Args:
        files: Paths of the files to archive; entries that are not existing
            files are skipped.

    Returns:
        An HTML anchor whose ``href`` points at the created archive.

    Raises:
        gr.Error: If there is no existing file to archive.
    """
    existing = []
    for file in (files or []):
        if os.path.isfile(file):
            existing.append(abspath(file))
        else:
            print("Skipping missing file", file)

    if not existing:
        raise gr.Error("There are no generated audios to download.")

    os.makedirs(ZIP_DIR, exist_ok=True)
    # mkstemp reserves a unique file name through the public tempfile API.
    handle, zipFile = tempfile.mkstemp(suffix=".zip", dir=abspath(ZIP_DIR))
    os.close(handle)

    with zipfile.ZipFile(zipFile, 'w') as zipMe:
        for file in existing:
            print("Zipping", file)
            # Store each file under its base name only (flat archive).
            zipMe.write(file, os.path.basename(file), compress_type=zipfile.ZIP_DEFLATED)

    print("Pronto", zipFile)

    # NOTE(review): the "/file=" route depends on the installed Gradio
    # version - confirm it is still served by the version in use.
    return '<a href="/file='+zipFile+'">If donwload dont starts, click here</a>'
266
+
267
+
268
# JavaScript passed to gr.Blocks(js=...). It watches the element with id
# "DonwloadLink" and navigates to the href of any <a> node added to it.
js = """
function DetectDownloadLink(){
    console.log('Configuring AutoDonwloadObservr...');
    let hiddenLink = document.getElementById("DonwloadLink");

    // Without the target element observe() would throw, so bail out early.
    if (!hiddenLink) {
        console.warn('DonwloadLink element not found; automatic download disabled.');
        return;
    }

    let onChange = function(mutations){

        for (const mutation of mutations) {
            if (mutation.type !== 'childList')
                continue;

            for (const addedNode of mutation.addedNodes) {
                if (addedNode.nodeName === 'A') {
                    location.href = addedNode.href;
                }
            }

        }
    }

    let config = { attributes: true, childList: true, subtree: true, attributeFilter: ["href"] }
    let obs = new MutationObserver(onChange);
    obs.observe(hiddenLink, config);
}
"""
292
+
293
# Gradio UI definition.
# NOTE(review): the component nesting below was reconstructed from a listing
# that had lost its indentation - confirm the layout against the running app.
with gr.Blocks(js=js) as demo:
    defaultSpeaker = "Dionisio Schuyler"
    cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
    # Paths of the files produced by the latest TTS run (filled by tts()).
    AllFileList = gr.State(list([]))

    gr.Markdown("By using any functionality of this space, you agree to the terms of this license: https://coqui.ai/cpml")

    with gr.Tab("TTS"):
        with gr.Column() as row4:
            with gr.Row() as col4:
                speaker_type = gr.Dropdown(label="Speaker type", choices=["Studio", "Cloned"], value="Studio")
                speaker_name_studio = gr.Dropdown(
                    label="Studio speaker",
                    # Pass a real list rather than a dict view.
                    choices=list(STUDIO_SPEAKERS.keys()),
                    value=defaultSpeaker if defaultSpeaker in STUDIO_SPEAKERS else None,
                )
                speaker_name_custom = gr.Dropdown(
                    label="Cloned speaker",
                    choices=cloned_speaker_names.value,
                    value=cloned_speaker_names.value[0] if len(cloned_speaker_names.value) != 0 else None,
                )
            with gr.Accordion("Advanced options", open=False):
                with gr.Row() as rowAdvanced:
                    temperature = gr.Slider(0.00, 1.00, 0.5, step=0.05, label="Temperature", info="Choose between 0 and 1")
                    top_p = gr.Slider(0.00, 1.00, 0.8, step=0.05, label="TOP P", info="Choose between 0 and 1")
                    top_k = gr.Number(label="TOP K", value=50)
                    speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
        with gr.Column() as col2:
            with gr.Row():
                text = gr.Textbox(label="Text", info="Generate multiple audios separating lines with ---", lines=4, value="Customizado por IA Talking, o maior blog de Inteligência Artificial do Brasil!")
                pronunc = gr.Textbox(label="Pronunciation Fix", info="Fix words pronuncation using WORD = SPEAK", lines=4)
            with gr.Row():
                # Only preselect "pt" when the server actually offers it.
                lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt" if "pt" in LANUGAGES else None)
                tts_button = gr.Button(value="TTS")
        with gr.Column() as col3:
            AudioList = gr.Dropdown(
                label="Generated Audios",
                choices=[],
                interactive=True,
            )

            generated_audio = gr.Audio(label="Audio Play", autoplay=True)
            AudioList.change(fn=audio_list_update, inputs=[AudioList], outputs=[generated_audio])

            # Created unrendered so it can be wired as an output first and
            # placed after the download button.
            dummyHtml = gr.HTML(elem_id="DonwloadLink", render=False)
            downloadAll = gr.DownloadButton("Download All Files")
            downloadAll.click(ZipAndDownload, inputs=[AllFileList], outputs=[dummyHtml])
            dummyHtml.render()

    with gr.Tab("Clone a new speaker"):
        with gr.Column() as col1:
            upload_file = gr.Audio(label="Upload reference audio", type="filepath")
            clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
            clone_button = gr.Button(value="Clone speaker")

    with gr.Tab("Help"):
        # The help text is kept flush-left so its Markdown never depends on
        # indentation being stripped.
        gr.Markdown("""
Welcome to the XTTS WebUI version customized by the IA Talking blog (https://iatalk.ing).

The main goal of this space is to share more scenarios on how XTTS can be used, as well as serve as a study resource to learn more about the TTS process and AI.

In this version, we have some customizations that are quite useful.

# Multiple audios
You can generate multiple audios at once by separating the text with three dashes. For example:

```
Text 1
---
Text 2, line 1
Text 2, line 2
```

In the above example, 2 audio files will be generated! This is very useful when you want to generate a lot of audio but don't want to generate it all at once due to the context lost in XTTS.
You can also specify variables that modify certain aspects.

For example, `!speaker = Dionisio` forces the speaker to be Dionisio only for that specific audio.

List of variables:
- `speaker` = name of the speaker
- `num` = file number (by default, it's the sequential number)
- `prefix` = file name prefix

# Pronunciation adjustment

If you have a text that you cannot or do not want to change the content of, you can use the Pronunciation field to map words with different pronunciations.

Simply separate them by each line. Example:

```
API = A,P,I
SomeFunctionCode = Function Code
```

This is useful for mapping foreign words, abbreviations, acronyms, code, etc.
""")

    clone_button.click(
        fn=clone_speaker,
        inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
        outputs=[upload_file, clone_speaker_name, cloned_speaker_names, speaker_name_custom],
    )

    tts_button.click(
        fn=tts,
        inputs=[
            text, pronunc, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature,
            speed, top_p, top_k, AllFileList,
        ],
        outputs=[AudioList],
    )
414
+
415
# Optional warm-up: send one request to the server before the UI starts.
if __name__ == "__main__" and DO_CHECK == "1":
    print("Warming up server... Checking server healthy...")

    if STUDIO_SPEAKERS:
        speakerName, embs = random.choice(list(STUDIO_SPEAKERS.items()))

        print("Testing with", speakerName)

        ipts = xtts.TTSInputs(
            speaker_embedding=embs["speaker_embedding"],
            gpt_cond_latent=embs["gpt_cond_latent"],
            text="This is a warmup request.",
            language="en",
            temperature=0.5,
            speed=1.0,
            top_k=50,
            top_p=0.8
        )

        # Only success or failure matters here; the audio itself is discarded.
        xtts.predict_speech(ipts)

        print(" TEST OK")
    else:
        # random.choice raises IndexError on an empty sequence.
        print("No studio speakers available; skipping warm-up request.")


if __name__ == "__main__":
    print("STARTING...")
    demo.launch(
        share=False,
        debug=False,
        server_port=7860,
        server_name="0.0.0.0",
        # Lets Gradio serve the archives written to ZIP_DIR.
        allowed_paths=[ZIP_DIR],
        ssr_mode=False
    )