Spaces:
Running
on
Zero
Running
on
Zero
fixes 500 error for some users
Browse files

![image.png](https://cdn-uploads.huggingface.co/production/uploads/62a3bb1cd0d8c2c2169f0b88/Vtx9Pq0tfbEaNam9xU5ra.png)
app.py
CHANGED
@@ -1,446 +1,447 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
import base64
|
3 |
-
import tempfile
|
4 |
-
import json
|
5 |
-
import os
|
6 |
-
from os.path import abspath
|
7 |
-
import zipfile
|
8 |
-
import random
|
9 |
-
import xtts
|
10 |
-
import re
|
11 |
-
|
12 |
-
DO_CHECK = os.getenv('DO_CHECK', '1')
|
13 |
-
OUTPUT = "./demo_outputs"
|
14 |
-
cloned_speakers = {}
|
15 |
-
|
16 |
-
print("Preparing file structure...")
|
17 |
-
if not os.path.exists(OUTPUT):
|
18 |
-
os.mkdir(OUTPUT)
|
19 |
-
os.mkdir(os.path.join(OUTPUT, "cloned_speakers"))
|
20 |
-
os.mkdir(os.path.join(OUTPUT, "generated_audios"))
|
21 |
-
elif os.path.exists(os.path.join(OUTPUT, "cloned_speakers")):
|
22 |
-
print("Loading existing cloned speakers...")
|
23 |
-
for file in os.listdir(os.path.join(OUTPUT, "cloned_speakers")):
|
24 |
-
if file.endswith(".json"):
|
25 |
-
with open(os.path.join(OUTPUT, "cloned_speakers", file), "r") as fp:
|
26 |
-
cloned_speakers[file[:-5]] = json.load(fp)
|
27 |
-
print("Available cloned speakers:", ", ".join(cloned_speakers.keys()))
|
28 |
-
|
29 |
-
AUDIOS_DIR = os.path.join("demo_outputs", "generated_audios");
|
30 |
-
ZIP_DIR = os.path.join("zip_outputs");
|
31 |
-
|
32 |
-
print("Checking zip at", ZIP_DIR)
|
33 |
-
if not os.path.exists(ZIP_DIR):
|
34 |
-
os.mkdir(ZIP_DIR)
|
35 |
-
|
36 |
-
|
37 |
-
try:
|
38 |
-
print("Getting metadata from server ...")
|
39 |
-
LANUGAGES = xtts.get_languages()
|
40 |
-
print("Available languages:", ", ".join(LANUGAGES))
|
41 |
-
STUDIO_SPEAKERS = xtts.get_speakers()
|
42 |
-
print("Available studio speakers:", ", ".join(STUDIO_SPEAKERS.keys()))
|
43 |
-
except:
|
44 |
-
raise Exception("Please make sure the server is running first.")
|
45 |
-
|
46 |
-
|
47 |
-
def ExtractVars(input_string):
|
48 |
-
# Split the string into lines
|
49 |
-
lines = input_string.split('\n')
|
50 |
-
|
51 |
-
# Initialize an empty dictionary to store key-value pairs
|
52 |
-
result_dict = {
|
53 |
-
'prefix': None,
|
54 |
-
'name': '',
|
55 |
-
'speaker': None,
|
56 |
-
'num': None,
|
57 |
-
}
|
58 |
-
|
59 |
-
# List to hold lines that do not start with '!'
|
60 |
-
filtered_lines = []
|
61 |
-
|
62 |
-
# Iterate through each line
|
63 |
-
for line in lines:
|
64 |
-
# Check if the line starts with '!'
|
65 |
-
if line.strip().startswith('!'):
|
66 |
-
|
67 |
-
# Try to split the line into key and value parts
|
68 |
-
try:
|
69 |
-
# Split on '=' and strip whitespace from key and value
|
70 |
-
key, value = line.strip()[1:].split('=')
|
71 |
-
key = key.strip()
|
72 |
-
value = value.strip()
|
73 |
-
# Add to dictionary
|
74 |
-
result_dict[key] = value
|
75 |
-
except ValueError:
|
76 |
-
# Handle the case where there is no '=' or improper format
|
77 |
-
continue
|
78 |
-
elif len(line.strip()) > 0:
|
79 |
-
# Add the line to filtered_lines if it doesn't start with '!'
|
80 |
-
filtered_lines.append(line)
|
81 |
-
|
82 |
-
# Join the filtered lines back into a single string
|
83 |
-
filtered_string = '\n'.join(filtered_lines)
|
84 |
-
return result_dict, filtered_string
|
85 |
-
|
86 |
-
|
87 |
-
def ParsePronucs(PronuncStr):
|
88 |
-
# Split the string into lines
|
89 |
-
lines = PronuncStr.split('\n')
|
90 |
-
|
91 |
-
# Initialize an empty dictionary to store key-value pairs
|
92 |
-
PronuncWords = []
|
93 |
-
|
94 |
-
# Iterate through each line
|
95 |
-
for line in lines:
|
96 |
-
if len(line.strip()) > 0:
|
97 |
-
word,*text = line.strip().split('=',1)
|
98 |
-
word = word.strip()
|
99 |
-
text,*opts = text[0].split("|",1);
|
100 |
-
text = text.strip();
|
101 |
-
|
102 |
-
if len(opts) > 0:
|
103 |
-
opts = opts[0].strip().split(",");
|
104 |
-
else:
|
105 |
-
opts = [];
|
106 |
-
|
107 |
-
|
108 |
-
PronuncWords.append({'word':word, 'text':text, 'opts':opts})
|
109 |
-
|
110 |
-
return PronuncWords
|
111 |
-
|
112 |
-
|
113 |
-
def FindSpeakerByName(name, speakerType):
|
114 |
-
|
115 |
-
srcItems = STUDIO_SPEAKERS if speakerType == "Studio" else cloned_speakers;
|
116 |
-
|
117 |
-
for key, value in srcItems.items():
|
118 |
-
|
119 |
-
if key == name:
|
120 |
-
return key,value
|
121 |
-
|
122 |
-
if key.split(" ")[0] == name:
|
123 |
-
return key,value;
|
124 |
-
|
125 |
-
|
126 |
-
def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names):
|
127 |
-
embeddings = xtts.predict_speaker(upload_file)
|
128 |
-
with open(os.path.join(OUTPUT, "cloned_speakers", clone_speaker_name + ".json"), "w") as fp:
|
129 |
-
json.dump(embeddings, fp)
|
130 |
-
cloned_speakers[clone_speaker_name] = embeddings
|
131 |
-
cloned_speaker_names.append(clone_speaker_name)
|
132 |
-
return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown(choices=cloned_speaker_names)
|
133 |
-
|
134 |
-
def tts(text, pronunc, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
|
135 |
-
,speed,top_p,top_k, AllFileList,progress=gr.Progress()
|
136 |
-
):
|
137 |
-
embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom]
|
138 |
-
|
139 |
-
|
140 |
-
# break at line!
|
141 |
-
lines = text.split("---");
|
142 |
-
totalLines = len(lines);
|
143 |
-
print("Total parts:", len(lines))
|
144 |
-
|
145 |
-
audioNum = 0;
|
146 |
-
|
147 |
-
DefaultPrefix = next(tempfile._get_candidate_names());
|
148 |
-
|
149 |
-
CurrentPrefix = DefaultPrefix
|
150 |
-
|
151 |
-
|
152 |
-
# break pronuc
|
153 |
-
Pronuncs = ParsePronucs(pronunc)
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
AudioList = [];
|
159 |
-
for line in progress.tqdm(lines, desc="Gerando fala..."):
|
160 |
-
audioNum += 1;
|
161 |
-
|
162 |
-
textVars,cleanLine = ExtractVars(line)
|
163 |
-
|
164 |
-
if textVars['prefix']:
|
165 |
-
CurrentPrefix = textVars['prefix']
|
166 |
-
|
167 |
-
audioName = textVars['name'];
|
168 |
-
|
169 |
-
if audioName:
|
170 |
-
audioName = '_'+audioName
|
171 |
-
|
172 |
-
num = textVars['num'];
|
173 |
-
|
174 |
-
if not num:
|
175 |
-
num = audioNum;
|
176 |
-
|
177 |
-
path = CurrentPrefix +"_n_" + str(num)+audioName+".wav"
|
178 |
-
|
179 |
-
print("Generating audio for line", num, 'sequence', audioNum);
|
180 |
-
|
181 |
-
speaker = textVars['speaker'];
|
182 |
-
|
183 |
-
if not speaker:
|
184 |
-
speaker = speaker_name_studio if speaker_type == 'Studio' else speaker_name_custom
|
185 |
-
|
186 |
-
speakerName,embeddings = FindSpeakerByName(speaker, speaker_type)
|
187 |
-
|
188 |
-
if not speakerName:
|
189 |
-
raise ValueError("InvalidSpeaker: "+speakerName)
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
FixedText = cleanLine;
|
194 |
-
|
195 |
-
for pronunc in Pronuncs:
|
196 |
-
word = pronunc['word']
|
197 |
-
text = pronunc['text']
|
198 |
-
opts = pronunc['opts'];
|
199 |
-
|
200 |
-
flg = re.IGNORECASE
|
201 |
-
|
202 |
-
if 'cs' in opts:
|
203 |
-
flg = 0;
|
204 |
-
|
205 |
-
FixedText = re.sub(f'\\b{word}\\b', text, FixedText, flags=flg)
|
206 |
-
|
207 |
-
ipts = xtts.TTSInputs(
|
208 |
-
speaker_embedding=embeddings["speaker_embedding"],
|
209 |
-
gpt_cond_latent=embeddings["gpt_cond_latent"],
|
210 |
-
text=FixedText,
|
211 |
-
language=lang,
|
212 |
-
temperature=temperature,
|
213 |
-
speed=speed,
|
214 |
-
top_k=top_k,
|
215 |
-
top_p=top_p
|
216 |
-
)
|
217 |
-
|
218 |
-
generated_audio = xtts.predict_speech(ipts)
|
219 |
-
|
220 |
-
print("Audio generated.. Saving to", path);
|
221 |
-
generated_audio_path = os.path.join(AUDIOS_DIR, path)
|
222 |
-
with open(generated_audio_path, "wb") as fp:
|
223 |
-
fp.write(base64.b64decode(generated_audio))
|
224 |
-
AudioList.append(fp.name);
|
225 |
-
|
226 |
-
AllFileList.clear();
|
227 |
-
AllFileList.extend(AudioList);
|
228 |
-
|
229 |
-
return gr.Dropdown(
|
230 |
-
label="Generated Audios",
|
231 |
-
choices=list(AudioList),
|
232 |
-
value=AudioList[0]
|
233 |
-
)
|
234 |
-
|
235 |
-
def get_file_content(f):
|
236 |
-
if len(f) > 0:
|
237 |
-
return f[0];
|
238 |
-
|
239 |
-
return None;
|
240 |
-
|
241 |
-
|
242 |
-
def UpdateFileList(DirListState):
|
243 |
-
DirListState.clear();
|
244 |
-
DirListState.extend( os.listdir(AUDIOS_DIR) )
|
245 |
-
|
246 |
-
def audio_list_update(d):
|
247 |
-
fullPath = abspath(d)
|
248 |
-
return fullPath
|
249 |
-
|
250 |
-
def ZipAndDownload(files):
|
251 |
-
allFiles = files
|
252 |
-
|
253 |
-
DefaultPrefix = next(tempfile._get_candidate_names());
|
254 |
-
|
255 |
-
zipFile = abspath( os.path.join(ZIP_DIR, DefaultPrefix + ".zip") );
|
256 |
-
|
257 |
-
|
258 |
-
with zipfile.ZipFile(zipFile, 'w') as zipMe:
|
259 |
-
for file in allFiles:
|
260 |
-
print("Zipping", file);
|
261 |
-
zipMe.write(abspath(file), os.path.basename(file), compress_type=zipfile.ZIP_DEFLATED)
|
262 |
-
|
263 |
-
print("Pronto", zipFile);
|
264 |
-
|
265 |
-
return '<a href="/file='+zipFile+'">If donwload dont starts, click here</a>';
|
266 |
-
|
267 |
-
|
268 |
-
js = """
|
269 |
-
function DetectDownloadLink(){
|
270 |
-
console.log('Configuring AutoDonwloadObservr...');
|
271 |
-
let hiddenLink = document.getElementById("DonwloadLink");
|
272 |
-
let onChange= function(mutations){
|
273 |
-
|
274 |
-
for (const mutation of mutations) {
|
275 |
-
if (mutation.type !== 'childList')
|
276 |
-
continue;
|
277 |
-
|
278 |
-
for (const addedNode of mutation.addedNodes) {
|
279 |
-
if (addedNode.nodeName === 'A') {
|
280 |
-
location.href = addedNode.href;
|
281 |
-
}
|
282 |
-
}
|
283 |
-
|
284 |
-
}
|
285 |
-
}
|
286 |
-
|
287 |
-
let config = { attributes: true, childList: true, subtree: true, attributeFilter: ["href"] }
|
288 |
-
let obs = new MutationObserver(onChange);
|
289 |
-
obs.observe(hiddenLink, config);
|
290 |
-
}
|
291 |
-
"""
|
292 |
-
|
293 |
-
with gr.Blocks(js=js) as demo:
|
294 |
-
defaultSpeaker = "Dionisio Schuyler"
|
295 |
-
cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
|
296 |
-
AllFileList = gr.State(list([]))
|
297 |
-
|
298 |
-
gr.Markdown("By using any functionality of this space, you agree to the terms of this license: https://coqui.ai/cpml")
|
299 |
-
|
300 |
-
with gr.Tab("TTS"):
|
301 |
-
with gr.Column() as row4:
|
302 |
-
with gr.Row() as col4:
|
303 |
-
speaker_type = gr.Dropdown(label="Speaker type", choices=["Studio", "Cloned"], value="Studio")
|
304 |
-
speaker_name_studio = gr.Dropdown(
|
305 |
-
label="Studio speaker",
|
306 |
-
choices=STUDIO_SPEAKERS.keys(),
|
307 |
-
value=defaultSpeaker if defaultSpeaker in STUDIO_SPEAKERS.keys() else None,
|
308 |
-
)
|
309 |
-
speaker_name_custom = gr.Dropdown(
|
310 |
-
label="Cloned speaker",
|
311 |
-
choices=cloned_speaker_names.value,
|
312 |
-
value=cloned_speaker_names.value[0] if len(cloned_speaker_names.value) != 0 else None,
|
313 |
-
)
|
314 |
-
with gr.Accordion("Advanced options", open=False):
|
315 |
-
with gr.Row() as rowAdvanced:
|
316 |
-
temperature = gr.Slider(0.00, 1.00, 0.5, step=0.05, label="Temperature", info="Choose between 0 and 1")
|
317 |
-
top_p = gr.Slider(0.00, 1.00, 0.8, step=0.05, label="TOP P", info="Choose between 0 and 1")
|
318 |
-
top_k = gr.Number(label="TOP K",value=50)
|
319 |
-
speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
|
320 |
-
with gr.Column() as col2:
|
321 |
-
with gr.Row():
|
322 |
-
text = gr.Textbox(label="Text", info="Generate multiple audios separating lines with ---",lines=4, value="Customizado por IA Talking, o maior blog de Inteligência Artificial do Brasil!")
|
323 |
-
pronunc = gr.Textbox(label="Pronunciation Fix", info="Fix words pronuncation using WORD = SPEAK",lines=4)
|
324 |
-
with gr.Row():
|
325 |
-
lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
|
326 |
-
tts_button = gr.Button(value="TTS")
|
327 |
-
with gr.Column() as col3:
|
328 |
-
# FileList = gr.FileExplorer(
|
329 |
-
# glob="*.wav",
|
330 |
-
# # value=["themes/utils"],
|
331 |
-
# ignore_glob="**/__init__.py",
|
332 |
-
# root_dir=AUDIOS_DIR,
|
333 |
-
# interactive = True,
|
334 |
-
# value=DirectoryList.value
|
335 |
-
# )
|
336 |
-
|
337 |
-
AudioList = gr.Dropdown(
|
338 |
-
label="Generated Audios",
|
339 |
-
choices=[]
|
340 |
-
,interactive=True
|
341 |
-
)
|
342 |
-
|
343 |
-
generated_audio = gr.Audio(label="Audio Play", autoplay=True)
|
344 |
-
AudioList.change(fn=audio_list_update, inputs=[AudioList], outputs=[generated_audio])
|
345 |
-
|
346 |
-
dummyHtml = gr.HTML(elem_id = "DonwloadLink", render = False);
|
347 |
-
downloadAll = gr.DownloadButton("Download All Files")
|
348 |
-
downloadAll.click(ZipAndDownload, inputs=[AllFileList], outputs=[dummyHtml]);
|
349 |
-
dummyHtml.render();
|
350 |
-
|
351 |
-
|
352 |
-
with gr.Tab("Clone a new speaker"):
|
353 |
-
with gr.Column() as col1:
|
354 |
-
upload_file = gr.Audio(label="Upload reference audio", type="filepath")
|
355 |
-
clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
|
356 |
-
clone_button = gr.Button(value="Clone speaker")
|
357 |
-
|
358 |
-
|
359 |
-
with gr.Tab("Help"):
|
360 |
-
gr.Markdown("""
|
361 |
-
Welcome to the XTTS WebUI version customized by the IA Talking blog (https://iatalk.ing).
|
362 |
-
|
363 |
-
The main goal of this space is to share more scenarios on how XTTS can be used, as well as serve as a study resource to learn more about the TTS process and AI.
|
364 |
-
|
365 |
-
In this version, we have some customizations that are quite useful.
|
366 |
-
|
367 |
-
# Multiple audios
|
368 |
-
You can generate multiple audios at once by separating the text with three dashes. For example:
|
369 |
-
|
370 |
-
```
|
371 |
-
Text 1
|
372 |
-
---
|
373 |
-
Text 2, line 1
|
374 |
-
Text 2, line 2
|
375 |
-
```
|
376 |
-
|
377 |
-
In the above example, 2 audio files will be generated! This is very useful when you want to generate a lot of audio but don't want to generate it all at once due to the context lost in XTTS.
|
378 |
-
You can also specify variables that modify certain aspects.
|
379 |
-
|
380 |
-
For example, `!speaker = Dionisio` forces the speaker to be Dionisio only for that specific audio.
|
381 |
-
|
382 |
-
List of variables:
|
383 |
-
- `speaker` = name of the speaker
|
384 |
-
- `num` = file number (by default, it's the sequential number)
|
385 |
-
- `prefix` = file name prefix
|
386 |
-
|
387 |
-
# Pronunciation adjustment
|
388 |
-
|
389 |
-
If you have a text that you cannot or do not want to change the content of, you can use the Pronunciation field to map words with different pronunciations.
|
390 |
-
|
391 |
-
Simply separate them by each line. Example:
|
392 |
-
|
393 |
-
```
|
394 |
-
API = A,P,I
|
395 |
-
SomeFunctionCode = Function Code
|
396 |
-
```
|
397 |
-
|
398 |
-
This is useful for mapping foreign words, abbreviations, acronyms, code, etc.
|
399 |
-
""")
|
400 |
-
|
401 |
-
clone_button.click(
|
402 |
-
fn=clone_speaker,
|
403 |
-
inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
|
404 |
-
outputs=[upload_file, clone_speaker_name, cloned_speaker_names, speaker_name_custom],
|
405 |
-
)
|
406 |
-
|
407 |
-
tts_button.click(
|
408 |
-
fn=tts,
|
409 |
-
inputs=[text, pronunc,speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
|
410 |
-
,speed,top_p,top_k,AllFileList
|
411 |
-
],
|
412 |
-
outputs=[AudioList],
|
413 |
-
)
|
414 |
-
|
415 |
-
if __name__ == "__main__" and DO_CHECK == "1":
|
416 |
-
print("Warming up server... Checking server healthy...")
|
417 |
-
|
418 |
-
speakerName, embs = random.choice(list(STUDIO_SPEAKERS.items()));
|
419 |
-
|
420 |
-
print("Testing with", speakerName);
|
421 |
-
|
422 |
-
ipts = xtts.TTSInputs(
|
423 |
-
speaker_embedding=embs["speaker_embedding"],
|
424 |
-
gpt_cond_latent=embs["gpt_cond_latent"],
|
425 |
-
text="This is a warmup request.",
|
426 |
-
language="en",
|
427 |
-
temperature=0.5,
|
428 |
-
speed=1.0,
|
429 |
-
top_k=50,
|
430 |
-
top_p=0.8
|
431 |
-
)
|
432 |
-
|
433 |
-
resp = xtts.predict_speech(ipts)
|
434 |
-
|
435 |
-
print(" TEST OK")
|
436 |
-
|
437 |
-
|
438 |
-
if __name__ == "__main__":
|
439 |
-
print("STARTING...")
|
440 |
-
demo.launch(
|
441 |
-
share=False,
|
442 |
-
debug=False,
|
443 |
-
server_port=7860,
|
444 |
-
server_name="0.0.0.0",
|
445 |
-
allowed_paths=[ZIP_DIR]
|
446 |
-
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import base64
|
3 |
+
import tempfile
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
from os.path import abspath
|
7 |
+
import zipfile
|
8 |
+
import random
|
9 |
+
import xtts
|
10 |
+
import re
|
11 |
+
|
12 |
+
# Set the DO_CHECK environment variable to anything other than "1" to skip the
# warm-up request that is sent to the server before the UI starts.
DO_CHECK = os.getenv('DO_CHECK', '1')
OUTPUT = "./demo_outputs"
# Maps a cloned speaker name to the embeddings loaded from
# <OUTPUT>/cloned_speakers/<name>.json.
cloned_speakers = {}

AUDIOS_DIR = os.path.join("demo_outputs", "generated_audios")
ZIP_DIR = os.path.join("zip_outputs")
_cloned_speakers_dir = os.path.join(OUTPUT, "cloned_speakers")

print("Preparing file structure...")
# makedirs(exist_ok=True) also repairs a partially created tree: previously the
# sub-directories were only created when OUTPUT itself did not exist yet.
os.makedirs(_cloned_speakers_dir, exist_ok=True)
os.makedirs(AUDIOS_DIR, exist_ok=True)

print("Loading existing cloned speakers...")
for file in sorted(os.listdir(_cloned_speakers_dir)):
    if not file.endswith(".json"):
        continue
    try:
        with open(os.path.join(_cloned_speakers_dir, file), "r") as fp:
            cloned_speakers[file[:-5]] = json.load(fp)
    except (OSError, ValueError) as err:
        # json.JSONDecodeError is a ValueError; one unreadable file must not
        # prevent the application from starting.
        print("Skipping unreadable cloned speaker file", file, "-", err)
print("Available cloned speakers:", ", ".join(cloned_speakers.keys()))

print("Checking zip at", ZIP_DIR)
os.makedirs(ZIP_DIR, exist_ok=True)


try:
    print("Getting metadata from server ...")
    # NOTE: the misspelled name LANUGAGES is kept because other parts of the
    # file refer to it.
    LANUGAGES = xtts.get_languages()
    print("Available languages:", ", ".join(LANUGAGES))
    STUDIO_SPEAKERS = xtts.get_speakers()
    print("Available studio speakers:", ", ".join(STUDIO_SPEAKERS.keys()))
except Exception as err:
    # Chain the original error so the real cause stays visible in the traceback;
    # KeyboardInterrupt/SystemExit are no longer converted into this message.
    raise Exception("Please make sure the server is running first.") from err
|
45 |
+
|
46 |
+
|
47 |
+
def ExtractVars(input_string):
    """Separate ``!key = value`` directives from the remaining text.

    Each line whose stripped form starts with ``!`` is treated as a directive
    and split on the FIRST ``=`` only, so values may themselves contain ``=``.
    Directive lines without any ``=`` are ignored. Blank lines are dropped and
    every other line is kept unchanged.

    Returns a tuple ``(variables, text)``. ``variables`` is a dict that always
    holds the keys ``prefix``, ``name``, ``speaker`` and ``num`` (with defaults
    ``None``, ``''``, ``None``, ``None``) plus any other key found in the
    input. ``text`` is the kept lines joined with ``\\n``.
    """
    result_dict = {
        'prefix': None,
        'name': '',
        'speaker': None,
        'num': None,
    }
    filtered_lines = []

    for line in input_string.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue

        if not stripped.startswith('!'):
            filtered_lines.append(line)
            continue

        # partition() never raises: a missing '=' simply leaves the separator empty.
        key, separator, value = stripped[1:].partition('=')
        if not separator:
            continue
        result_dict[key.strip()] = value.strip()

    return result_dict, '\n'.join(filtered_lines)
|
85 |
+
|
86 |
+
|
87 |
+
def ParsePronucs(PronuncStr):
    """Parse the pronunciation-fix text into a list of replacement rules.

    Each non-blank line has the form ``WORD = SPOKEN TEXT`` optionally followed
    by ``| opt1,opt2``. Lines without ``=`` or with an empty word are skipped
    instead of raising, and ``None`` is treated as empty input.

    Returns a list of dicts with the keys ``word``, ``text`` and ``opts``
    (a list of stripped, non-empty option strings).
    """
    PronuncWords = []

    for line in (PronuncStr or '').split('\n'):
        word, separator, rest = line.partition('=')
        word = word.strip()
        if not separator or not word:
            continue

        # Only the first '|' separates the spoken text from the options.
        text, _, raw_opts = rest.partition('|')
        opts = [opt.strip() for opt in raw_opts.split(',') if opt.strip()]

        PronuncWords.append({'word': word, 'text': text.strip(), 'opts': opts})

    return PronuncWords
|
111 |
+
|
112 |
+
|
113 |
+
def FindSpeakerByName(name, speakerType):
    """Look up a speaker by full name or by first name.

    The lookup uses ``STUDIO_SPEAKERS`` when ``speakerType`` is ``"Studio"``
    and ``cloned_speakers`` otherwise. An exact key match always wins; failing
    that, the first key whose first space-separated word equals ``name`` is
    used.

    Returns ``(key, value)`` for the match, or ``(None, None)`` when nothing
    matches, so callers can always unpack the result.
    """
    srcItems = STUDIO_SPEAKERS if speakerType == "Studio" else cloned_speakers

    if name in srcItems:
        return name, srcItems[name]

    for key, value in srcItems.items():
        if key.split(" ")[0] == name:
            return key, value

    return None, None
|
124 |
+
|
125 |
+
|
126 |
+
def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names):
    """Compute embeddings for a reference audio and register them under a name.

    The embeddings returned by ``xtts.predict_speaker`` are written to
    ``<OUTPUT>/cloned_speakers/<name>.json`` and stored in ``cloned_speakers``.
    The name is added to ``cloned_speaker_names`` only if it is not already
    there, so re-cloning a speaker overwrites it without duplicating the entry.

    Raises:
        ValueError: if the name is empty, is ``.``/``..``, or contains a path
            separator (it is used as a file name), or if no audio was given.

    Returns the tuple ``(upload_file, name, cloned_speaker_names, dropdown)``
    where ``dropdown`` is a ``gr.Dropdown`` listing the updated names.
    """
    speaker_name = (clone_speaker_name or "").strip()
    # The name comes from a UI text box and becomes part of a file path, so it
    # must not be able to leave the cloned_speakers directory.
    if (
        not speaker_name
        or speaker_name in (".", "..")
        or any(separator in speaker_name for separator in ("/", "\\", "\0"))
    ):
        raise ValueError("InvalidSpeakerName: " + repr(clone_speaker_name))
    if not upload_file:
        raise ValueError("No reference audio was provided.")

    embeddings = xtts.predict_speaker(upload_file)
    with open(os.path.join(OUTPUT, "cloned_speakers", speaker_name + ".json"), "w") as fp:
        json.dump(embeddings, fp)
    cloned_speakers[speaker_name] = embeddings
    if speaker_name not in cloned_speaker_names:
        cloned_speaker_names.append(speaker_name)
    return upload_file, speaker_name, cloned_speaker_names, gr.Dropdown(choices=cloned_speaker_names)
|
133 |
+
|
134 |
+
def _safe_file_part(value):
    """Return *value* as text with path separators replaced by underscores."""
    return re.sub(r'[\\/]+', '_', str(value))


def tts(text, pronunc, speaker_type, speaker_name_studio, speaker_name_custom, lang,
        temperature, speed, top_p, top_k, AllFileList, progress=gr.Progress()):
    """Generate one audio file per ``---``-separated segment of *text*.

    For every segment the ``!key = value`` directives are extracted with
    ``ExtractVars`` (``prefix``, ``name``, ``num`` and ``speaker`` are used
    here), the pronunciation rules parsed from *pronunc* are applied, and the
    result is sent to ``xtts.predict_speech``. The base64-decoded response is
    written to ``AUDIOS_DIR`` as ``<prefix>_n_<num>[_<name>].wav``.

    A ``prefix`` directive stays in effect for the following segments.
    Segments that contain no text are skipped but still consume their sequence
    number. *AllFileList* is replaced in place with the generated paths.

    Raises:
        ValueError: if a segment's speaker cannot be found, or if no segment
            contains any text.

    Returns a ``gr.Dropdown`` listing the generated files with the first one
    selected.
    """
    # Local import keeps this function self-contained; uuid replaces the
    # private tempfile._get_candidate_names() helper used before.
    import uuid

    segments = (text or "").split("---")
    print("Total parts:", len(segments))

    CurrentPrefix = uuid.uuid4().hex[:8]
    Pronuncs = ParsePronucs(pronunc)
    default_speaker = speaker_name_studio if speaker_type == 'Studio' else speaker_name_custom

    AudioList = []
    for audioNum, segment in enumerate(progress.tqdm(segments, desc="Gerando fala..."), start=1):
        textVars, cleanLine = ExtractVars(segment)

        if textVars['prefix']:
            CurrentPrefix = textVars['prefix']

        if not cleanLine.strip():
            # Nothing to speak (e.g. a trailing '---' or a directive-only segment).
            continue

        audioName = textVars['name']
        if audioName:
            audioName = '_' + audioName

        num = textVars['num'] or audioNum

        # prefix/name/num are user-controlled; strip path separators so the
        # file always lands inside AUDIOS_DIR.
        path = _safe_file_part(CurrentPrefix + "_n_" + str(num) + audioName + ".wav")

        print("Generating audio for line", num, 'sequence', audioNum)

        speaker = textVars['speaker'] or default_speaker

        # Tolerate both a bare None and a (None, None) "not found" result.
        found = FindSpeakerByName(speaker, speaker_type)
        speakerName, embeddings = found if found else (None, None)
        if not speakerName:
            raise ValueError("InvalidSpeaker: " + str(speaker))

        FixedText = cleanLine
        for rule in Pronuncs:
            if not rule['word']:
                continue
            flags = 0 if 'cs' in rule['opts'] else re.IGNORECASE
            # The word is matched literally; the lookarounds behave like \b for
            # ordinary words but also work when the word starts or ends with
            # punctuation (e.g. "C++").
            pattern = r'(?<!\w)' + re.escape(rule['word']) + r'(?!\w)'
            # A callable replacement inserts the text verbatim (no backslash or
            # group-reference processing).
            FixedText = re.sub(pattern, lambda _match, _spoken=rule['text']: _spoken,
                               FixedText, flags=flags)

        ipts = xtts.TTSInputs(
            speaker_embedding=embeddings["speaker_embedding"],
            gpt_cond_latent=embeddings["gpt_cond_latent"],
            text=FixedText,
            language=lang,
            temperature=temperature,
            speed=speed,
            top_k=top_k,
            top_p=top_p
        )

        generated_audio = xtts.predict_speech(ipts)

        print("Audio generated.. Saving to", path)
        generated_audio_path = os.path.join(AUDIOS_DIR, path)
        with open(generated_audio_path, "wb") as fp:
            fp.write(base64.b64decode(generated_audio))
        AudioList.append(generated_audio_path)

    if not AudioList:
        raise ValueError("No text to synthesize.")

    AllFileList.clear()
    AllFileList.extend(AudioList)

    return gr.Dropdown(
        label="Generated Audios",
        choices=list(AudioList),
        value=AudioList[0]
    )
|
234 |
+
|
235 |
+
def get_file_content(f):
    """Return the first item of *f*, or ``None`` when *f* is ``None`` or empty."""
    if f is None or len(f) == 0:
        return None
    return f[0]
|
240 |
+
|
241 |
+
|
242 |
+
def UpdateFileList(DirListState):
    """Replace the contents of *DirListState* with the entries of ``AUDIOS_DIR``.

    The list is updated in place and nothing is returned. Entries are sorted so
    the order is stable (``os.listdir`` gives no ordering guarantee), and a
    missing directory yields an empty list instead of an exception.
    """
    try:
        entries = sorted(os.listdir(AUDIOS_DIR))
    except FileNotFoundError:
        entries = []
    # Slice assignment keeps the same list object that the caller holds.
    DirListState[:] = entries
|
245 |
+
|
246 |
+
def audio_list_update(d):
    """Return the absolute path of the selected audio file.

    ``None`` is returned when nothing is selected (``d`` is ``None`` or empty),
    instead of failing inside ``abspath``.
    """
    if not d:
        return None
    return abspath(d)
|
249 |
+
|
250 |
+
def ZipAndDownload(files):
    """Bundle *files* into a new zip inside ``ZIP_DIR`` and return a download link.

    Each existing file is stored under its base name with DEFLATE compression.
    Paths that are listed more than once are added only once, and paths that no
    longer exist are skipped with a log line rather than aborting the archive.
    ``None`` is treated as an empty list.

    Returns an HTML anchor string whose ``href`` is ``/file=<absolute zip path>``.
    """
    # Local import keeps this function self-contained; uuid replaces the
    # private tempfile._get_candidate_names() helper used before.
    import uuid

    os.makedirs(ZIP_DIR, exist_ok=True)
    zipFile = abspath(os.path.join(ZIP_DIR, uuid.uuid4().hex[:8] + ".zip"))

    with zipfile.ZipFile(zipFile, 'w') as zipMe:
        # dict.fromkeys removes duplicates while preserving order.
        for file in dict.fromkeys(files or []):
            if not os.path.isfile(file):
                print("Skipping missing file", file)
                continue
            print("Zipping", file)
            zipMe.write(abspath(file), os.path.basename(file), compress_type=zipfile.ZIP_DEFLATED)

    print("Pronto", zipFile)

    return '<a href="/file='+zipFile+'">If donwload dont starts, click here</a>'
|
266 |
+
|
267 |
+
|
268 |
+
# JavaScript handed to gr.Blocks(js=...). It observes the element with id
# "DonwloadLink" and navigates to the href of any <a> node inserted into it,
# which is how the link returned by ZipAndDownload turns into a download.
# NOTE(review): presumably Gradio runs this function once on page load - confirm
# against the Gradio version in use.
js = """
function DetectDownloadLink(){
    console.log('Configuring AutoDonwloadObservr...');
    let hiddenLink = document.getElementById("DonwloadLink");

    // observe() throws on a null target, so bail out if the element is not in the DOM.
    if (!hiddenLink) {
        console.warn('DonwloadLink element not found; automatic download is disabled.');
        return;
    }

    let onChange= function(mutations){

        for (const mutation of mutations) {
            if (mutation.type !== 'childList')
                continue;

            for (const addedNode of mutation.addedNodes) {
                if (addedNode.nodeName === 'A') {
                    location.href = addedNode.href;
                }
            }

        }
    }

    let config = { attributes: true, childList: true, subtree: true, attributeFilter: ["href"] }
    let obs = new MutationObserver(onChange);
    obs.observe(hiddenLink, config);
}
"""
|
292 |
+
|
293 |
+
with gr.Blocks(js=js) as demo:
|
294 |
+
defaultSpeaker = "Dionisio Schuyler"
|
295 |
+
cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
|
296 |
+
AllFileList = gr.State(list([]))
|
297 |
+
|
298 |
+
gr.Markdown("By using any functionality of this space, you agree to the terms of this license: https://coqui.ai/cpml")
|
299 |
+
|
300 |
+
with gr.Tab("TTS"):
|
301 |
+
with gr.Column() as row4:
|
302 |
+
with gr.Row() as col4:
|
303 |
+
speaker_type = gr.Dropdown(label="Speaker type", choices=["Studio", "Cloned"], value="Studio")
|
304 |
+
speaker_name_studio = gr.Dropdown(
|
305 |
+
label="Studio speaker",
|
306 |
+
choices=STUDIO_SPEAKERS.keys(),
|
307 |
+
value=defaultSpeaker if defaultSpeaker in STUDIO_SPEAKERS.keys() else None,
|
308 |
+
)
|
309 |
+
speaker_name_custom = gr.Dropdown(
|
310 |
+
label="Cloned speaker",
|
311 |
+
choices=cloned_speaker_names.value,
|
312 |
+
value=cloned_speaker_names.value[0] if len(cloned_speaker_names.value) != 0 else None,
|
313 |
+
)
|
314 |
+
with gr.Accordion("Advanced options", open=False):
|
315 |
+
with gr.Row() as rowAdvanced:
|
316 |
+
temperature = gr.Slider(0.00, 1.00, 0.5, step=0.05, label="Temperature", info="Choose between 0 and 1")
|
317 |
+
top_p = gr.Slider(0.00, 1.00, 0.8, step=0.05, label="TOP P", info="Choose between 0 and 1")
|
318 |
+
top_k = gr.Number(label="TOP K",value=50)
|
319 |
+
speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
|
320 |
+
with gr.Column() as col2:
|
321 |
+
with gr.Row():
|
322 |
+
text = gr.Textbox(label="Text", info="Generate multiple audios separating lines with ---",lines=4, value="Customizado por IA Talking, o maior blog de Inteligência Artificial do Brasil!")
|
323 |
+
pronunc = gr.Textbox(label="Pronunciation Fix", info="Fix words pronuncation using WORD = SPEAK",lines=4)
|
324 |
+
with gr.Row():
|
325 |
+
lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
|
326 |
+
tts_button = gr.Button(value="TTS")
|
327 |
+
with gr.Column() as col3:
|
328 |
+
# FileList = gr.FileExplorer(
|
329 |
+
# glob="*.wav",
|
330 |
+
# # value=["themes/utils"],
|
331 |
+
# ignore_glob="**/__init__.py",
|
332 |
+
# root_dir=AUDIOS_DIR,
|
333 |
+
# interactive = True,
|
334 |
+
# value=DirectoryList.value
|
335 |
+
# )
|
336 |
+
|
337 |
+
AudioList = gr.Dropdown(
|
338 |
+
label="Generated Audios",
|
339 |
+
choices=[]
|
340 |
+
,interactive=True
|
341 |
+
)
|
342 |
+
|
343 |
+
generated_audio = gr.Audio(label="Audio Play", autoplay=True)
|
344 |
+
AudioList.change(fn=audio_list_update, inputs=[AudioList], outputs=[generated_audio])
|
345 |
+
|
346 |
+
dummyHtml = gr.HTML(elem_id = "DonwloadLink", render = False);
|
347 |
+
downloadAll = gr.DownloadButton("Download All Files")
|
348 |
+
downloadAll.click(ZipAndDownload, inputs=[AllFileList], outputs=[dummyHtml]);
|
349 |
+
dummyHtml.render();
|
350 |
+
|
351 |
+
|
352 |
+
with gr.Tab("Clone a new speaker"):
|
353 |
+
with gr.Column() as col1:
|
354 |
+
upload_file = gr.Audio(label="Upload reference audio", type="filepath")
|
355 |
+
clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
|
356 |
+
clone_button = gr.Button(value="Clone speaker")
|
357 |
+
|
358 |
+
|
359 |
+
with gr.Tab("Help"):
|
360 |
+
gr.Markdown("""
|
361 |
+
Welcome to the XTTS WebUI version customized by the IA Talking blog (https://iatalk.ing).
|
362 |
+
|
363 |
+
The main goal of this space is to share more scenarios on how XTTS can be used, as well as serve as a study resource to learn more about the TTS process and AI.
|
364 |
+
|
365 |
+
In this version, we have some customizations that are quite useful.
|
366 |
+
|
367 |
+
# Multiple audios
|
368 |
+
You can generate multiple audios at once by separating the text with three dashes. For example:
|
369 |
+
|
370 |
+
```
|
371 |
+
Text 1
|
372 |
+
---
|
373 |
+
Text 2, line 1
|
374 |
+
Text 2, line 2
|
375 |
+
```
|
376 |
+
|
377 |
+
In the above example, 2 audio files will be generated! This is very useful when you want to generate a lot of audio but don't want to generate it all at once due to the context lost in XTTS.
|
378 |
+
You can also specify variables that modify certain aspects.
|
379 |
+
|
380 |
+
For example, `!speaker = Dionisio` forces the speaker to be Dionisio only for that specific audio.
|
381 |
+
|
382 |
+
List of variables:
|
383 |
+
- `speaker` = name of the speaker
|
384 |
+
- `num` = file number (by default, it's the sequential number)
|
385 |
+
- `prefix` = file name prefix
|
386 |
+
|
387 |
+
# Pronunciation adjustment
|
388 |
+
|
389 |
+
If you have a text that you cannot or do not want to change the content of, you can use the Pronunciation field to map words with different pronunciations.
|
390 |
+
|
391 |
+
Simply separate them by each line. Example:
|
392 |
+
|
393 |
+
```
|
394 |
+
API = A,P,I
|
395 |
+
SomeFunctionCode = Function Code
|
396 |
+
```
|
397 |
+
|
398 |
+
This is useful for mapping foreign words, abbreviations, acronyms, code, etc.
|
399 |
+
""")
|
400 |
+
|
401 |
+
clone_button.click(
|
402 |
+
fn=clone_speaker,
|
403 |
+
inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
|
404 |
+
outputs=[upload_file, clone_speaker_name, cloned_speaker_names, speaker_name_custom],
|
405 |
+
)
|
406 |
+
|
407 |
+
tts_button.click(
|
408 |
+
fn=tts,
|
409 |
+
inputs=[text, pronunc,speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
|
410 |
+
,speed,top_p,top_k,AllFileList
|
411 |
+
],
|
412 |
+
outputs=[AudioList],
|
413 |
+
)
|
414 |
+
|
415 |
+
if __name__ == "__main__":
    if DO_CHECK == "1":
        # Send one small request so a broken or cold server is noticed before
        # the UI starts.
        print("Warming up server... Checking server healthy...")

        if STUDIO_SPEAKERS:
            speakerName, embs = random.choice(list(STUDIO_SPEAKERS.items()))

            print("Testing with", speakerName)

            ipts = xtts.TTSInputs(
                speaker_embedding=embs["speaker_embedding"],
                gpt_cond_latent=embs["gpt_cond_latent"],
                text="This is a warmup request.",
                language="en",
                temperature=0.5,
                speed=1.0,
                top_k=50,
                top_p=0.8
            )

            # Only success or failure matters here; the audio is discarded.
            xtts.predict_speech(ipts)

            print(" TEST OK")
        else:
            # random.choice() raises IndexError on an empty sequence.
            print("No studio speakers available; skipping warm-up request.")

    print("STARTING...")
    demo.launch(
        share=False,
        debug=False,
        server_port=7860,
        server_name="0.0.0.0",
        allowed_paths=[ZIP_DIR],
        ssr_mode=False
    )
|