Spaces:
Running
on
Zero
Running
on
Zero
License and minor adjusts
Browse files- Dockerfile +2 -0
- app.py +103 -7
Dockerfile
CHANGED
@@ -13,6 +13,8 @@ RUN python -m pip install --use-deprecated=legacy-resolver -r requirements.txt \
|
|
13 |
RUN python -m unidic download
|
14 |
RUN mkdir -p /app/tts_models
|
15 |
|
|
|
|
|
16 |
COPY xtts.py .
|
17 |
COPY app.py .
|
18 |
|
|
|
13 |
RUN python -m unidic download
|
14 |
RUN mkdir -p /app/tts_models
|
15 |
|
16 |
+
RUN python -m pip install spaces
|
17 |
+
|
18 |
COPY xtts.py .
|
19 |
COPY app.py .
|
20 |
|
app.py
CHANGED
@@ -7,7 +7,7 @@ from os.path import abspath
|
|
7 |
import zipfile
|
8 |
import random
|
9 |
import xtts
|
10 |
-
|
11 |
|
12 |
DO_CHECK = os.getenv('DO_CHECK', '1')
|
13 |
OUTPUT = "./demo_outputs"
|
@@ -84,6 +84,32 @@ def ExtractVars(input_string):
|
|
84 |
return result_dict, filtered_string
|
85 |
|
86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
def FindSpeakerByName(name, speakerType):
|
88 |
|
89 |
srcItems = STUDIO_SPEAKERS if speakerType == "Studio" else cloned_speakers;
|
@@ -105,11 +131,12 @@ def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names):
|
|
105 |
cloned_speaker_names.append(clone_speaker_name)
|
106 |
return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown(choices=cloned_speaker_names)
|
107 |
|
108 |
-
def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
|
109 |
,speed,top_p,top_k, AllFileList,progress=gr.Progress()
|
110 |
):
|
111 |
embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom]
|
112 |
|
|
|
113 |
# break at line!
|
114 |
lines = text.split("---");
|
115 |
totalLines = len(lines);
|
@@ -122,6 +149,12 @@ def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temp
|
|
122 |
CurrentPrefix = DefaultPrefix
|
123 |
|
124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
AudioList = [];
|
126 |
for line in progress.tqdm(lines, desc="Gerando fala..."):
|
127 |
audioNum += 1;
|
@@ -154,11 +187,27 @@ def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temp
|
|
154 |
|
155 |
if not speakerName:
|
156 |
raise ValueError("InvalidSpeaker: "+speakerName)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
|
158 |
ipts = xtts.TTSInputs(
|
159 |
speaker_embedding=embeddings["speaker_embedding"],
|
160 |
gpt_cond_latent=embeddings["gpt_cond_latent"],
|
161 |
-
text=
|
162 |
language=lang,
|
163 |
temperature=temperature,
|
164 |
speed=speed,
|
@@ -246,6 +295,7 @@ with gr.Blocks(js=js) as demo:
|
|
246 |
cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
|
247 |
AllFileList = gr.State(list([]))
|
248 |
|
|
|
249 |
|
250 |
with gr.Tab("TTS"):
|
251 |
with gr.Column() as row4:
|
@@ -268,9 +318,12 @@ with gr.Blocks(js=js) as demo:
|
|
268 |
top_k = gr.Number(label="TOP K",value=50)
|
269 |
speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
|
270 |
with gr.Column() as col2:
|
271 |
-
|
272 |
-
|
273 |
-
|
|
|
|
|
|
|
274 |
with gr.Column() as col3:
|
275 |
# FileList = gr.FileExplorer(
|
276 |
# glob="*.wav",
|
@@ -302,6 +355,49 @@ with gr.Blocks(js=js) as demo:
|
|
302 |
clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
|
303 |
clone_button = gr.Button(value="Clone speaker")
|
304 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
clone_button.click(
|
306 |
fn=clone_speaker,
|
307 |
inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
|
@@ -310,7 +406,7 @@ with gr.Blocks(js=js) as demo:
|
|
310 |
|
311 |
tts_button.click(
|
312 |
fn=tts,
|
313 |
-
inputs=[text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
|
314 |
,speed,top_p,top_k,AllFileList
|
315 |
],
|
316 |
outputs=[AudioList],
|
|
|
7 |
import zipfile
|
8 |
import random
|
9 |
import xtts
|
10 |
+
import re
|
11 |
|
12 |
DO_CHECK = os.getenv('DO_CHECK', '1')
|
13 |
OUTPUT = "./demo_outputs"
|
|
|
84 |
return result_dict, filtered_string
|
85 |
|
86 |
|
87 |
+
def ParsePronucs(PronuncStr):
    """Parse the "Pronunciation Fix" text into a list of replacement rules.

    Each non-blank line is expected to look like::

        WORD = SPOKEN TEXT | opt1,opt2

    The ``| options`` suffix is optional. Only the first ``=`` and the first
    ``|`` act as separators, so the spoken text may itself contain ``=`` and
    the option string may contain ``|``.

    Args:
        PronuncStr: Raw multi-line string typed by the user. ``None`` or an
            empty string yields an empty list.

    Returns:
        A list of dicts, one per valid line and in input order, with keys
        ``'word'`` (text to look for), ``'text'`` (replacement to speak) and
        ``'opts'`` (list of option strings, possibly empty).

    Lines that have no ``=`` or whose word part is empty are skipped instead
    of raising: a malformed line should not abort the whole request, and an
    empty word would turn into a pattern that matches at every word boundary
    when the caller builds its ``\\b{word}\\b`` regex.
    """
    PronuncWords = []

    if not PronuncStr:
        return PronuncWords

    for raw_line in PronuncStr.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Split on the first '=' only; everything after it is the spoken text
        # (plus the optional '|'-separated options).
        word, sep, rest = line.partition('=')
        word = word.strip()
        if not sep or not word:
            # No "WORD = ..." structure on this line; ignore it.
            continue

        text, _, opt_str = rest.partition('|')
        text = text.strip()

        # Strip each option so "a , cs" yields ['a', 'cs'] and a membership
        # check such as `'cs' in opts` works regardless of spacing.
        opts = [opt.strip() for opt in opt_str.split(',') if opt.strip()]

        PronuncWords.append({'word': word, 'text': text, 'opts': opts})

    return PronuncWords
|
111 |
+
|
112 |
+
|
113 |
def FindSpeakerByName(name, speakerType):
|
114 |
|
115 |
srcItems = STUDIO_SPEAKERS if speakerType == "Studio" else cloned_speakers;
|
|
|
131 |
cloned_speaker_names.append(clone_speaker_name)
|
132 |
return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown(choices=cloned_speaker_names)
|
133 |
|
134 |
+
def tts(text, pronunc, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
|
135 |
,speed,top_p,top_k, AllFileList,progress=gr.Progress()
|
136 |
):
|
137 |
embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom]
|
138 |
|
139 |
+
|
140 |
# break at line!
|
141 |
lines = text.split("---");
|
142 |
totalLines = len(lines);
|
|
|
149 |
CurrentPrefix = DefaultPrefix
|
150 |
|
151 |
|
152 |
+
# break pronuc
|
153 |
+
Pronuncs = ParsePronucs(pronunc)
|
154 |
+
|
155 |
+
|
156 |
+
|
157 |
+
|
158 |
AudioList = [];
|
159 |
for line in progress.tqdm(lines, desc="Gerando fala..."):
|
160 |
audioNum += 1;
|
|
|
187 |
|
188 |
if not speakerName:
|
189 |
raise ValueError("InvalidSpeaker: "+speakerName)
|
190 |
+
|
191 |
+
|
192 |
+
|
193 |
+
FixedText = cleanLine;
|
194 |
+
|
195 |
+
for pronunc in Pronuncs:
|
196 |
+
word = pronunc['word']
|
197 |
+
text = pronunc['text']
|
198 |
+
opts = pronunc['opts'];
|
199 |
+
|
200 |
+
flg = re.IGNORECASE
|
201 |
+
|
202 |
+
if 'cs' in opts:
|
203 |
+
flg = 0;
|
204 |
+
|
205 |
+
FixedText = re.sub(f'\\b{word}\\b', text, FixedText, flags=flg)
|
206 |
|
207 |
ipts = xtts.TTSInputs(
|
208 |
speaker_embedding=embeddings["speaker_embedding"],
|
209 |
gpt_cond_latent=embeddings["gpt_cond_latent"],
|
210 |
+
text=FixedText,
|
211 |
language=lang,
|
212 |
temperature=temperature,
|
213 |
speed=speed,
|
|
|
295 |
cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
|
296 |
AllFileList = gr.State(list([]))
|
297 |
|
298 |
+
gr.Markdown("By using any functionality of this space, you agree to the terms of this license: https://coqui.ai/cpml")
|
299 |
|
300 |
with gr.Tab("TTS"):
|
301 |
with gr.Column() as row4:
|
|
|
318 |
top_k = gr.Number(label="TOP K",value=50)
|
319 |
speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
|
320 |
with gr.Column() as col2:
|
321 |
+
with gr.Row():
|
322 |
+
text = gr.Textbox(label="text",lines=4, value="Customizado por IA Talking, o maior blog de Inteligência Artificial do Brasil!")
|
323 |
+
pronunc = gr.Textbox(label="Pronunciation Fix", info="Fix words pronuncation using WORD = SPEAK",lines=4)
|
324 |
+
with gr.Row():
|
325 |
+
lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
|
326 |
+
tts_button = gr.Button(value="TTS")
|
327 |
with gr.Column() as col3:
|
328 |
# FileList = gr.FileExplorer(
|
329 |
# glob="*.wav",
|
|
|
355 |
clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
|
356 |
clone_button = gr.Button(value="Clone speaker")
|
357 |
|
358 |
+
|
359 |
+
with gr.Tab("Help"):
|
360 |
+
gr.Markdown("""
|
361 |
+
Welcome to the XTTS WebUI version customized by the IA Talking blog (https://iatalk.ing).
|
362 |
+
|
363 |
+
The main goal of this space is to share more scenarios on how XTTS can be used, as well as serve as a study resource to learn more about the TTS process and AI.
|
364 |
+
|
365 |
+
In this version, we have some customizations that are quite useful.
|
366 |
+
|
367 |
+
# Multiple audios
|
368 |
+
You can generate multiple audios at once by separating the text with three dashes. For example:
|
369 |
+
|
370 |
+
```
|
371 |
+
Text 1
|
372 |
+
---
|
373 |
+
Text 2, line 1
|
374 |
+
Text 2, line 2
|
375 |
+
```
|
376 |
+
|
377 |
+
In the above example, 2 audio files will be generated! This is very useful when you want to generate a lot of audio but don't want to generate it all at once due to the context lost in XTTS.
|
378 |
+
You can also specify variables that modify certain aspects.
|
379 |
+
|
380 |
+
For example, `!speaker = Dionisio` forces the speaker to be Dionisio only for that specific audio.
|
381 |
+
|
382 |
+
List of variables:
|
383 |
+
- `speaker` = name of the speaker
|
384 |
+
- `num` = file number (by default, it's the sequential number)
|
385 |
+
- `prefix` = file name prefix
|
386 |
+
|
387 |
+
# Pronunciation adjustment
|
388 |
+
|
389 |
+
If you have a text that you cannot or do not want to change the content of, you can use the Pronunciation field to map words with different pronunciations.
|
390 |
+
|
391 |
+
Simply separate them by each line. Example:
|
392 |
+
|
393 |
+
```
|
394 |
+
API = A,P,I
|
395 |
+
SomeFunctionCode = Function Code
|
396 |
+
```
|
397 |
+
|
398 |
+
This is useful for mapping foreign words, abbreviations, acronyms, code, etc.
|
399 |
+
""")
|
400 |
+
|
401 |
clone_button.click(
|
402 |
fn=clone_speaker,
|
403 |
inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
|
|
|
406 |
|
407 |
tts_button.click(
|
408 |
fn=tts,
|
409 |
+
inputs=[text, pronunc,speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
|
410 |
,speed,top_p,top_k,AllFileList
|
411 |
],
|
412 |
outputs=[AudioList],
|