rrg92 committed on
Commit
0a08964
·
1 Parent(s): 592c43f

License and minor adjustments

Browse files
Files changed (2) hide show
  1. Dockerfile +2 -0
  2. app.py +103 -7
Dockerfile CHANGED
@@ -13,6 +13,8 @@ RUN python -m pip install --use-deprecated=legacy-resolver -r requirements.txt \
13
  RUN python -m unidic download
14
  RUN mkdir -p /app/tts_models
15
 
 
 
16
  COPY xtts.py .
17
  COPY app.py .
18
 
 
13
  RUN python -m unidic download
14
  RUN mkdir -p /app/tts_models
15
 
16
+ RUN python -m pip install spaces
17
+
18
  COPY xtts.py .
19
  COPY app.py .
20
 
app.py CHANGED
@@ -7,7 +7,7 @@ from os.path import abspath
7
  import zipfile
8
  import random
9
  import xtts
10
-
11
 
12
  DO_CHECK = os.getenv('DO_CHECK', '1')
13
  OUTPUT = "./demo_outputs"
@@ -84,6 +84,32 @@ def ExtractVars(input_string):
84
  return result_dict, filtered_string
85
 
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  def FindSpeakerByName(name, speakerType):
88
 
89
  srcItems = STUDIO_SPEAKERS if speakerType == "Studio" else cloned_speakers;
@@ -105,11 +131,12 @@ def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names):
105
  cloned_speaker_names.append(clone_speaker_name)
106
  return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown(choices=cloned_speaker_names)
107
 
108
- def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
109
  ,speed,top_p,top_k, AllFileList,progress=gr.Progress()
110
  ):
111
  embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom]
112
 
 
113
  # break at line!
114
  lines = text.split("---");
115
  totalLines = len(lines);
@@ -122,6 +149,12 @@ def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temp
122
  CurrentPrefix = DefaultPrefix
123
 
124
 
 
 
 
 
 
 
125
  AudioList = [];
126
  for line in progress.tqdm(lines, desc="Gerando fala..."):
127
  audioNum += 1;
@@ -154,11 +187,27 @@ def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temp
154
 
155
  if not speakerName:
156
  raise ValueError("InvalidSpeaker: "+speakerName)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
  ipts = xtts.TTSInputs(
159
  speaker_embedding=embeddings["speaker_embedding"],
160
  gpt_cond_latent=embeddings["gpt_cond_latent"],
161
- text=cleanLine,
162
  language=lang,
163
  temperature=temperature,
164
  speed=speed,
@@ -246,6 +295,7 @@ with gr.Blocks(js=js) as demo:
246
  cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
247
  AllFileList = gr.State(list([]))
248
 
 
249
 
250
  with gr.Tab("TTS"):
251
  with gr.Column() as row4:
@@ -268,9 +318,12 @@ with gr.Blocks(js=js) as demo:
268
  top_k = gr.Number(label="TOP K",value=50)
269
  speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
270
  with gr.Column() as col2:
271
- lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
272
- text = gr.Textbox(label="text",lines=4, value="A quick brown fox jumps over the lazy dog.")
273
- tts_button = gr.Button(value="TTS")
 
 
 
274
  with gr.Column() as col3:
275
  # FileList = gr.FileExplorer(
276
  # glob="*.wav",
@@ -302,6 +355,49 @@ with gr.Blocks(js=js) as demo:
302
  clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
303
  clone_button = gr.Button(value="Clone speaker")
304
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  clone_button.click(
306
  fn=clone_speaker,
307
  inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
@@ -310,7 +406,7 @@ with gr.Blocks(js=js) as demo:
310
 
311
  tts_button.click(
312
  fn=tts,
313
- inputs=[text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
314
  ,speed,top_p,top_k,AllFileList
315
  ],
316
  outputs=[AudioList],
 
7
  import zipfile
8
  import random
9
  import xtts
10
+ import re
11
 
12
  DO_CHECK = os.getenv('DO_CHECK', '1')
13
  OUTPUT = "./demo_outputs"
 
84
  return result_dict, filtered_string
85
 
86
 
87
def ParsePronucs(PronuncStr):
    """Parse the pronunciation-fix text into a list of replacement rules.

    Each non-blank line is expected to have the form ``WORD = SPEAK`` or
    ``WORD = SPEAK | opt1,opt2``. Only the first ``=`` separates the word
    from the spoken text, and only the first ``|`` separates the spoken
    text from the comma-separated options.

    Lines that are blank, have no ``=``, or have an empty word are skipped
    instead of raising, so one malformed line does not abort the whole
    parse.

    Args:
        PronuncStr: Multi-line string with one rule per line. ``None`` or
            an empty string yields an empty list.

    Returns:
        A list of dicts, in input order, each with the keys ``'word'``
        (str), ``'text'`` (str) and ``'opts'`` (list of non-empty,
        whitespace-stripped option strings).
    """
    PronuncWords = []
    if not PronuncStr:
        return PronuncWords

    for raw_line in PronuncStr.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # Split on the first '=' only, so the spoken text may contain '='.
        word, sep, rest = line.partition('=')
        word = word.strip()
        if not sep or not word:
            # No '=' at all, or nothing to match on: ignore the line.
            continue

        # Everything after the first '|' is the option list.
        text, _, raw_opts = rest.partition('|')
        # Strip every option and drop empties so "a, cs" yields ['a', 'cs'].
        opts = [opt.strip() for opt in raw_opts.split(',') if opt.strip()]

        PronuncWords.append({'word': word, 'text': text.strip(), 'opts': opts})

    return PronuncWords
111
+
112
+
113
  def FindSpeakerByName(name, speakerType):
114
 
115
  srcItems = STUDIO_SPEAKERS if speakerType == "Studio" else cloned_speakers;
 
131
  cloned_speaker_names.append(clone_speaker_name)
132
  return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown(choices=cloned_speaker_names)
133
 
134
+ def tts(text, pronunc, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
135
  ,speed,top_p,top_k, AllFileList,progress=gr.Progress()
136
  ):
137
  embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom]
138
 
139
+
140
  # break at line!
141
  lines = text.split("---");
142
  totalLines = len(lines);
 
149
  CurrentPrefix = DefaultPrefix
150
 
151
 
152
+ # break pronuc
153
+ Pronuncs = ParsePronucs(pronunc)
154
+
155
+
156
+
157
+
158
  AudioList = [];
159
  for line in progress.tqdm(lines, desc="Gerando fala..."):
160
  audioNum += 1;
 
187
 
188
  if not speakerName:
189
  raise ValueError("InvalidSpeaker: "+speakerName)
190
+
191
+
192
+
193
+ FixedText = cleanLine;
194
+
195
+ for pronunc in Pronuncs:
196
+ word = pronunc['word']
197
+ text = pronunc['text']
198
+ opts = pronunc['opts'];
199
+
200
+ flg = re.IGNORECASE
201
+
202
+ if 'cs' in opts:
203
+ flg = 0;
204
+
205
+ FixedText = re.sub(f'\\b{word}\\b', text, FixedText, flags=flg)
206
 
207
  ipts = xtts.TTSInputs(
208
  speaker_embedding=embeddings["speaker_embedding"],
209
  gpt_cond_latent=embeddings["gpt_cond_latent"],
210
+ text=FixedText,
211
  language=lang,
212
  temperature=temperature,
213
  speed=speed,
 
295
  cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
296
  AllFileList = gr.State(list([]))
297
 
298
+ gr.Markdown("By using any functionality of this space, you agree to the terms of this license: https://coqui.ai/cpml")
299
 
300
  with gr.Tab("TTS"):
301
  with gr.Column() as row4:
 
318
  top_k = gr.Number(label="TOP K",value=50)
319
  speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
320
  with gr.Column() as col2:
321
+ with gr.Row():
322
+ text = gr.Textbox(label="text",lines=4, value="Customizado por IA Talking, o maior blog de Inteligência Artificial do Brasil!")
323
+ pronunc = gr.Textbox(label="Pronunciation Fix", info="Fix words pronuncation using WORD = SPEAK",lines=4)
324
+ with gr.Row():
325
+ lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
326
+ tts_button = gr.Button(value="TTS")
327
  with gr.Column() as col3:
328
  # FileList = gr.FileExplorer(
329
  # glob="*.wav",
 
355
  clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
356
  clone_button = gr.Button(value="Clone speaker")
357
 
358
+
359
+ with gr.Tab("Help"):
360
+ gr.Markdown("""
361
+ Welcome to the XTTS WebUI version customized by the IA Talking blog (https://iatalk.ing).
362
+
363
+ The main goal of this space is to share more scenarios on how XTTS can be used, as well as serve as a study resource to learn more about the TTS process and AI.
364
+
365
+ In this version, we have some customizations that are quite useful.
366
+
367
+ # Multiple audios
368
+ You can generate multiple audios at once by separating the text with three dashes. For example:
369
+
370
+ ```
371
+ Text 1
372
+ ---
373
+ Text 2, line 1
374
+ Text 2, line 2
375
+ ```
376
+
377
+ In the above example, 2 audio files will be generated! This is very useful when you want to generate a lot of audio but don't want to generate it all at once due to the context lost in XTTS.
378
+ You can also specify variables that modify certain aspects.
379
+
380
+ For example, `!speaker = Dionisio` forces the speaker to be Dionisio only for that specific audio.
381
+
382
+ List of variables:
383
+ - `speaker` = name of the speaker
384
+ - `num` = file number (by default, it's the sequential number)
385
+ - `prefix` = file name prefix
386
+
387
+ # Pronunciation adjustment
388
+
389
+ If you have a text that you cannot or do not want to change the content of, you can use the Pronunciation field to map words with different pronunciations.
390
+
391
+ Simply separate them by each line. Example:
392
+
393
+ ```
394
+ API = A,P,I
395
+ SomeFunctionCode = Function Code
396
+ ```
397
+
398
+ This is useful for mapping foreign words, abbreviations, acronyms, code, etc.
399
+ """)
400
+
401
  clone_button.click(
402
  fn=clone_speaker,
403
  inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
 
406
 
407
  tts_button.click(
408
  fn=tts,
409
+ inputs=[text, pronunc,speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
410
  ,speed,top_p,top_k,AllFileList
411
  ],
412
  outputs=[AudioList],