Spaces:

rrg92
/

xtts

Running on Zero

App Files Files Community

rrg92 committed on Sep 7, 2024

Commit

0a08964

1 Parent(s): 592c43f

License and minor adjusts

Browse files

Files changed (2) hide show

Dockerfile +2 -0
app.py +103 -7

Dockerfile CHANGED Viewed

@@ -13,6 +13,8 @@ RUN python -m pip install --use-deprecated=legacy-resolver -r requirements.txt \
 RUN python -m unidic download
 RUN mkdir -p /app/tts_models
 COPY xtts.py .
 COPY app.py .

 RUN python -m unidic download
 RUN mkdir -p /app/tts_models
+RUN python -m pip install spaces
 COPY xtts.py .
 COPY app.py .

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ from os.path import abspath
 import zipfile
 import random
 import xtts
 DO_CHECK = os.getenv('DO_CHECK', '1')
 OUTPUT = "./demo_outputs"
@@ -84,6 +84,32 @@ def ExtractVars(input_string):
     return result_dict, filtered_string
 def FindSpeakerByName(name, speakerType):
     srcItems = STUDIO_SPEAKERS if speakerType == "Studio" else cloned_speakers;
@@ -105,11 +131,12 @@ def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names):
     cloned_speaker_names.append(clone_speaker_name)
     return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown(choices=cloned_speaker_names)
-def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
     ,speed,top_p,top_k, AllFileList,progress=gr.Progress()
 ):
     embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom]
     # break at line!
     lines = text.split("---");
     totalLines = len(lines);
@@ -122,6 +149,12 @@ def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temp
     CurrentPrefix = DefaultPrefix
     AudioList = [];
     for line in progress.tqdm(lines, desc="Gerando fala..."):
         audioNum += 1;
@@ -154,11 +187,27 @@ def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temp
         if not speakerName:
              raise ValueError("InvalidSpeaker: "+speakerName)
         ipts = xtts.TTSInputs(
             speaker_embedding=embeddings["speaker_embedding"],
             gpt_cond_latent=embeddings["gpt_cond_latent"],
-            text=cleanLine,
             language=lang,
             temperature=temperature,
             speed=speed,
@@ -246,6 +295,7 @@ with gr.Blocks(js=js) as demo:
     cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
     AllFileList = gr.State(list([]))
     with gr.Tab("TTS"):
         with gr.Column() as row4:
@@ -268,9 +318,12 @@ with gr.Blocks(js=js) as demo:
                 top_k = gr.Number(label="TOP K",value=50)
                 speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
         with gr.Column() as col2:
-            lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
-            text = gr.Textbox(label="text",lines=4, value="A quick brown fox jumps over the lazy dog.")
-            tts_button = gr.Button(value="TTS")
         with gr.Column() as col3:
             # FileList = gr.FileExplorer(
             #     glob="*.wav",
@@ -302,6 +355,49 @@ with gr.Blocks(js=js) as demo:
             clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
             clone_button = gr.Button(value="Clone speaker")
     clone_button.click(
         fn=clone_speaker,
         inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
@@ -310,7 +406,7 @@ with gr.Blocks(js=js) as demo:
     tts_button.click(
         fn=tts,
-        inputs=[text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
                 ,speed,top_p,top_k,AllFileList
                 ],
         outputs=[AudioList],

 import zipfile
 import random
 import xtts
+import re
 DO_CHECK = os.getenv('DO_CHECK', '1')
 OUTPUT = "./demo_outputs"
     return result_dict, filtered_string
def ParsePronucs(PronuncStr):
    """Parse the pronunciation-fix text into a list of replacement rules.

    Each non-blank line is expected to look like ``WORD = SPEAK`` or
    ``WORD = SPEAK | opt1,opt2``. Only the first ``=`` separates the word
    from its replacement, and only the first ``|`` separates the
    replacement from the comma-separated options.

    Lines without ``=`` or with an empty word are skipped instead of
    raising, so one malformed line does not abort the whole request.

    Args:
        PronuncStr: Multi-line string with one rule per line. ``None`` or
            an empty string yields an empty list.

    Returns:
        A list of dicts, in input order, each with the keys ``'word'``
        (str), ``'text'`` (str) and ``'opts'`` (list of str).
    """
    PronuncWords = []
    for line in (PronuncStr or "").split('\n'):
        word, sep, rest = line.partition('=')
        word = word.strip()
        # No '=' means there is no replacement; an empty word has nothing
        # to match against. Either way the line cannot form a usable rule.
        if not sep or not word:
            continue
        text, _, rawOpts = rest.partition('|')
        # Strip every option and drop empty ones so "cs, x" and a trailing
        # "|" both produce clean option lists.
        opts = [opt.strip() for opt in rawOpts.split(',') if opt.strip()]
        PronuncWords.append({'word': word, 'text': text.strip(), 'opts': opts})
    return PronuncWords
 def FindSpeakerByName(name, speakerType):
     srcItems = STUDIO_SPEAKERS if speakerType == "Studio" else cloned_speakers;
     cloned_speaker_names.append(clone_speaker_name)
     return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown(choices=cloned_speaker_names)
+def tts(text, pronunc, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
     ,speed,top_p,top_k, AllFileList,progress=gr.Progress()
 ):
     embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom]
     # break at line!
     lines = text.split("---");
     totalLines = len(lines);
     CurrentPrefix = DefaultPrefix
+    # break pronuc
+    Pronuncs = ParsePronucs(pronunc)
     AudioList = [];
     for line in progress.tqdm(lines, desc="Gerando fala..."):
         audioNum += 1;
         if not speakerName:
              raise ValueError("InvalidSpeaker: "+speakerName)
+        FixedText = cleanLine;
+        for pronunc in Pronuncs:
+            word = pronunc['word']
+            text = pronunc['text']
+            opts = pronunc['opts'];
+            flg = re.IGNORECASE
+            if 'cs' in opts:
+                flg = 0;
+            FixedText = re.sub(f'\\b{word}\\b', text, FixedText, flags=flg)
         ipts = xtts.TTSInputs(
             speaker_embedding=embeddings["speaker_embedding"],
             gpt_cond_latent=embeddings["gpt_cond_latent"],
+            text=FixedText,
             language=lang,
             temperature=temperature,
             speed=speed,
     cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
     AllFileList = gr.State(list([]))
+    gr.Markdown("By using any functionality of this space, you agree to the terms of this license: https://coqui.ai/cpml")
     with gr.Tab("TTS"):
         with gr.Column() as row4:
                 top_k = gr.Number(label="TOP K",value=50)
                 speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
         with gr.Column() as col2:
+            with gr.Row():
+                text = gr.Textbox(label="text",lines=4, value="Customizado por IA Talking, o maior blog de Inteligência Artificial do Brasil!")
+                pronunc = gr.Textbox(label="Pronunciation Fix", info="Fix words pronuncation using WORD = SPEAK",lines=4)
+            with gr.Row():
+                lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
+                tts_button = gr.Button(value="TTS")
         with gr.Column() as col3:
             # FileList = gr.FileExplorer(
             #     glob="*.wav",
             clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
             clone_button = gr.Button(value="Clone speaker")
+    with gr.Tab("Help"):
+         gr.Markdown("""
+Welcome to the XTTS WebUI version customized by the IA Talking blog (https://iatalk.ing).
+The main goal of this space is to share more scenarios on how XTTS can be used, as well as serve as a study resource to learn more about the TTS process and AI.
+In this version, we have some customizations that are quite useful.
+# Multiple audios
+You can generate multiple audios at once by separating the text with three dashes. For example:
+```
+Text 1
+---
+Text 2, line 1
+Text 2, line 2
+```
+In the above example, 2 audio files will be generated! This is very useful when you want to generate a lot of audio but don't want to generate it all at once due to the context lost in XTTS.
+You can also specify variables that modify certain aspects.
+For example, `!speaker = Dionisio` forces the speaker to be Dionisio only for that specific audio.
+List of variables:
+- `speaker` = name of the speaker
+- `num` = file number (by default, it's the sequential number)
+- `prefix` = file name prefix
+# Pronunciation adjustment
+If you have a text that you cannot or do not want to change the content of, you can use the Pronunciation field to map words with different pronunciations.
+Simply separate them by each line. Example:
+```
+API = A,P,I
+SomeFunctionCode = Function Code
+```
+This is useful for mapping foreign words, abbreviations, acronyms, code, etc.
+         """)
     clone_button.click(
         fn=clone_speaker,
         inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
     tts_button.click(
         fn=tts,
+        inputs=[text, pronunc,speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
                 ,speed,top_p,top_k,AllFileList
                 ],
         outputs=[AudioList],