Spaces:

PineSearch
/

generateAudio

Paused

App Files Files Community

SAUL19 committed on Jun 23, 2023

Commit

2d78591

1 Parent(s): 37acd6d

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -3

app.py CHANGED Viewed

@@ -1,7 +1,83 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
 iface.launch()

 import gradio as gr
+from gradio.inputs import Textbox
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
+import torch
+import random
+import string
+import soundfile as sf
+import nltk
+from nltk.tokenize import word_tokenize
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# load the processor
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+# load the model
+model = SpeechT5ForTextToSpeech.from_pretrained(
+    "microsoft/speecht5_tts").to(device)
+# load the vocoder, that is the voice encoder
+vocoder = SpeechT5HifiGan.from_pretrained(
+    "microsoft/speecht5_hifigan").to(device)
+# we load this dataset to get the speaker embeddings
+embeddings_dataset = load_dataset(
+    "Matthijs/cmu-arctic-xvectors", split="validation")
+# speaker ids from the embeddings dataset
+speakers = {
+    'awb': 0,     # Scottish male
+    'bdl': 1138,  # US male
+    'clb': 2271,  # US female
+    'jmk': 3403,  # Canadian male
+    'ksp': 4535,  # Indian male
+    'rms': 5667,  # US male
+    'slt': 6799   # US female
+}
+def generateAudio(text_to_audio, s3_save_as):
+    def recortar_texto(texto, max_tokens=500):
+        tokens = word_tokenize(texto)
+        if len(tokens) <= max_tokens:
+            return texto
+        recortado = ' '.join(tokens[:max_tokens])
+        return recortado
+    def save_text_to_speech(text, speaker=None):
+        # Preprocess text and recortar
+        text = recortar_texto(text, max_tokens=500)
+        # preprocess text
+        inputs = processor(text=text, return_tensors="pt").to(device)
+        if speaker is not None:
+            # load xvector containing speaker's voice characteristics from a dataset
+            speaker_embeddings = torch.tensor(
+                embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
+        else:
+            # random vector, meaning a random voice
+            speaker_embeddings = torch.randn((1, 512)).to(device)
+        # generate speech with the models
+        speech = model.generate_speech(
+            inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+        if speaker is not None:
+            # if we have a speaker, we use the speaker's ID in the filename
+            output_filename = f"{speaker}-{'-'.join(text.split()[:6])}.mp3"
+        else:
+            # if we don't have a speaker, we use a random string in the filename
+            random_str = ''.join(random.sample(
+                string.ascii_letters+string.digits, k=5))
+            output_filename = f"{random_str}-{'-'.join(text.split()[:6])}.mp3"
+        # save the generated speech to a file with 16KHz sampling rate
+        sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
+        # return the filename for reference
+        return output_filename
+    output_filename = save_text_to_speech(text_to_audio, 2271)
+    return f"Saved {output_filename}"
+iface = gr.Interface(fn=text_to_image, inputs=[Textbox(label="text_to_audio"), Textbox(label="s3_save_as")], outputs="text")
 iface.launch()