Spaces:

PineSearch
/

generateAudio

Paused

App Files Files Community

SAUL19 committed on Jun 23, 2023

Commit

21e33f9

1 Parent(s): e345c78

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -18

app.py CHANGED Viewed

@@ -1,18 +1,25 @@
 import gradio as gr
 from gradio.inputs import Textbox
 import nltk
 nltk.download('punkt')
 from nltk.tokenize import word_tokenize
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from datasets import load_dataset
 import torch
 import random
 import string
 import soundfile as sf
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # load the processor
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 # load the model
@@ -21,7 +28,7 @@ model = SpeechT5ForTextToSpeech.from_pretrained(
 # load the vocoder, that is the voice encoder
 vocoder = SpeechT5HifiGan.from_pretrained(
     "microsoft/speecht5_hifigan").to(device)
-# we load this dataset to get the speaker embeddings
 embeddings_dataset = load_dataset(
     "Matthijs/cmu-arctic-xvectors", split="validation")
@@ -38,18 +45,21 @@ speakers = {
 def generateAudio(text_to_audio, s3_save_as):
-    def recortar_texto(texto, max_tokens=500):
-        tokens = word_tokenize(texto)
         if len(tokens) <= max_tokens:
-            return texto
-        recortado = ' '.join(tokens[:max_tokens])
-        return recortado
     def save_text_to_speech(text, speaker=None):
         # Clean the text and truncate it to the token limit
-        text = recortar_texto(text, max_tokens=500)
         # preprocess text
         inputs = processor(text=text, return_tensors="pt").to(device)
         if speaker is not None:
@@ -70,16 +80,33 @@ def generateAudio(text_to_audio, s3_save_as):
             random_str = ''.join(random.sample(
                 string.ascii_letters+string.digits, k=5))
             output_filename = f"{random_str}-{'-'.join(text.split()[:6])}.mp3"
-        # save the generated speech to a file with 16KHz sampling rate
-        sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
-        # return the filename for reference
-        return output_filename
-    output_filename = save_text_to_speech(text_to_audio, 2271)
-    return f"Saved {output_filename}"
-iface = gr.Interface(fn=generateAudio, inputs=[Textbox(label="text_to_audio"), Textbox(label="s3_save_as")], outputs="text")
-iface.launch()

 import gradio as gr
 from gradio.inputs import Textbox
 import nltk
 nltk.download('punkt')
 from nltk.tokenize import word_tokenize
+import re
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from datasets import load_dataset
 import torch
 import random
 import string
 import soundfile as sf
+import boto3
+from io import BytesIO
+import os
+AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
+AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
+S3_BUCKET_NAME = os.getenv("BUCKET_NAME")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # load the processor
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 # load the model
 # load the vocoder, that is the voice encoder
 vocoder = SpeechT5HifiGan.from_pretrained(
     "microsoft/speecht5_hifigan").to(device)
+# load the dataset to get the speaker embeddings
 embeddings_dataset = load_dataset(
     "Matthijs/cmu-arctic-xvectors", split="validation")
 def generateAudio(text_to_audio, s3_save_as):
+    def cut_text(text, max_tokens=500):
+        # Remove non-alphanumeric characters, except periods and commas
+        text = re.sub(r"[^\w\s.,]", "", text)
+        tokens = word_tokenize(text_to_audio)
         if len(tokens) <= max_tokens:
+            return text
+        cut = ' '.join(tokens[:max_tokens])
+        return cut
     def save_text_to_speech(text, speaker=None):
         # Clean the text and truncate it to the token limit
+        text = cut_text(text, max_tokens=500)
         # preprocess text
         inputs = processor(text=text, return_tensors="pt").to(device)
         if speaker is not None:
             random_str = ''.join(random.sample(
                 string.ascii_letters+string.digits, k=5))
             output_filename = f"{random_str}-{'-'.join(text.split()[:6])}.mp3"
+        # Save the generated speech to BytesIO buffer
+        audio_buffer = BytesIO()
+        sf.write(audio_buffer, speech.cpu().numpy(), samplerate=16000)
+        audio_buffer.seek(0)
+        # Upload the audio buffer to S3
+        s3_key = f"{s3_save_as}.mp3"
+        s3 = boto3.client(
+            's3',
+            aws_access_key_id=AWS_ACCESS_KEY_ID,
+            aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+        )
+        s3.upload_fileobj(audio_buffer, S3_BUCKET_NAME, s3_key)
+        # Return the S3 URL of the uploaded audio file
+        s3_url = f"https://{S3_BUCKET_NAME}.s3.amazonaws.com/{s3_key}"
+        return s3_url
+    s3_url = save_text_to_speech(text_to_audio, speakers["clb"])
+    return f"Saved audio: {s3_url}"
+iface = gr.Interface(
+    fn=generateAudio,
+    inputs=[Textbox(label="Text to Audio"), Textbox(label="S3 Save As")],
+    outputs="text"
+)
+iface.launch()