"""Gradio front-end that turns text into speech with Microsoft SpeechT5.

The models and the speaker-embedding dataset are loaded once at import time;
each request synthesizes the submitted text and saves it to an audio file in
the current working directory.
"""

import random
import re
import string

import gradio as gr
import nltk
import soundfile as sf
import torch
from datasets import load_dataset
from gradio.inputs import Textbox
from nltk.tokenize import word_tokenize
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)

device = "cuda" if torch.cuda.is_available() else "cpu"

# load the processor
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# load the model
model = SpeechT5ForTextToSpeech.from_pretrained(
    "microsoft/speecht5_tts").to(device)

# load the HiFi-GAN vocoder that is handed to `generate_speech` below
vocoder = SpeechT5HifiGan.from_pretrained(
    "microsoft/speecht5_hifigan").to(device)

# we load this dataset to get the speaker embeddings
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors", split="validation")

# speaker ids from the embeddings dataset
speakers = {
    'awb': 0,     # Scottish male
    'bdl': 1138,  # US male
    'clb': 2271,  # US female
    'jmk': 3403,  # Canadian male
    'ksp': 4535,  # Indian male
    'rms': 5667,  # US male
    'slt': 6799,  # US female
}


def _truncate_text(text: str, max_tokens: int = 500) -> str:
    """Return *text* limited to its first *max_tokens* word tokens.

    Text that is already short enough is returned untouched; longer text is
    rebuilt by joining the kept tokens with single spaces.
    """
    try:
        tokens = word_tokenize(text)
    except LookupError:
        # The NLTK tokenizer data is not installed; whitespace splitting is
        # good enough for a length cap and keeps the request from failing.
        tokens = text.split()
    if len(tokens) <= max_tokens:
        return text
    return ' '.join(tokens[:max_tokens])


def _filename_slug(text: str, max_words: int = 6) -> str:
    """Build a filesystem-safe slug from the first *max_words* words of *text*.

    The text comes straight from the web UI, so everything except word
    characters and '-' is dropped; this keeps path separators and '..' out of
    the output filename.
    """
    cleaned = (re.sub(r"[^\w-]", "", word) for word in text.split()[:max_words])
    return '-'.join(word for word in cleaned if word)


def _save_text_to_speech(text: str, speaker: "int | None" = None) -> str:
    """Synthesize *text* and write it to an audio file.

    Args:
        text: Text to speak; truncated to 500 word tokens first.
        speaker: Row index into ``embeddings_dataset`` whose x-vector is used
            as the voice, or ``None`` for a random voice.

    Returns:
        The name of the file that was written.
    """
    text = _truncate_text(text, max_tokens=500)
    # NOTE(review): the cap above counts words, while the model's own input
    # length limit is expressed in tokenizer tokens - confirm that 500 words
    # always fits.
    inputs = processor(text=text, return_tensors="pt").to(device)

    if speaker is not None:
        # load xvector containing speaker's voice characteristics from a dataset
        speaker_embeddings = torch.tensor(
            embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
        # if we have a speaker, we use the speaker's ID in the filename
        prefix = str(speaker)
    else:
        # random vector, meaning a random voice
        speaker_embeddings = torch.randn((1, 512)).to(device)
        # if we don't have a speaker, we use a random string in the filename
        prefix = ''.join(random.sample(
            string.ascii_letters + string.digits, k=5))

    # generate speech with the models
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # NOTE(review): writing ".mp3" through soundfile presumably needs a
    # libsndfile build with MP3 support - confirm on the deployment target.
    output_filename = f"{prefix}-{_filename_slug(text)}.mp3"
    # save the generated speech to a file with 16KHz sampling rate
    sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
    return output_filename


def generateAudio(text_to_audio, s3_save_as):
    """Synthesize *text_to_audio* with the 'clb' voice and report the file name.

    Args:
        text_to_audio: Text entered in the UI.
        s3_save_as: Second UI field; accepted but currently unused.

    Returns:
        A status string of the form ``"Saved <filename>"``.
    """
    output_filename = _save_text_to_speech(text_to_audio, speakers['clb'])
    return f"Saved {output_filename}"


# `fn` must be the handler defined above; the previous value named a function
# that does not exist in this file.
iface = gr.Interface(
    fn=generateAudio,
    inputs=[Textbox(label="text_to_audio"), Textbox(label="s3_save_as")],
    outputs="text",
)
iface.launch()