# generateAudio / app.py
# (Hugging Face Spaces file-page residue — contributor name, commit 2d78591,
#  raw/history/blame links, file size 3.08 kB — converted to a comment so the
#  file is valid Python.)
import gradio as gr
from gradio.inputs import Textbox
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import random
import string
import soundfile as sf
import nltk
from nltk.tokenize import word_tokenize
# Run on GPU when available; every model and tensor is moved to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
# BUG FIX: word_tokenize (used in generateAudio) needs NLTK's "punkt"
# tokenizer data; fetch it once at startup so the first request does not
# fail with a LookupError.
nltk.download("punkt", quiet=True)
# Text processor (tokenizer + feature extractor) for SpeechT5 TTS.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# Acoustic model: text tokens -> mel spectrogram.
model = SpeechT5ForTextToSpeech.from_pretrained(
    "microsoft/speecht5_tts").to(device)
# HiFi-GAN vocoder: mel spectrogram -> waveform.
vocoder = SpeechT5HifiGan.from_pretrained(
    "microsoft/speecht5_hifigan").to(device)
# Speaker x-vector embeddings used to condition the voice of the output.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors", split="validation")
# Row indices into embeddings_dataset for named CMU ARCTIC speakers.
speakers = {
    'awb': 0,  # Scottish male
    'bdl': 1138,  # US male
    'clb': 2271,  # US female
    'jmk': 3403,  # Canadian male
    'ksp': 4535,  # Indian male
    'rms': 5667,  # US male
    'slt': 6799  # US female
}
def generateAudio(text_to_audio, s3_save_as):
    """Synthesize speech for *text_to_audio* and save it to a local file.

    Parameters
    ----------
    text_to_audio : str
        Text to convert to speech; truncated to 500 tokens before synthesis.
    s3_save_as : str
        Currently unused; kept so the Gradio interface signature is stable.

    Returns
    -------
    str
        Human-readable confirmation message containing the saved filename.
    """
    def recortar_texto(texto, max_tokens=500):
        # Cap input length: SpeechT5 generation degrades on very long prompts.
        try:
            tokens = word_tokenize(texto)
        except LookupError:
            # BUG FIX: NLTK "punkt" data may be missing at runtime — fall
            # back to a plain whitespace split instead of failing the request.
            tokens = texto.split()
        if len(tokens) <= max_tokens:
            return texto
        return ' '.join(tokens[:max_tokens])

    def _filename_stem(text):
        # BUG FIX: the slug is built from raw user text, which may contain
        # '/', ':' and other characters illegal in filenames — sanitize it.
        stem = '-'.join(text.split()[:6])
        return ''.join(ch if ch.isalnum() or ch in '-_' else '_' for ch in stem)

    def save_text_to_speech(text, speaker=None):
        """Generate speech for *text* and return the output filename.

        *speaker* is a row index into ``embeddings_dataset``, or None for a
        random voice.
        """
        # Truncate, then tokenize for the model.
        text = recortar_texto(text, max_tokens=500)
        inputs = processor(text=text, return_tensors="pt").to(device)
        if speaker is not None:
            # x-vector describing the chosen speaker's voice characteristics.
            speaker_embeddings = torch.tensor(
                embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
        else:
            # No speaker given: condition on a random embedding (random voice).
            speaker_embeddings = torch.randn((1, 512)).to(device)
        # Generate the waveform with the acoustic model + vocoder.
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        if speaker is not None:
            # Deterministic name: speaker id + first words of the text.
            output_filename = f"{speaker}-{_filename_stem(text)}.mp3"
        else:
            # Random voice: prefix with a random 5-char string instead.
            random_str = ''.join(random.sample(
                string.ascii_letters + string.digits, k=5))
            output_filename = f"{random_str}-{_filename_stem(text)}.mp3"
        # Save at 16 kHz (SpeechT5's output rate).
        # NOTE(review): soundfile infers the container from the extension;
        # MP3 writing needs libsndfile >= 1.1 — confirm, or use ".wav".
        sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
        return output_filename

    # The UI always uses the 'clb' voice (US female, dataset row 2271).
    output_filename = save_text_to_speech(text_to_audio, 2271)
    return f"Saved {output_filename}"
# BUG FIX: the original passed the undefined name `text_to_image` as the
# callback (NameError at import time); the function defined above is
# `generateAudio`.
iface = gr.Interface(
    fn=generateAudio,
    inputs=[Textbox(label="text_to_audio"), Textbox(label="s3_save_as")],
    outputs="text",
)
iface.launch()