Spaces:

PineSearch
/

generateAudio

Paused

App Files Files Community

generateAudio / app.py

SAUL19

Update app.py

21e33f9 over 1 year ago

raw

history blame

3.85 kB

	import gradio as gr
	from gradio.inputs import Textbox
	import nltk
	nltk.download('punkt')
	from nltk.tokenize import word_tokenize
	import re
	from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
	from datasets import load_dataset
	import torch
	import random
	import string
	import soundfile as sf
	import boto3
	from io import BytesIO
	import os

	# S3 credentials and target bucket, taken from the process environment.
	# Each value is None when the corresponding variable is not set.
	AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
	AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
	S3_BUCKET_NAME = os.environ.get("BUCKET_NAME")

	# Run on the GPU when one is visible to torch, otherwise on the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# Text front-end that turns raw text into model inputs.
	processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

	# Text-to-speech model, moved to the selected device.
	model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
	model = model.to(device)

	# Vocoder (voice encoder) passed to the model when generating speech.
	vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
	vocoder = vocoder.to(device)

	# Dataset providing the "xvector" speaker embeddings.
	embeddings_dataset = load_dataset(
	    "Matthijs/cmu-arctic-xvectors",
	    split="validation",
	)

	# Speaker name -> row index in the embeddings dataset.
	speakers = {
	    "awb": 0,     # Scottish male
	    "bdl": 1138,  # US male
	    "clb": 2271,  # US female
	    "jmk": 3403,  # Canadian male
	    "ksp": 4535,  # Indian male
	    "rms": 5667,  # US male
	    "slt": 6799,  # US female
	}

	def generateAudio(text_to_audio, s3_save_as):
	    """Synthesize speech for a text and upload the audio to S3.

	    The text is cleaned and capped at 500 tokens, rendered with the
	    module-level SpeechT5 model and vocoder using the ``clb`` speaker
	    embedding, and uploaded to the bucket named by ``S3_BUCKET_NAME``.

	    Args:
	        text_to_audio: Text to convert to speech.
	        s3_save_as: Object name to store the audio under; ".mp3" is appended.

	    Returns:
	        The string "Saved audio: <url>", where <url> is the S3 URL built
	        for the uploaded object.
	    """

	    def cut_text(text, max_tokens=500):
	        """Clean ``text`` and keep at most ``max_tokens`` tokens of it."""
	        # Drop everything except word characters, whitespace, periods
	        # and commas.
	        text = re.sub(r"[^\w\s.,]", "", text)

	        # Tokenize the *cleaned* text. Tokenizing the raw input here would
	        # silently reintroduce the stripped characters whenever the text
	        # is long enough to be truncated.
	        tokens = word_tokenize(text)
	        if len(tokens) <= max_tokens:
	            return text

	        return ' '.join(tokens[:max_tokens])

	    def save_text_to_speech(text, speaker=None):
	        """Generate speech for ``text``, upload it, and return its S3 URL.

	        ``speaker`` is a row index into ``embeddings_dataset``; when it is
	        None a random embedding (i.e. a random voice) is used instead.
	        """
	        # Clean and trim the text before handing it to the processor.
	        text = cut_text(text, max_tokens=500)
	        inputs = processor(text=text, return_tensors="pt").to(device)

	        if speaker is not None:
	            # x-vector holding the speaker's voice characteristics
	            speaker_embeddings = torch.tensor(
	                embeddings_dataset[speaker]["xvector"]
	            ).unsqueeze(0).to(device)
	        else:
	            # random vector, meaning a random voice
	            speaker_embeddings = torch.randn((1, 512)).to(device)

	        speech = model.generate_speech(
	            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
	        )

	        # Encode into an in-memory buffer. soundfile cannot infer the
	        # container from a nameless file-like object, so the format must be
	        # given explicitly; MP3 matches the ".mp3" object key below.
	        # NOTE(review): MP3 output needs a libsndfile build with MP3
	        # support -- confirm for the deployment environment.
	        audio_buffer = BytesIO()
	        sf.write(
	            audio_buffer,
	            speech.cpu().numpy(),
	            samplerate=16000,
	            format="MP3",
	        )
	        audio_buffer.seek(0)

	        # Upload the audio buffer to S3.
	        s3_key = f"{s3_save_as}.mp3"
	        s3 = boto3.client(
	            's3',
	            aws_access_key_id=AWS_ACCESS_KEY_ID,
	            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
	        )
	        s3.upload_fileobj(audio_buffer, S3_BUCKET_NAME, s3_key)

	        # URL of the uploaded object.
	        return f"https://{S3_BUCKET_NAME}.s3.amazonaws.com/{s3_key}"

	    s3_url = save_text_to_speech(text_to_audio, speakers["clb"])
	    return f"Saved audio: {s3_url}"


	# Web UI: two text boxes, listed in the same order as generateAudio's
	# parameters (text first, then the S3 object name), and a text output.
	iface = gr.Interface(
	    generateAudio,
	    [Textbox(label="Text to Audio"), Textbox(label="S3 Save As")],
	    "text",
	)
	iface.launch()