SAUL19 committed on
Commit 2d78591 · 1 Parent(s): 37acd6d

Update app.py

Files changed (1)
  1. app.py +79 -3
app.py CHANGED
@@ -1,7 +1,83 @@
  import gradio as gr

- def greet(name):
-     return "Hello " + name + "!!"

- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
  iface.launch()
 
  import gradio as gr
+ from gradio.inputs import Textbox

+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from datasets import load_dataset
+ import torch
+ import random
+ import string
+ import soundfile as sf
+ import nltk
+ from nltk.tokenize import word_tokenize
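+
+ # word_tokenize relies on NLTK's "punkt" tokenizer data; fetching it up front
+ # keeps the Space from failing at runtime when the data is not already cached
+ nltk.download("punkt", quiet=True)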

+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ # load the processor
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+ # load the model
+ model = SpeechT5ForTextToSpeech.from_pretrained(
+     "microsoft/speecht5_tts").to(device)
+ # load the vocoder, which turns the model's spectrogram output into a waveform
+ vocoder = SpeechT5HifiGan.from_pretrained(
+     "microsoft/speecht5_hifigan").to(device)
+ # we load this dataset to get the speaker embeddings
+ embeddings_dataset = load_dataset(
+     "Matthijs/cmu-arctic-xvectors", split="validation")
+
+ # speaker ids from the embeddings dataset
+ speakers = {
+     'awb': 0,     # Scottish male
+     'bdl': 1138,  # US male
+     'clb': 2271,  # US female
+     'jmk': 3403,  # Canadian male
+     'ksp': 4535,  # Indian male
+     'rms': 5667,  # US male
+     'slt': 6799   # US female
+ }
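+ # each value above is the row index of that speaker's 512-dim x-vector in the
+ # cmu-arctic-xvectors validation split loaded into embeddings_dataset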
+
+ def generateAudio(text_to_audio, s3_save_as):
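+     # s3_save_as comes from the second Textbox but is not used in this
+     # function; only text_to_audio is turned into speech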
+
+     def recortar_texto(texto, max_tokens=500):
+         # truncate the text to at most max_tokens tokens
+         tokens = word_tokenize(texto)
+         if len(tokens) <= max_tokens:
+             return texto
+
+         recortado = ' '.join(tokens[:max_tokens])
+         return recortado
+
+     def save_text_to_speech(text, speaker=None):
+         # truncate the text so it fits the model input
+         text = recortar_texto(text, max_tokens=500)
+         # preprocess the text
+         inputs = processor(text=text, return_tensors="pt").to(device)
+         if speaker is not None:
+             # load the xvector containing the speaker's voice characteristics from the dataset
+             speaker_embeddings = torch.tensor(
+                 embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
+         else:
+             # random vector, meaning a random voice
+             speaker_embeddings = torch.randn((1, 512)).to(device)
+         # generate speech with the models
+         speech = model.generate_speech(
+             inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
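+         # since a vocoder is passed in, generate_speech returns the waveform
+         # directly rather than a mel spectrogram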
+         if speaker is not None:
+             # if we have a speaker, we use the speaker's ID in the filename
+             output_filename = f"{speaker}-{'-'.join(text.split()[:6])}.mp3"
+         else:
+             # if we don't have a speaker, we use a random string in the filename
+             random_str = ''.join(random.sample(
+                 string.ascii_letters + string.digits, k=5))
+             output_filename = f"{random_str}-{'-'.join(text.split()[:6])}.mp3"
+         # save the generated speech to a file with a 16 kHz sampling rate
+         sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
+         # return the filename for reference
+         return output_filename
+
+     output_filename = save_text_to_speech(text_to_audio, 2271)
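+     # 2271 is speakers['clb'], so every request is synthesized with the US female voice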
+
+     return f"Saved {output_filename}"
+
+ iface = gr.Interface(fn=generateAudio, inputs=[Textbox(label="text_to_audio"), Textbox(label="s3_save_as")], outputs="text")
  iface.launch()