SAUL19 committed on
Commit
21e33f9
·
1 Parent(s): e345c78

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -18
app.py CHANGED
@@ -1,18 +1,25 @@
1
  import gradio as gr
2
  from gradio.inputs import Textbox
3
-
4
  import nltk
5
  nltk.download('punkt')
6
  from nltk.tokenize import word_tokenize
7
-
8
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
9
  from datasets import load_dataset
10
  import torch
11
  import random
12
  import string
13
  import soundfile as sf
 
 
 
 
 
 
 
14
 
15
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
16
  # load the processor
17
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
18
  # load the model
@@ -21,7 +28,7 @@ model = SpeechT5ForTextToSpeech.from_pretrained(
21
  # load the vocoder, which turns the generated spectrogram into an audio waveform
22
  vocoder = SpeechT5HifiGan.from_pretrained(
23
  "microsoft/speecht5_hifigan").to(device)
24
- # we load this dataset to get the speaker embeddings
25
  embeddings_dataset = load_dataset(
26
  "Matthijs/cmu-arctic-xvectors", split="validation")
27
 
@@ -38,18 +45,21 @@ speakers = {
38
 
39
  def generateAudio(text_to_audio, s3_save_as):
40
 
41
- def recortar_texto(texto, max_tokens=500):
42
- tokens = word_tokenize(texto)
 
 
 
43
  if len(tokens) <= max_tokens:
44
- return texto
45
 
46
- recortado = ' '.join(tokens[:max_tokens])
47
- return recortado
48
 
49
 
50
  def save_text_to_speech(text, speaker=None):
51
  # Truncate the text to the token limit
52
- text = recortar_texto(text, max_tokens=500)
53
  # preprocess text
54
  inputs = processor(text=text, return_tensors="pt").to(device)
55
  if speaker is not None:
@@ -70,16 +80,33 @@ def generateAudio(text_to_audio, s3_save_as):
70
  random_str = ''.join(random.sample(
71
  string.ascii_letters+string.digits, k=5))
72
  output_filename = f"{random_str}-{'-'.join(text.split()[:6])}.mp3"
73
- # save the generated speech to a file with 16KHz sampling rate
74
- sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
75
- # return the filename for reference
76
- return output_filename
77
 
 
 
 
 
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- output_filename = save_text_to_speech(text_to_audio, 2271)
81
-
82
- return f"Saved {output_filename}"
83
 
84
- iface = gr.Interface(fn=generateAudio, inputs=[Textbox(label="text_to_audio"), Textbox(label="s3_save_as")], outputs="text")
85
- iface.launch()
 
 
 
 
 
1
  import gradio as gr
2
  from gradio.inputs import Textbox
 
3
  import nltk
4
  nltk.download('punkt')
5
  from nltk.tokenize import word_tokenize
6
+ import re
7
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
8
  from datasets import load_dataset
9
  import torch
10
  import random
11
  import string
12
  import soundfile as sf
13
+ import boto3
14
+ from io import BytesIO
15
+ import os
16
+
17
+ AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
18
+ AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
19
+ S3_BUCKET_NAME = os.getenv("BUCKET_NAME")
20
 
21
  device = "cuda" if torch.cuda.is_available() else "cpu"
22
+
23
  # load the processor
24
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
25
  # load the model
 
28
  # load the vocoder, which turns the generated spectrogram into an audio waveform
29
  vocoder = SpeechT5HifiGan.from_pretrained(
30
  "microsoft/speecht5_hifigan").to(device)
31
+ # load the dataset to get the speaker embeddings
32
  embeddings_dataset = load_dataset(
33
  "Matthijs/cmu-arctic-xvectors", split="validation")
34
 
 
45
 
46
  def generateAudio(text_to_audio, s3_save_as):
47
 
48
def cut_text(text, max_tokens=500):
    """Strip unsupported characters from ``text`` and cap its token count.

    Args:
        text: Raw input text to clean and truncate.
        max_tokens: Maximum number of ``word_tokenize`` tokens to keep.

    Returns:
        The cleaned text unchanged when it has at most ``max_tokens``
        tokens; otherwise the first ``max_tokens`` tokens joined with
        single spaces.
    """
    # Keep only word characters, whitespace, periods and commas.
    text = re.sub(r"[^\w\s.,]", "", text)

    # Tokenize the cleaned argument rather than the caller's raw input, so
    # the truncated result never re-introduces the characters stripped above.
    tokens = word_tokenize(text)
    if len(tokens) <= max_tokens:
        return text

    return ' '.join(tokens[:max_tokens])
58
 
59
 
60
  def save_text_to_speech(text, speaker=None):
61
  # Clean the text and truncate it to the token limit
62
+ text = cut_text(text, max_tokens=500)
63
  # preprocess text
64
  inputs = processor(text=text, return_tensors="pt").to(device)
65
  if speaker is not None:
 
80
  random_str = ''.join(random.sample(
81
  string.ascii_letters+string.digits, k=5))
82
  output_filename = f"{random_str}-{'-'.join(text.split()[:6])}.mp3"
 
 
 
 
83
 
84
+ # Save the generated speech to BytesIO buffer
85
+ audio_buffer = BytesIO()
86
+ sf.write(audio_buffer, speech.cpu().numpy(), samplerate=16000)
87
+ audio_buffer.seek(0)
88
 
89
+ # Upload the audio buffer to S3
90
+ s3_key = f"{s3_save_as}.mp3"
91
+ s3 = boto3.client(
92
+ 's3',
93
+ aws_access_key_id=AWS_ACCESS_KEY_ID,
94
+ aws_secret_access_key=AWS_SECRET_ACCESS_KEY
95
+ )
96
+ s3.upload_fileobj(audio_buffer, S3_BUCKET_NAME, s3_key)
97
+
98
+ # Return the S3 URL of the uploaded audio file
99
+ s3_url = f"https://{S3_BUCKET_NAME}.s3.amazonaws.com/{s3_key}"
100
+ return s3_url
101
+
102
+
103
+ s3_url = save_text_to_speech(text_to_audio, speakers["clb"])
104
+ return f"Saved audio: {s3_url}"
105
 
 
 
 
106
 
107
+ iface = gr.Interface(
108
+ fn=generateAudio,
109
+ inputs=[Textbox(label="Text to Audio"), Textbox(label="S3 Save As")],
110
+ outputs="text"
111
+ )
112
+ iface.launch()