tts / app.py
tobiccino's picture
update ui tacotron
8c70653
raw
history blame
6.6 kB
import gradio as gr
import argparse
import yaml
from vietTTS.hifigan.mel2wave import mel2wave
from vietTTS.nat.text2mel import text2mel
from vietTTS.synthesizer import nat_normalize_text
import numpy as np
import gradio as gr
import re
from vietnam_number import n2w
from vietnam_number import n2w_single
from synthesize import synthesizer
import noisereduce as nr
import os
import scipy.io.wavfile as wavf
from scipy.io import wavfile
# Passed to gr.Interface as the page title.
TITLE = "Saltlux Text to Speech"
# Passed to gr.Interface as the page description.
DESCRIPTION = "SLT Vietnamese Text to speech demo."
class GradioApplication:
    """Gradio front end that routes a text prompt to one of two TTS back ends."""

    def __init__(self):
        """Build the Gradio interface from the module's input/output widgets."""
        inputs = prepare_input()
        outputs = prepare_output()
        self.iface = gr.Interface(fn=self.infer,
                                  title=TITLE,
                                  description=DESCRIPTION,
                                  inputs=inputs,
                                  outputs=outputs,
                                  allow_flagging='never')

    def infer(self, text, lang, duration_rate):
        """Synthesize ``text`` with the model chosen in the UI.

        Args:
            text: Sentence to speak.
            lang: Value of the "Model select" radio. ``"VietTTS"`` selects the
                VietTTS pipeline; any other value (including ``None`` when
                nothing is selected) falls through to Tacotron.
            duration_rate: Slider value, forwarded to the VietTTS pipeline only.

        Returns:
            The result of ``using_viettts`` or ``using_tacotron``.
        """
        if lang == "VietTTS":
            return using_viettts(text, duration_rate)
        return using_tacotron(text)

    def run(self):
        """Launch the web server on port 7086; close all Gradio apps on Ctrl-C."""
        try:
            self.iface.launch(height=900,
                              share=False, server_port=7086,
                              enable_queue=True)
        except KeyboardInterrupt:
            gr.close_all()
def prepare_input():
    """Create the input widgets: sentence box, model selector and duration slider.

    Returns:
        A list ``[textbox, radio, slider]`` in that order.
    """
    model_choices = ['VietTTS', 'Tacotron2']

    sentence_box = gr.Textbox(
        lines=2,
        placeholder="Lựa chọn model test - VietTTS và Tacotron 2 + Univnet",
        value="Thành phố muốn thí điểm thu thuế bất động sản thứ 2, tự quyết nhiều quyết định đầu tư để thu hút nguồn vốn tư nhân",
        label="Text",
    )
    # No model is preselected (value=None).
    model_radio = gr.Radio(
        model_choices,
        type='value',
        value=None,
        label="Model select",
    )
    rate_slider = gr.Slider(
        minimum=0.2,
        maximum=1,
        step=0.1,
        value=1.0,
        label="Duration (The bigger the value, the slower the speech) - only for vietTTS",
    )
    return [sentence_box, model_radio, rate_slider]
def prepare_output():
    """Create the two audio output widgets: raw output first, denoised second."""
    captions = ("Output before denoise", "Output after denoise")
    return [gr.Audio(label=caption) for caption in captions]
def text_to_speech(text,stop_duration):
    """Synthesize ``text`` with the VietTTS pipeline and return int16 samples.

    The text is truncated to 500 characters, passed through ``clean_text`` and
    ``nat_normalize_text``, converted to a mel spectrogram by ``text2mel`` and
    then to a waveform by ``mel2wave``.

    Args:
        text: Sentence to speak.
        stop_duration: Forwarded unchanged as the third argument of ``text2mel``.

    Returns:
        The waveform scaled by 2**15 and cast to ``np.int16``.
    """
    print("starting")
    # Prevent overly long requests: only the first 500 characters are used.
    text = text[:500]
    text = clean_text(text)
    text = nat_normalize_text(text)
    mel = text2mel(
        text,
        "lexicon.txt",
        stop_duration,
        "acoustic_latest_ckpt.pickle",
        "duration_latest_ckpt.pickle",
    )
    # NOTE(review): presumably a float waveform in [-1, 1] -- confirm against mel2wave.
    wave = mel2wave(mel, "config.json", "hk_hifi.pickle")
    # A full-scale sample (1.0 * 2**15 == 32768) does not fit in int16; clip
    # before casting so such samples saturate instead of overflowing.
    int16_range = np.iinfo(np.int16)
    scaled = np.clip(wave * (2**15), int16_range.min, int16_range.max)
    return scaled.astype(np.int16)
def text_to_speech_tacotron(text):
    """Synthesize ``text`` with the Tacotron synthesizer and save it to disk.

    Returns:
        The path of the written wav file (``'./out.wav'``).
    """
    wav_path = './out.wav'
    print("starting")
    # Cap the request at 500 characters to keep synthesis bounded.
    clipped = text if len(text) <= 500 else text[:500]
    audio = synthesizer.tts(clipped)
    synthesizer.save_wav(audio, wav_path)
    return wav_path
def using_viettts(text,stop_duration):
    """Run the VietTTS pipeline and write raw and noise-reduced wav files.

    Returns:
        A tuple ``(raw_path, denoised_path)``.
    """
    raw_path = './out.wav'
    denoised_path = './output_denoise.wav'
    sample_rate = 16000

    samples = text_to_speech(text, stop_duration)
    wavf.write(raw_path, sample_rate, samples)

    # Denoise what was actually stored on disk, then save it next to the raw file.
    rate, data = wavfile.read(raw_path)
    cleaned = nr.reduce_noise(y=data, sr=rate)
    wavfile.write(denoised_path, rate, cleaned)
    return (raw_path, denoised_path)
def using_tacotron(text):
    """Run the Tacotron pipeline and write a noise-reduced copy of its output.

    Returns:
        A tuple ``(raw_path, denoised_path)``.
    """
    denoised_path = "./output_denoise.wav"
    raw_path = text_to_speech_tacotron(text)

    # Load the synthesized file, denoise it, and store the result separately.
    rate, data = wavfile.read(raw_path)
    cleaned = nr.reduce_noise(y=data, sr=rate)
    wavfile.write(denoised_path, rate, cleaned)
    return (raw_path, denoised_path)
# Patterns are applied with .match(), i.e. anchored at the start of a
# whitespace-delimited token but not at its end.
_DATE_WITH_YEAR_RE = re.compile(r"\d{2}(?P<sep>[-/])\d{1,2}(?P=sep)\d{4}")
_DATE_RE = re.compile(r"\d{2}(?P<sep>[-/])\d{1,2}")
_TIME_RE = re.compile(r"\d{1,2}(?P<sep>[h:])\d{1,2}")
_TOKEN_RE = re.compile(r"\S+")


def _strip_leading_zeros(digits):
    """Drop leading zeros but keep a single '0' so the result is never empty."""
    return digits.lstrip('0') or '0'


def _spell_token(token):
    """Return the spoken form of one token, or the token itself if nothing applies."""
    # Most specific pattern first: a date with a year also matches the
    # day/month pattern.
    found = _DATE_WITH_YEAR_RE.match(token)
    if found:
        day, month, year = re.split(r"[-/]", found.group(0))
        return ('Ngày ' + n2w(day)
                + ' tháng ' + n2w(_strip_leading_zeros(month))
                + ' năm ' + n2w(year))

    found = _DATE_RE.match(token)
    if found:
        day, month = re.split(r"[-/]", found.group(0))
        return 'Ngày ' + n2w(day) + ' tháng ' + n2w(_strip_leading_zeros(month))

    found = _TIME_RE.match(token)
    if found:
        hour, minute = re.split(r"[h:]", found.group(0))
        return n2w(hour) + ' giờ ' + n2w(_strip_leading_zeros(minute)) + ' phút '

    if token.isdigit():
        return n2w_single(token)
    return token


def clean_text(test_string):
    """Spell out dates, times and bare numbers in ``test_string``.

    Each whitespace-delimited token is examined on its own:

    * ``dd/mm/yyyy`` or ``dd-mm-yyyy`` -> ``'Ngày … tháng … năm …'``
    * ``dd/mm`` or ``dd-mm``           -> ``'Ngày … tháng …'``
    * ``h:mm`` or ``hhmm``-style ``10h30`` -> ``'… giờ … phút '``
    * all-digit token                  -> ``n2w_single(token)``

    A matching token is replaced as a whole, so any characters following the
    matched prefix inside the same token (e.g. trailing punctuation) are
    dropped. Whitespace between tokens is preserved, and tokens that match
    nothing are returned untouched.

    Leading zeros of the month/minute field are removed before conversion,
    but a field made only of zeros is kept as ``'0'`` so that ``n2w`` is
    never handed an empty string (e.g. for ``10h00``).

    Args:
        test_string: Raw input sentence.

    Returns:
        The sentence with the recognised tokens replaced by words.
    """
    # Working token by token (instead of str.replace on the whole sentence)
    # keeps a number from being rewritten inside an unrelated token.
    return _TOKEN_RE.sub(lambda m: _spell_token(m.group(0)), test_string)
if __name__ == '__main__':
    # Build the Gradio demo and serve it until interrupted.
    GradioApplication().run()