"""Gradio demo that synthesizes Vietnamese speech with vietTTS or Tacotron 2."""

import argparse
import os
import re

import gradio as gr
import noisereduce as nr
import numpy as np
import scipy.io.wavfile as wavf
import yaml
from scipy.io import wavfile
from vietnam_number import n2w
from vietnam_number import n2w_single
from vietTTS.hifigan.mel2wave import mel2wave
from vietTTS.nat.text2mel import text2mel
from vietTTS.synthesizer import nat_normalize_text

from synthesize import synthesizer

TITLE = "Saltlux Text to Speech"
DESCRIPTION = "SLT Vietnamese Text to speech demo."

# Inputs longer than this many characters are truncated before synthesis.
_MAX_TEXT_LENGTH = 500

# NOTE(review): these file names are fixed, so concurrent requests would
# overwrite each other's audio -- confirm the demo is single-user.
_OUTPUT_WAV = './out.wav'
_DENOISED_WAV = './output_denoise.wav'

# "dd/mm/yyyy" or "dd-mm-yyyy"; the back-reference forces both separators to
# be the same character.
_FULL_DATE_RE = re.compile(r"\d{2}(?P<sep>[-/])\d{1,2}(?P=sep)\d{4}")
# "dd/mm" or "dd-mm".
_DAY_MONTH_RE = re.compile(r"\d{2}(?P<sep>[-/])\d{1,2}")
# "10h30" or "10:30".
_TIME_RE = re.compile(r"\d{1,2}(?P<sep>[h:])\d{1,2}")


class GradioApplication:
    """Gradio interface exposing the two text-to-speech back-ends."""

    def __init__(self):
        inputs = prepare_input()
        outputs = prepare_output()
        self.iface = gr.Interface(
            fn=self.infer,
            title=TITLE,
            description=DESCRIPTION,
            inputs=inputs,
            outputs=outputs,
            allow_flagging='never',
        )

    def infer(self, text, lang, duration_rate):
        """Synthesize *text* with the model selected by *lang*.

        Args:
            text: Text entered in the UI.
            lang: Value of the model radio button. ``"VietTTS"`` selects
                vietTTS; any other value (including ``None`` when nothing is
                selected) falls through to Tacotron 2.
            duration_rate: Slider value; only forwarded to vietTTS.

        Returns:
            A ``(raw_wav_path, denoised_wav_path)`` tuple.
        """
        if lang == "VietTTS":
            return using_viettts(text, duration_rate)
        return using_tacotron(text)

    def run(self):
        """Launch the web UI on port 7086; close all interfaces on Ctrl-C."""
        try:
            self.iface.launch(
                height=900,
                share=False,
                server_port=7086,
                enable_queue=True,
            )
        except KeyboardInterrupt:
            gr.close_all()


def prepare_input():
    """Build the input widgets: text box, model selector, duration slider."""
    text_input = gr.Textbox(
        lines=2,
        placeholder="Lựa chọn model test - VietTTS và Tacotron 2 + Univnet",
        value="Thành phố muốn thí điểm thu thuế bất động sản thứ 2, tự quyết nhiều quyết định đầu tư để thu hút nguồn vốn tư nhân",
        label="Text",
    )
    lang_input = gr.Radio(
        ['VietTTS', 'Tacotron2'],
        type='value',
        value=None,
        label="Model select",
    )
    duration_rate_input = gr.Slider(
        minimum=0.2,
        maximum=1,
        step=0.1,
        value=1.0,
        label="Duration (The bigger the value, the slower the speech) - only for vietTTS",
    )
    return [text_input, lang_input, duration_rate_input]


def prepare_output():
    """Build the two audio output widgets (before and after denoising)."""
    return [
        gr.Audio(label="Output before denoise"),
        gr.Audio(label="Output after denoise"),
    ]


def text_to_speech(text, stop_duration):
    """Synthesize *text* with the vietTTS pipeline.

    The text is truncated to 500 characters, passed through ``clean_text``
    and ``nat_normalize_text``, converted to a mel spectrogram and then to a
    waveform.

    Args:
        text: Raw input text.
        stop_duration: Forwarded as the third positional argument of
            ``text2mel``. NOTE(review): presumably a pause/silence duration
            in seconds -- confirm against vietTTS.

    Returns:
        The waveform scaled by ``2**15`` and cast to ``numpy.int16``
        (assumes ``mel2wave`` returns floats in [-1, 1) -- TODO confirm).
    """
    print("starting")
    text = text[:_MAX_TEXT_LENGTH]  # prevent too long text
    text = clean_text(text)
    text = nat_normalize_text(text)
    mel = text2mel(
        text,
        "lexicon.txt",
        stop_duration,
        "acoustic_latest_ckpt.pickle",
        "duration_latest_ckpt.pickle",
    )
    wave = mel2wave(mel, "config.json", "hk_hifi.pickle")
    return (wave * (2**15)).astype(np.int16)


def text_to_speech_tacotron(text):
    """Synthesize *text* with the Tacotron synthesizer and save it to disk.

    The text is truncated to 500 characters first.

    Returns:
        Path of the written WAV file (``'./out.wav'``).
    """
    print("starting")
    text = text[:_MAX_TEXT_LENGTH]  # prevent too long text
    wav = synthesizer.tts(text)
    synthesizer.save_wav(wav, _OUTPUT_WAV)
    return _OUTPUT_WAV


def _denoise_wav(source_path, target_path):
    """Read *source_path*, apply noise reduction, write to *target_path*."""
    rate, data = wavfile.read(source_path)
    reduced_noise = nr.reduce_noise(y=data, sr=rate)
    wavfile.write(target_path, rate, reduced_noise)


def using_viettts(text, stop_duration):
    """Run vietTTS on *text* and produce raw and denoised WAV files.

    The raw samples are written at a sample rate of 16000 Hz.

    Returns:
        A ``(raw_wav_path, denoised_wav_path)`` tuple.
    """
    samples = text_to_speech(text, stop_duration)
    wavf.write(_OUTPUT_WAV, 16000, samples)
    _denoise_wav(_OUTPUT_WAV, _DENOISED_WAV)
    return (_OUTPUT_WAV, _DENOISED_WAV)


def using_tacotron(text):
    """Run Tacotron on *text* and produce raw and denoised WAV files.

    Returns:
        A ``(raw_wav_path, denoised_wav_path)`` tuple.
    """
    raw_path = text_to_speech_tacotron(text)
    _denoise_wav(raw_path, _DENOISED_WAV)
    return (raw_path, _DENOISED_WAV)


def _strip_leading_zeros(digits):
    """Drop leading zeros but keep at least one digit ("05" -> "5", "00" -> "0")."""
    return digits.lstrip('0') or '0'


def _spell_full_date(matched):
    """Spell out a "dd/mm/yyyy" (or "-" separated) date in Vietnamese."""
    day, month, year = re.split(r"[-/]", matched)
    return (
        'Ngày ' + n2w(day)
        + ' tháng ' + n2w(_strip_leading_zeros(month))
        + ' năm ' + n2w(year)
    )


def _spell_day_month(matched):
    """Spell out a "dd/mm" (or "-" separated) date in Vietnamese."""
    day, month = re.split(r"[-/]", matched)
    return 'Ngày ' + n2w(day) + ' tháng ' + n2w(_strip_leading_zeros(month))


def _spell_time(matched):
    """Spell out a "10h30" / "10:30" time in Vietnamese."""
    hour, minute = re.split(r"[h:]", matched)
    return n2w(hour) + ' giờ ' + n2w(_strip_leading_zeros(minute)) + ' phút '


def _replace_matching_words(text, words, pattern, spell):
    """Replace every word that *pattern* matches at its start with spell(match).

    The whole word is replaced, so characters trailing the match (such as
    punctuation attached to the token) are dropped along with it.
    """
    for word in words:
        match = pattern.match(word)
        if match is None:
            continue
        text = text.replace(word, spell(match.group(0)))
    return text


def clean_text(test_string):
    """Spell out dates, times and bare numbers in *test_string*.

    Whitespace-separated tokens are checked, in order, against the full-date,
    day/month and time patterns; matching tokens are replaced by their
    Vietnamese reading. Finally every all-digit token is replaced by the
    result of ``n2w_single``.

    Args:
        test_string: Raw input text.

    Returns:
        The text with recognised tokens replaced by words.
    """
    # Tokens are taken from the original text once; tokens already replaced
    # by an earlier pass are no longer present, so later passes skip them.
    list_word = test_string.split()

    test_string = _replace_matching_words(
        test_string, list_word, _FULL_DATE_RE, _spell_full_date)
    test_string = _replace_matching_words(
        test_string, list_word, _DAY_MONTH_RE, _spell_day_month)
    test_string = _replace_matching_words(
        test_string, list_word, _TIME_RE, _spell_time)
    print(test_string)

    for word in list_word:
        if word.isdigit():
            # Only the first occurrence is replaced so repeated numbers are
            # consumed one token at a time.
            test_string = test_string.replace(word, n2w_single(word), 1)
    return test_string


if __name__ == '__main__':
    gradio_application = GradioApplication()
    gradio_application.run()