Spaces:

Kevin676
/

Shanghainese-TTS-demo

Paused

App Files Files Community

Shanghainese-TTS-demo / app.py

Kevin676

Duplicate from Kevin676/Shanghainese-TTS

9e1a4da over 1 year ago

raw

history blame contribute delete

7.59 kB

	import torch
	import librosa
	import commons
	import utils
	from models import SynthesizerTrn
	from text import text_to_sequence
	import numpy as np
	from mel_processing import spectrogram_torch
	import gradio as gr
	from text.cleaners import shanghainese_cleaners

	from transformers import AutoModel, AutoTokenizer
	from TTS.api import TTS

	# Single-language Mandarin model; `chinese()` uses it for synthesis followed
	# by voice conversion.
	tts = TTS("tts_models/zh-CN/baker/tacotron2-DDC-GST")

	# Multilingual model; `english()` uses it with a reference speaker recording.
	tts1 = TTS(
	    model_name="tts_models/multilingual/multi-dataset/your_tts",
	    progress_bar=False,
	    gpu=True,
	)

	import torchaudio
	from speechbrain.pretrained import SpectralMaskEnhancement

	# Speech-enhancement model applied to every synthesized clip before it is
	# returned to the UI. The checkpoint is cached under `savedir` and the model
	# is placed on the GPU.
	enhance_model = SpectralMaskEnhancement.from_hparams(
	    source="speechbrain/metricgan-plus-voicebank",
	    savedir="pretrained_models/metricgan-plus-voicebank",
	    run_opts={"device": "cuda"},
	)

	from denoiser import pretrained
	from denoiser.dsp import convert_audio

	# Denoiser used by `english()` ahead of the enhancement pass.
	model1 = pretrained.dns64().cuda()

	# Chat model and its tokenizer.
	# NOTE(review): trust_remote_code=True executes Python shipped with the hub
	# repository; confirm the source is trusted / consider pinning a revision.
	tokenizer = AutoTokenizer.from_pretrained(
	    "THUDM/chatglm-6b",
	    trust_remote_code=True,
	)
	model = (
	    AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
	    .half()
	    .cuda()
	    .eval()
	)

	def predict(input, history=None):
	    """Send one user message to the chat model and return the updated dialogue.

	    Args:
	        input: The user's message, forwarded to ``model.chat`` as the query.
	            The name shadows the ``input`` builtin; it is kept so existing
	            callers are unaffected.
	        history: Conversation history from earlier turns, or ``None`` to
	            start a fresh conversation.

	    Returns:
	        A ``(history, history, response)`` tuple: the history returned by
	        ``model.chat`` (twice, because the UI wires it to both the chatbot
	        display and the session state) followed by the latest reply.
	    """
	    if history is None:
	        history = []
	    response, history = model.chat(tokenizer, input, history)
	    return history, history, response

	def chinese(text_cn, upload1, VoiceMicrophone1):
	    """Synthesize Chinese speech in a reference voice and enhance the result.

	    Args:
	        text_cn: Text to speak. Runs of whitespace are collapsed to single
	            spaces and a full stop ("。") is appended before synthesis.
	        upload1: Path of an uploaded reference recording, or ``None``.
	        VoiceMicrophone1: Path of a microphone recording; used only when
	            ``upload1`` is ``None``.

	    Returns:
	        The path ``"enhanced.wav"`` of the enhanced audio file.

	    Raises:
	        ValueError: If neither reference recording was provided.
	    """
	    # The uploaded file takes precedence over the microphone recording.
	    speaker_wav = upload1 if upload1 is not None else VoiceMicrophone1
	    if speaker_wav is None:
	        # Fail early with a readable message instead of an opaque error from
	        # inside the TTS library.
	        raise ValueError(
	            "A reference voice is required: upload an audio file or record "
	            "one with the microphone."
	        )

	    tts.tts_with_vc_to_file(
	        " ".join(text_cn.split()) + "。",
	        speaker_wav=speaker_wav,
	        file_path="output0.wav",
	    )

	    # Add a leading batch dimension for the enhancement model.
	    noisy = enhance_model.load_audio("output0.wav").unsqueeze(0)

	    # lengths=[1.] presumably marks the whole clip as valid (relative
	    # length) -- TODO confirm against the SpeechBrain API.
	    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))

	    # NOTE(review): 16000 is hard-coded; presumably the enhancement model's
	    # sample rate -- confirm.
	    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

	    return "enhanced.wav"

	def english(text_en, upload, VoiceMicrophone):
	    """Synthesize English speech in a reference voice, then denoise and enhance it.

	    Args:
	        text_en: Text to speak; leading and trailing whitespace is stripped.
	        upload: Path of an uploaded reference recording, or ``None``.
	        VoiceMicrophone: Path of a microphone recording; used only when
	            ``upload`` is ``None``.

	    Returns:
	        The path ``"enhanced.wav"`` of the enhanced audio file.

	    Raises:
	        ValueError: If neither reference recording was provided.
	    """
	    # The uploaded file takes precedence over the microphone recording.
	    speaker_wav = upload if upload is not None else VoiceMicrophone
	    if speaker_wav is None:
	        # Fail early with a readable message instead of an opaque error from
	        # inside the TTS library.
	        raise ValueError(
	            "A reference voice is required: upload an audio file or record "
	            "one with the microphone."
	        )

	    tts1.tts_to_file(
	        text_en.strip(),
	        speaker_wav=speaker_wav,
	        language="en",
	        file_path="output.wav",
	    )

	    # First pass: run the synthesized clip through the denoiser on the GPU.
	    # convert_audio presumably resamples/remixes to the rate and channel
	    # count the denoiser expects -- TODO confirm.
	    wav, sr = torchaudio.load("output.wav")
	    wav = convert_audio(wav.cuda(), sr, model1.sample_rate, model1.chin)
	    with torch.no_grad():
	        # wav[None] adds a batch dimension; [0] removes it again.
	        denoised = model1(wav[None])[0]

	    torchaudio.save("denoise.wav", denoised.data.cpu(), model1.sample_rate)

	    # Second pass: spectral-mask enhancement of the denoised clip.
	    noisy = enhance_model.load_audio("denoise.wav").unsqueeze(0)

	    # lengths=[1.] presumably marks the whole clip as valid (relative
	    # length) -- TODO confirm against the SpeechBrain API.
	    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))

	    # NOTE(review): 16000 is hard-coded; presumably the enhancement model's
	    # sample rate -- confirm.
	    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

	    return "enhanced.wav"

	def clean_text(text, ipa_input):
	    """Return ``text`` run through the Shanghainese cleaners when requested.

	    Args:
	        text: The text to process.
	        ipa_input: When truthy, ``text`` is passed through
	            ``shanghainese_cleaners``; otherwise it is returned unchanged.

	    Returns:
	        The cleaned text, or the original ``text`` when ``ipa_input`` is falsy.
	    """
	    if ipa_input:
	        return shanghainese_cleaners(text)
	    return text


	def get_text(text, hps, cleaned=False):
	    """Convert text into a ``torch.LongTensor`` of symbol ids.

	    Args:
	        text: The text to convert.
	        hps: Hyper-parameter object; ``hps.symbols``,
	            ``hps.data.text_cleaners`` and ``hps.data.add_blank`` are read.
	        cleaned: When true, ``text`` is treated as already cleaned and no
	            cleaners are applied (an empty cleaner list is passed).

	    Returns:
	        A ``torch.LongTensor`` built from the resulting id sequence.
	    """
	    cleaner_names = [] if cleaned else hps.data.text_cleaners
	    text_norm = text_to_sequence(text, hps.symbols, cleaner_names)
	    if hps.data.add_blank:
	        # Presumably inserts id 0 as a blank between symbols -- see
	        # commons.intersperse.
	        text_norm = commons.intersperse(text_norm, 0)
	    return torch.LongTensor(text_norm)


	def speech_synthesize(text, cleaned, length_scale):
	    """Synthesize speech for ``text`` with the module-level ``net_g_ms`` model.

	    Args:
	        text: Text to speak; newline characters are removed first.
	        cleaned: Forwarded to ``get_text``; when true the text is treated as
	            already cleaned.
	        length_scale: Forwarded to ``net_g_ms.infer`` (bound to the
	            "Speaking Speed" slider in the UI).

	    Returns:
	        A ``(sampling_rate, audio)`` tuple, where ``audio`` is a NumPy float
	        array and ``sampling_rate`` comes from ``hps_ms.data.sampling_rate``.
	    """
	    text = text.replace('\n', '')
	    print(text)
	    stn_tst = get_text(text, hps_ms, cleaned)
	    with torch.no_grad():
	        x_tst = stn_tst.unsqueeze(0)  # add a batch dimension
	        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
	        sid = torch.LongTensor([0])  # always speaker id 0
	        audio = net_g_ms.infer(
	            x_tst,
	            x_tst_lengths,
	            sid=sid,
	            noise_scale=0.667,
	            noise_scale_w=0.8,
	            length_scale=length_scale,
	        )[0][0, 0].data.cpu().float().numpy()
	    return (hps_ms.data.sampling_rate, audio)



	# Hyper-parameters for the synthesizer used by `speech_synthesize`.
	hps_ms = utils.get_hparams_from_file('model/config.json')
	speakers = hps_ms.speakers
	n_speakers = hps_ms.data.n_speakers
	n_symbols = len(hps_ms.symbols)

	# Build the synthesizer from the config, switch it to eval mode, then load
	# the trained weights from disk.
	net_g_ms = SynthesizerTrn(
	    n_symbols,
	    hps_ms.data.filter_length // 2 + 1,
	    hps_ms.train.segment_size // hps_ms.data.hop_length,
	    n_speakers=n_speakers,
	    **hps_ms.model,
	)
	_ = net_g_ms.eval()
	utils.load_checkpoint('model/model.pth', net_g_ms)

	# Gradio UI: a chat panel, voice-cloning controls (Chinese / English) and a
	# Shanghainese synthesis button, followed by the app launch.
	# NOTE(review): the nesting of this block (which statements sit inside each
	# `with`) was lost when the text was flattened; restore it from the upstream
	# file rather than guessing, in particular whether `btn2` belongs to the Row.
	with gr.Blocks() as demo:
	# Page title and tagline.
	gr.Markdown(
	""" # <center>🥳💬💕 - TalktoAI，随时随地，谈天说地！</center>

	### <center>🤖 - 让有人文关怀的AI造福每一个人！AI向善，文明璀璨！TalktoAI - Enable the future！</center>

	"""
	)
	# Per-session conversation history passed to and returned from `predict`.
	state = gr.State([])
	chatbot = gr.Chatbot([], elem_id="chatbot").style(height=300)
	# Receives the latest reply from `predict`; the same textbox is reused below
	# as the text input for all three synthesis buttons.
	res = gr.Textbox(lines=1, placeholder="最新的回答在这里(此内容可编辑，用作声音克隆的文本)", show_label = False).style(container=False)
	with gr.Row():
	txt = gr.Textbox(label = "说点什么吧(中英皆可)", lines=1)
	button = gr.Button("开始对话吧")
	# Pressing Enter in the textbox and clicking the button both run `predict`.
	txt.submit(predict, [txt, state], [chatbot, state, res])
	button.click(predict, [txt, state], [chatbot, state, res])

	# Voice cloning: text comes from `res`, the reference voice from either the
	# upload widget or the microphone widget (both deliver a file path).
	with gr.Row().style(mobile_collapse=False, equal_height=True):
	inp3 = res
	inp4 = gr.Audio(source="upload", label = "请上传您喜欢的声音(wav/mp3文件)；长语音(~90s)、女声效果更好", type="filepath")
	inp5 = gr.Audio(source="microphone", type="filepath", label = '请用麦克风上传您喜欢的声音，与文件上传二选一即可')
	btn1 = gr.Button("用喜欢的声音听一听吧(中文)")

	btn2 = gr.Button("用喜欢的声音听一听吧(英文)")
	with gr.Row():
	out1 = gr.Audio(label="为您合成的专属声音(中文)")
	out2 = gr.Audio(label="为您合成的专属声音(英文)")
	btn1.click(chinese, [inp3, inp4, inp5], [out1])
	btn2.click(english, [inp3, inp4, inp5], [out2])

	# Shanghainese synthesis. The checkbox and slider are created hidden
	# (visible = False), so the function receives their initial values.
	text_input = res
	# NOTE(review): `default=` may not be the initial-value keyword in the
	# installed Gradio version (newer releases use `value=`) -- confirm.
	cleaned_text=gr.Checkbox(label='IPA Input',default=True, visible = False)
	length_scale=gr.Slider(0.5,2,1,step=0.1,label='Speaking Speed',interactive=True, visible = False)
	with gr.Row().style(mobile_collapse=False, equal_height=True):
	tts_button = gr.Button('彩蛋:上海话合成')
	audio_output = gr.Audio(label='听一听上海话吧')
	# Toggling the checkbox rewrites the text box through `clean_text`.
	cleaned_text.change(clean_text,[text_input,cleaned_text],[text_input])
	tts_button.click(speech_synthesize,[text_input,cleaned_text,length_scale],[audio_output])

	# Usage disclaimer and credits.
	gr.Markdown(
	""" ### <center>注意❗：请不要输入或生成会对个人以及组织造成侵害的内容，此程序仅供科研、学习及娱乐使用。用户输入或生成的内容与程序开发者无关，请自觉合法合规使用，违反者一切后果自负。</center>

	### <center>Model by [ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b). Thanks to [THUDM](https://github.com/THUDM) and [CjangCjengh](https://github.com/CjangCjengh). Please follow me on [Bilibili](https://space.bilibili.com/501495851?spm_id_from=333.1007.0.0).</center>

	"""
	)

	# Footer with a quote and a note about the Chinese voice-cloning approach.
	gr.HTML('''
	<div class="footer">
	<p>🎶🖼️🎡 - It’s the intersection of technology and liberal arts that makes our hearts sing. - Steve Jobs
	</p>
	<p>注：中文声音克隆实际上是通过声音转换(Voice Conversion)实现，所以输出结果可能更像是一种新的声音，效果不一定很理想，希望大家多多包涵，之后我们也会不断迭代该程序的！为了实现更好的效果，使用中文声音克隆时请尽量上传女声。
	</p>
	</div>
	''')

	# Enable request queuing and start the server; show_error=True surfaces
	# exceptions raised by the callbacks in the UI.
	demo.queue().launch(show_error=True)