# Spaces:
# Runtime error
# Runtime error
import logging | |
import re | |
import gradio as gr | |
import numpy | |
import torch | |
import utils | |
from infer import infer, get_net_g | |
# Cap the listed third-party loggers at WARNING level.
for _quiet_logger_name in ("numba", "markdown_it", "urllib3", "matplotlib"):
    logging.getLogger(_quiet_logger_name).setLevel(logging.WARNING)
del _quiet_logger_name

logging.basicConfig(
    format="| %(name)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Model state; both stay None until main() assigns them.
net_g = None
hps = None

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint path handed to get_net_g() in main().
model_path = "models/G_1000.pth"
# Sample rate returned alongside generated audio; also sizes the silence gap.
sampling_rate = 22050
def split_sentence(sentence: str):
    """Split *sentence* into consecutive runs tagged 'ZH' or 'EN'.

    Each character is classified as:

    * 'EN' - an ASCII letter (a-z, A-Z);
    * 'ZH' - any non-ASCII character;
    * neutral - every other ASCII character (digits, punctuation,
      whitespace, newlines).

    Neutral characters never start a new run: they stay attached to the run
    that is currently open, and leading neutral characters are attached to
    the first tagged run.

    Args:
        sentence: The text to split.

    Returns:
        A list of ``(text, language)`` tuples whose texts concatenate back to
        *sentence*. An empty *sentence* yields an empty list. Text made up
        only of neutral characters is returned as a single 'ZH' run so that
        callers never receive an empty language tag.
    """
    if not sentence:
        return []

    result = []
    current_language = None  # None until the first non-neutral character
    start = 0
    for idx, char in enumerate(sentence):
        if not char.isascii():
            language = 'ZH'
        elif char.isalpha():
            # For ASCII characters isalpha() is true exactly for a-z / A-Z.
            language = 'EN'
        else:
            # Neutral character: stays in whichever run is currently open.
            continue

        if language == current_language:
            continue
        if current_language is not None:
            # Language switch: close the run that ends just before idx.
            result.append((sentence[start:idx], current_language))
            start = idx
        current_language = language

    # Flush the last run. When no character decided a language (digits or
    # punctuation only), fall back to 'ZH' instead of an empty tag.
    result.append((sentence[start:], current_language or 'ZH'))
    return result
def tts_fn(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    language,
):
    """Synthesize *text* and return a status message plus the audio.

    The text is split into language runs with ``split_sentence``; each run is
    passed to ``infer`` and the results are joined, with half a second of
    silence appended after every run. The joined waveform is peak-normalized
    and converted to 16-bit integers.

    Args:
        text: Text to synthesize.
        speaker: Speaker identifier, forwarded to ``infer`` as ``sid``.
        sdp_ratio: Forwarded to ``infer`` unchanged.
        noise_scale: Forwarded to ``infer`` unchanged.
        noise_scale_w: Forwarded to ``infer`` unchanged.
        length_scale: Forwarded to ``infer`` unchanged.
        language: UI language choice; '普通话' selects 'ZH', anything else
            selects 'SH'.

    Returns:
        ``("Success", (sampling_rate, int16_audio))`` on success, or
        ``("Empty input text", None)`` when *text* is empty.
    """
    if not text:
        # Nothing to synthesize; reducing an empty array with max() would
        # raise ValueError further down.
        return "Empty input text", None

    language = 'ZH' if language == '普通话' else 'SH'
    silence = numpy.zeros(sampling_rate // 2, dtype=numpy.int16)

    # Collect the pieces and concatenate once instead of re-copying the
    # growing array on every iteration. The empty float32 seed keeps the same
    # dtype promotion as starting from an empty float32 array.
    chunks = [numpy.array([], dtype=numpy.float32)]
    for sentence, sentence_language in split_sentence(text):
        sub_audio_data = infer(
            sentence,
            sdp_ratio,
            noise_scale,
            noise_scale_w,
            length_scale,
            sid=speaker,
            # English runs keep their own tag; every other run (including an
            # undetermined one) uses the language selected in the UI.
            language=sentence_language if sentence_language == "EN" else language,
            hps=hps,
            net_g=net_g,
            device=device)
        chunks.extend((sub_audio_data, silence))
    audio_data = numpy.concatenate(chunks)

    # Peak-normalize, skipping the division for an all-zero waveform so that
    # it does not turn into NaN.
    peak = numpy.abs(audio_data).max()
    if peak > 0:
        audio_data = audio_data / peak
    audio_data = (audio_data * 32767).astype(numpy.int16)
    return "Success", (sampling_rate, audio_data)
def main():
    """Load the model, build the Gradio interface and start the web server.

    Assigns the module-level ``hps`` and ``net_g`` that ``tts_fn`` reads,
    then serves the UI on 0.0.0.0:7860.
    """
    global hps, net_g

    # NOTE(review): basicConfig is already called at import time in this
    # file, so this call presumably has no effect - confirm.
    logging.basicConfig(level=logging.DEBUG)

    hps = utils.get_hparams_from_file("configs/config.json")
    net_g = get_net_g(model_path=model_path, device=device, hps=hps)

    speakers = list(hps.data.spk2id.keys())
    languages = ["普通话", "上海话"]

    # Default text: alternating Chinese and English lines.
    default_text = "\n".join([
        "站一个制高点看上海,",
        "Looking at Shanghai from a commanding height,",
        "上海的弄堂是壮观的景象。",
        "The alleys in Shanghai are a great sight.",
        "它是这城市背景一样的东西。",
        "It is something with the same background as this city."
    ])

    with gr.Blocks() as app:
        with gr.Row():
            # Left column: input text and synthesis parameters.
            with gr.Column():
                text = gr.TextArea(label="输入文本内容", value=default_text)
                sdp_ratio = gr.Slider(
                    label="SDP/DP混合比", minimum=0, maximum=1, value=0.2, step=0.1
                )
                noise_scale = gr.Slider(
                    label="感情", minimum=0.1, maximum=2, value=0.6, step=0.1
                )
                noise_scale_w = gr.Slider(
                    label="音素长度", minimum=0.1, maximum=2, value=0.8, step=0.1
                )
                length_scale = gr.Slider(
                    label="语速", minimum=0.1, maximum=2, value=1.0, step=0.1
                )
            # Right column: speaker/language selection, trigger and outputs.
            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        speaker = gr.Dropdown(
                            label="选择说话人", choices=speakers, value=speakers[0]
                        )
                    with gr.Column():
                        language = gr.Dropdown(
                            label="选择语言", choices=languages, value=languages[0]
                        )
                submit_btn = gr.Button("生成音频", variant="primary")
                text_output = gr.Textbox(label="状态")
                audio_output = gr.Audio(label="音频")

        # The input order must match the parameter order of tts_fn.
        tts_inputs = [
            text,
            speaker,
            sdp_ratio,
            noise_scale,
            noise_scale_w,
            length_scale,
            language,
        ]
        submit_btn.click(
            tts_fn,
            inputs=tts_inputs,
            outputs=[text_output, audio_output],
        )

    app.launch(share=False, server_name="0.0.0.0", server_port=7860)


if __name__ == "__main__":
    main()