Mahiruoshi committed
Commit 04b3554 · verified · 1 Parent(s): 4c4d672

Update app.py

Files changed (1)
  1. app.py +512 -1
app.py CHANGED
@@ -1 +1,512 @@
- print("hello")
+ import argparse
+ import os
+ from pathlib import Path
+
+ import logging
+ import re_matching
+
+ logging.getLogger("numba").setLevel(logging.WARNING)
+ logging.getLogger("markdown_it").setLevel(logging.WARNING)
+ logging.getLogger("urllib3").setLevel(logging.WARNING)
+ logging.getLogger("matplotlib").setLevel(logging.WARNING)
+
+ logging.basicConfig(
+     level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s"
+ )
+
+ logger = logging.getLogger(__name__)
+ import shutil
+ from scipy.io.wavfile import write
+ import librosa
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ from torch.utils.data import DataLoader, Dataset
+ from tqdm import tqdm
+ from tools.sentence import extrac, is_japanese, is_chinese, seconds_to_ass_time, extract_text_from_file, remove_annotations, extract_and_convert
+
+ import gradio as gr
+
+ import utils
+ from config import config
+
+ import commons
+ from text import cleaned_text_to_sequence, get_bert
+ from text.cleaner import clean_text
+
+ from models import SynthesizerTrn
+ from text.symbols import symbols
+ import sys
+ import re
+ from tools.translate import translate
+
+ net_g = None
+
+ device = (
+     "cuda:0"
+     if torch.cuda.is_available()
+     else (
+         "mps"
+         if sys.platform == "darwin" and torch.backends.mps.is_available()
+         else "cpu"
+     )
+ )
+
+ # device = "cpu"
+ BandList = {
+     "PoppinParty": ["香澄", "有咲", "たえ", "りみ", "沙綾"],
+     "Afterglow": ["蘭", "モカ", "ひまり", "巴", "つぐみ"],
+     "HelloHappyWorld": ["こころ", "美咲", "薫", "花音", "はぐみ"],
+     "PastelPalettes": ["彩", "日菜", "千聖", "イヴ", "麻弥"],
+     "Roselia": ["友希那", "紗夜", "リサ", "燐子", "あこ"],
+     "RaiseASuilen": ["レイヤ", "ロック", "ますき", "チュチュ", "パレオ"],
+     "Morfonica": ["ましろ", "瑠唯", "つくし", "七深", "透子"],
+     "MyGo": ["燈", "愛音", "そよ", "立希", "楽奈"],
+     "AveMujica": ["祥子", "睦", "海鈴", "にゃむ", "初華"],
+     "圣翔音乐学园": ["華戀", "光", "香子", "雙葉", "真晝", "純那", "克洛迪娜", "真矢", "奈奈"],
+     "凛明馆女子学校": ["珠緒", "壘", "文", "悠悠子", "一愛"],
+     "弗隆提亚艺术学校": ["艾露", "艾露露", "菈樂菲", "司", "靜羽"],
+     "西克菲尔特音乐学院": ["晶", "未知留", "八千代", "栞", "美帆"],
+ }
+
+ def get_net_g(model_path: str, device: str, hps):
+     net_g = SynthesizerTrn(
+         len(symbols),
+         hps.data.filter_length // 2 + 1,
+         hps.train.segment_size // hps.data.hop_length,
+         n_speakers=hps.data.n_speakers,
+         **hps.model,
+     ).to(device)
+     _ = net_g.eval()
+     _ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
+     return net_g
+
+ def get_text(text, language_str, hps, device, style_text=None, style_weight=0.7):
+     style_text = None if style_text == "" else style_text
+     norm_text, phone, tone, word2ph = clean_text(text, language_str)
+     phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
+
+     if hps.data.add_blank:
+         phone = commons.intersperse(phone, 0)
+         tone = commons.intersperse(tone, 0)
+         language = commons.intersperse(language, 0)
+         for i in range(len(word2ph)):
+             word2ph[i] = word2ph[i] * 2
+         word2ph[0] += 1
+     bert_ori = get_bert(
+         norm_text, word2ph, language_str, device, style_text, style_weight
+     )
+     del word2ph
+     assert bert_ori.shape[-1] == len(phone), phone
+
+     if language_str == "ZH":
+         bert = bert_ori
+         ja_bert = torch.randn(1024, len(phone))
+         en_bert = torch.randn(1024, len(phone))
+     elif language_str == "JP":
+         bert = torch.randn(1024, len(phone))
+         ja_bert = bert_ori
+         en_bert = torch.randn(1024, len(phone))
+     elif language_str == "EN":
+         bert = torch.randn(1024, len(phone))
+         ja_bert = torch.randn(1024, len(phone))
+         en_bert = bert_ori
+     else:
+         raise ValueError("language_str should be ZH, JP or EN")
+
+     assert bert.shape[-1] == len(
+         phone
+     ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"
+
+     phone = torch.LongTensor(phone)
+     tone = torch.LongTensor(tone)
+     language = torch.LongTensor(language)
+     return bert, ja_bert, en_bert, phone, tone, language
+
+
+ def infer(
+     text,
+     sdp_ratio,
+     noise_scale,
+     noise_scale_w,
+     length_scale,
+     sid,
+     style_text=None,
+     style_weight=0.7,
+     language="Auto",
+ ):
+     if language == "Auto":
+         language = "JP" if is_japanese(text) else "ZH"
+     bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
+         text,
+         language,
+         hps,
+         device,
+         style_text=style_text,
+         style_weight=style_weight,
+     )
+     with torch.no_grad():
+         x_tst = phones.to(device).unsqueeze(0)
+         tones = tones.to(device).unsqueeze(0)
+         lang_ids = lang_ids.to(device).unsqueeze(0)
+         bert = bert.to(device).unsqueeze(0)
+         ja_bert = ja_bert.to(device).unsqueeze(0)
+         en_bert = en_bert.to(device).unsqueeze(0)
+         x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
+         # emo = emo.to(device).unsqueeze(0)
+         del phones
+         speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
+         audio = (
+             net_g.infer(
+                 x_tst,
+                 x_tst_lengths,
+                 speakers,
+                 tones,
+                 lang_ids,
+                 bert,
+                 ja_bert,
+                 en_bert,
+                 sdp_ratio=sdp_ratio,
+                 noise_scale=noise_scale,
+                 noise_scale_w=noise_scale_w,
+                 length_scale=length_scale,
+             )[0][0, 0]
+             .data.cpu()
+             .float()
+             .numpy()
+         )
+         del (
+             x_tst,
+             tones,
+             lang_ids,
+             bert,
+             x_tst_lengths,
+             speakers,
+             ja_bert,
+             en_bert,
+         )  # , emo
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         return (hps.data.sampling_rate, gr.processing_utils.convert_to_16_bit_wav(audio))
+
+ def is_japanese(string):
+     # Overrides the is_japanese imported from tools.sentence:
+     # treats any kana character (U+3040–U+30FF) as Japanese.
+     for ch in string:
+         if 0x3040 < ord(ch) < 0x30FF:
+             return True
+     return False
+
+ def loadmodel(model):
+     _ = net_g.eval()
+     _ = utils.load_checkpoint(model, net_g, None, skip_optimizer=True)
+     return "success"
+
+ def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, spealerList, silenceTime):
+     audio_fin = []
+     ass_entries = []
+     start_time = 0
+     # speaker = random.choice(cara_list)
+     ass_header = """[Script Info]
+ ; 我没意见
+ Title: Audiobook
+ ScriptType: v4.00+
+ WrapStyle: 0
+ PlayResX: 640
+ PlayResY: 360
+ ScaledBorderAndShadow: yes
+ [V4+ Styles]
+ Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
+ Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,1
+ [Events]
+ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+ """
+
+     for sentence in group:
+         try:
+             FakeSpeaker = sentence.split("|")[0]
+             print(FakeSpeaker)
+             SpeakersList = re.split("\n", spealerList)
+             if FakeSpeaker in list(hps.data.spk2id.keys()):
+                 speaker = FakeSpeaker
+             for i in SpeakersList:
+                 if FakeSpeaker == i.split("|")[1]:
+                     speaker = i.split("|")[0]
+             if sentence != "\n":
+                 audio = infer_simple(
+                     (remove_annotations(sentence.split("|")[-1]).replace(" ", "") + "。").replace(",。", "。").replace("。。", "。"),
+                     sdp_ratio,
+                     noise_scale,
+                     noise_scale_w,
+                     length_scale,
+                     speaker,
+                 )
+                 # Pause between sentences, in samples at the output sampling rate.
+                 silence_frames = int(silenceTime * sampling_rate)
+                 silence_data = np.zeros((silence_frames,), dtype=audio.dtype)
+                 audio_fin.append(audio)
+                 audio_fin.append(silence_data)
+
+                 duration = len(audio) / sampling_rate
+                 print(duration)
+                 end_time = start_time + duration + silenceTime
+                 ass_entries.append("Dialogue: 0,{},{},".format(seconds_to_ass_time(start_time), seconds_to_ass_time(end_time)) + "Default,,0,0,0,,{}".format(sentence.replace("|", ":")))
+                 start_time = end_time
+         except Exception:
+             logger.exception("Failed to synthesize sentence: %s", sentence)
+     wav_filename = os.path.join(outputPath, f"audiobook_part_{group_index}.wav")
+     ass_filename = os.path.join(outputPath, f"audiobook_part_{group_index}.ass")
+
+     write(wav_filename, sampling_rate, np.concatenate(audio_fin))
+
+     with open(ass_filename, "w", encoding="utf-8") as f:
+         f.write(ass_header + "\n".join(ass_entries))
+     return (hps.data.sampling_rate, np.concatenate(audio_fin))
+
+ def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, spealerList, silenceTime, filepath, raw_text):
+     directory_path = filepath if torch.cuda.is_available() else "books"
+
+     if os.path.exists(directory_path):
+         shutil.rmtree(directory_path)
+     os.makedirs(directory_path)
+
+     if inputFile:
+         text = extract_text_from_file(inputFile.name)
+     else:
+         text = raw_text
+     sentences = extrac(extract_and_convert(text))
+     GROUP_SIZE = groupsize
+     for i in range(0, len(sentences), GROUP_SIZE):
+         group = sentences[i:i + GROUP_SIZE]
+         if spealerList == "":
+             spealerList = "无"
+         result = generate_audio_and_srt_for_group(group, directory_path, i // GROUP_SIZE + 1, 44100, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, spealerList, silenceTime)
+         if not torch.cuda.is_available():
+             # Without CUDA, only the first group is synthesized and returned.
+             return result
+     return result
+
+ def infer_simple(
+     text,
+     sdp_ratio,
+     noise_scale,
+     noise_scale_w,
+     length_scale,
+     sid,
+     style_text=None,
+     style_weight=0.7,
+ ):
+     if is_chinese(text) or is_japanese(text):
+         if len(text) > 1:
+             language = "JP" if is_japanese(text) else "ZH"
+             bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
+                 text,
+                 language,
+                 hps,
+                 device,
+                 style_text="",
+                 style_weight=0,
+             )
+             with torch.no_grad():
+                 x_tst = phones.to(device).unsqueeze(0)
+                 tones = tones.to(device).unsqueeze(0)
+                 lang_ids = lang_ids.to(device).unsqueeze(0)
+                 bert = bert.to(device).unsqueeze(0)
+                 ja_bert = ja_bert.to(device).unsqueeze(0)
+                 en_bert = en_bert.to(device).unsqueeze(0)
+                 x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
+                 # emo = emo.to(device).unsqueeze(0)
+                 del phones
+                 speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
+                 audio = (
+                     net_g.infer(
+                         x_tst,
+                         x_tst_lengths,
+                         speakers,
+                         tones,
+                         lang_ids,
+                         bert,
+                         ja_bert,
+                         en_bert,
+                         sdp_ratio=sdp_ratio,
+                         noise_scale=noise_scale,
+                         noise_scale_w=noise_scale_w,
+                         length_scale=length_scale,
+                     )[0][0, 0]
+                     .data.cpu()
+                     .float()
+                     .numpy()
+                 )
+                 del (
+                     x_tst,
+                     tones,
+                     lang_ids,
+                     bert,
+                     x_tst_lengths,
+                     speakers,
+                     ja_bert,
+                     en_bert,
+                 )  # , emo
+                 if torch.cuda.is_available():
+                     torch.cuda.empty_cache()
+                 return audio
+
+ if __name__ == "__main__":
+     languages = ["Auto", "ZH", "JP"]
+     modelPaths = []
+     for dirpath, dirnames, filenames in os.walk("Data/Chinese/models/"):
+         for filename in filenames:
+             modelPaths.append(os.path.join(dirpath, filename))
+     hps = utils.get_hparams_from_file("Data/Chinese/config.json")
+     net_g = get_net_g(
+         model_path="Data/Chinese/models/G_80000.pth", device=device, hps=hps
+     )
+     speaker_ids = hps.data.spk2id
+     speakers = list(speaker_ids.keys())
+     with gr.Blocks() as app:
+         gr.Markdown(value="""
+ [日语特化版(推荐)](https://huggingface.co/spaces/Mahiruoshi/BangStarlight),国内可用连接: https://mahiruoshi-BangStarlight.hf.space/\n
+ [假名标注版](https://huggingface.co/spaces/Mahiruoshi/MyGO_VIts-bert),国内可用连接: https://mahiruoshi-MyGO-VIts-bert.hf.space/\n
+ 该界面的真实链接(国内可用): https://mahiruoshi-bangdream-bert-vits2.hf.space/\n
+ ([Bert-Vits2](https://github.com/Stardust-minus/Bert-VITS2) V2.3)少歌邦邦全员在线语音合成\n
+ [好玩的](http://love.soyorin.top/)\n
+ API: https://mahiruoshi-bert-vits2-api.hf.space/ \n
+ 调用方式: https://mahiruoshi-bert-vits2-api.hf.space/?text={{speakText}}&speaker=chosen_speaker\n
+ 推荐搭配[Legado开源阅读](https://github.com/gedoor/legado)或[聊天bot](https://github.com/Paraworks/BangDreamAi)使用\n
+ 二创请标注作者:B站@Mahiroshi: https://space.bilibili.com/19874615\n
+ 训练数据集归属:BangDream及少歌手游,提取自BestDori,[数据集获取流程](https://nijigaku.top/2023/09/29/Bestbushiroad%E8%AE%A1%E5%88%92-vits-%E9%9F%B3%E9%A2%91%E6%8A%93%E5%8F%96%E5%8F%8A%E6%95%B0%E6%8D%AE%E9%9B%86%E5%AF%B9%E9%BD%90/)\n
+ BangDream数据集下载[链接](https://huggingface.co/spaces/Mahiruoshi/BangDream-Bert-VITS2/blob/main/%E7%88%AC%E8%99%AB/SortPathUrl.txt)\n
+ !!!注意:huggingface容器仅用作展示,建议在右上角更多选项中克隆本项目或Docker运行app.py/server.py,环境参考requirements.txt\n""")
+         for band in BandList:
+             with gr.TabItem(band):
+                 for name in BandList[band]:
+                     with gr.TabItem(name):
+                         with gr.Row():
+                             with gr.Column():
+                                 with gr.Row():
+                                     gr.Markdown(
+                                         '<div align="center">'
+                                         f'<img style="width:auto;height:400px;" src="https://mahiruoshi-bangdream-bert-vits2.hf.space/file/image/{name}.png">'
+                                         '</div>'
+                                     )
+                                 length_scale = gr.Slider(
+                                     minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
+                                 )
+                                 language = gr.Dropdown(
+                                     choices=languages, value="Auto", label="语言"
+                                 )
+                                 with gr.Accordion(label="参数设定", open=True):
+                                     sdp_ratio = gr.Slider(
+                                         minimum=0, maximum=1, value=0.5, step=0.01, label="SDP/DP混合比"
+                                     )
+                                     noise_scale = gr.Slider(
+                                         minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
+                                     )
+                                     noise_scale_w = gr.Slider(
+                                         minimum=0.1, maximum=2, value=0.667, step=0.01, label="音素长度"
+                                     )
+                                     speaker = gr.Dropdown(
+                                         choices=speakers, value=name, label="说话人"
+                                     )
+                                 with gr.Accordion(label="切换模型", open=False):
+                                     modelstrs = gr.Dropdown(label="模型", choices=modelPaths, value=modelPaths[0], type="value")
+                                     btnMod = gr.Button("载入模型")
+                                     statusa = gr.TextArea(label="模型加载状态")
+                                     btnMod.click(loadmodel, inputs=[modelstrs], outputs=[statusa])
+                             with gr.Column():
+                                 text = gr.TextArea(
+                                     label="文本输入",
+                                     info="输入纯日语或者中文",
+                                     value="我是来结束这个乐队的。",
+                                 )
+                                 style_text = gr.Textbox(
+                                     label="情感辅助文本",
+                                     info="语言保持跟主文本一致,文本可以参考训练集: https://huggingface.co/spaces/Mahiruoshi/BangDream-Bert-VITS2/blob/main/filelists/Mygo.list",
+                                     placeholder="使用辅助文本的语意来辅助生成对话(语言保持与主文本相同)\n\n"
+                                     "**注意**:不要使用**指令式文本**(如:开心),要使用**带有强烈情感的文本**(如:我好快乐!!!)",
+                                 )
+                                 style_weight = gr.Slider(
+                                     minimum=0,
+                                     maximum=1,
+                                     value=0.7,
+                                     step=0.1,
+                                     label="Weight",
+                                     info="主文本和辅助文本的bert混合比率,0表示仅主文本,1表示仅辅助文本",
+                                 )
+                                 btn = gr.Button("点击生成", variant="primary")
+                                 audio_output = gr.Audio(label="Output Audio")
+                                 btntran = gr.Button("快速中翻日")
+                                 translateResult = gr.TextArea(label="使用百度翻译", placeholder="从这里复制翻译后的文本")
+                                 btntran.click(translate, inputs=[text], outputs=[translateResult])
+
+                         btn.click(
+                             infer,
+                             inputs=[
+                                 text,
+                                 sdp_ratio,
+                                 noise_scale,
+                                 noise_scale_w,
+                                 length_scale,
+                                 speaker,
+                                 style_text,
+                                 style_weight,
+                                 language,
+                             ],
+                             outputs=[audio_output],
+                         )
+         with gr.Tab("拓展功能"):
+             with gr.Row():
+                 with gr.Column():
+                     gr.Markdown(
+                         "从 <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>我的博客站点</a> 查看自制galgame使用说明"
+                     )
+                     inputFile = gr.UploadButton(label="txt文件输入")
+                     raw_text = gr.TextArea(
+                         label="文本输入",
+                         info="输入纯日语或者中文",
+                         value="つくし|我是来结束这个乐队的。",
+                     )
+                     groupSize = gr.Slider(
+                         minimum=10, maximum=1000 if torch.cuda.is_available() else 50, value=50, step=1, label="单个音频文件包含的最大字数"
+                     )
+                     silenceTime = gr.Slider(
+                         minimum=0, maximum=1, value=0.5, step=0.01, label="句子的间隔"
+                     )
+                     filepath = gr.TextArea(
+                         label="本地合成时的音频存储文件夹(会清空文件夹)",
+                         value="D:/audiobook/book1",
+                     )
+                     spealerList = gr.TextArea(
+                         label="角色对应表,左边是你想要在每一句话合成中用到的speaker(见角色清单)右边是你上传文本时分隔符左边设置的说话人:{ChoseSpeakerFromConfigList}|{SeakerInUploadText}",
+                         placeholder="ましろ|真白\n七深|七深\n透子|透子\nつくし|筑紫\n瑠唯|瑠唯\nそよ|素世\n祥子|祥子",
+                     )
+                     speaker = gr.Dropdown(
+                         choices=speakers, value="ましろ", label="选择默认说话人"
+                     )
+                 with gr.Column():
+                     sdp_ratio = gr.Slider(
+                         minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
+                     )
+                     noise_scale = gr.Slider(
+                         minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
+                     )
+                     noise_scale_w = gr.Slider(
+                         minimum=0.1, maximum=2, value=0.667, step=0.01, label="音素长度"
+                     )
+                     length_scale = gr.Slider(
+                         minimum=0.1, maximum=2, value=1, step=0.01, label="生成长度"
+                     )
+                     LastAudioOutput = gr.Audio(label="当使用cuda时才能在本地文件夹浏览全部文件")
+                     btn2 = gr.Button("点击生成", variant="primary")
+             btn2.click(
+                 audiobook,
+                 inputs=[
+                     inputFile,
+                     groupSize,
+                     speaker,
+                     sdp_ratio,
+                     noise_scale,
+                     noise_scale_w,
+                     length_scale,
+                     spealerList,
+                     silenceTime,
+                     filepath,
+                     raw_text,
+                 ],
+                 outputs=[LastAudioOutput],
+             )
+     print("推理页面已开启!")
+     app.launch(share=True)