Mahiruoshi committed
Commit 04b3554 · verified · 1 Parent(s): 4c4d672

Update app.py

Files changed (1)
  1. app.py +512 -1
app.py CHANGED
@@ -1 +1,512 @@
- print("hello")
+ import argparse
+ import os
+ from pathlib import Path
+
+ import logging
+ import re_matching
+
+ logging.getLogger("numba").setLevel(logging.WARNING)
+ logging.getLogger("markdown_it").setLevel(logging.WARNING)
+ logging.getLogger("urllib3").setLevel(logging.WARNING)
+ logging.getLogger("matplotlib").setLevel(logging.WARNING)
+
+ logging.basicConfig(
+     level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s"
+ )
+
+ logger = logging.getLogger(__name__)
+ import shutil
+ from scipy.io.wavfile import write
+ import librosa
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ from torch.utils.data import DataLoader, Dataset
+ from tqdm import tqdm
+ from tools.sentence import extrac, is_japanese, is_chinese, seconds_to_ass_time, extract_text_from_file, remove_annotations, extract_and_convert
+
+ import gradio as gr
+
+ import utils
+ from config import config
+
+ import commons
+ from text import cleaned_text_to_sequence, get_bert
+ from text.cleaner import clean_text
+
+ from models import SynthesizerTrn
+ from text.symbols import symbols
+ import sys
+ import re
+ from tools.translate import translate
+
+ net_g = None
+
+ device = (
+     "cuda:0"
+     if torch.cuda.is_available()
+     else (
+         "mps"
+         if sys.platform == "darwin" and torch.backends.mps.is_available()
+         else "cpu"
+     )
+ )
+
+ # device = "cpu"
+ BandList = {
+     "PoppinParty": ["香澄", "有咲", "たえ", "りみ", "沙綾"],
+     "Afterglow": ["蘭", "モカ", "ひまり", "巴", "つぐみ"],
+     "HelloHappyWorld": ["こころ", "美咲", "薫", "花音", "はぐみ"],
+     "PastelPalettes": ["彩", "日菜", "千聖", "イヴ", "麻弥"],
+     "Roselia": ["友希那", "紗夜", "リサ", "燐子", "あこ"],
+     "RaiseASuilen": ["レイヤ", "ロック", "ますき", "チュチュ", "パレオ"],
+     "Morfonica": ["ましろ", "瑠唯", "つくし", "七深", "透子"],
+     "MyGo": ["燈", "愛音", "そよ", "立希", "楽奈"],
+     "AveMujica": ["祥子", "睦", "海鈴", "にゃむ", "初華"],
+     "圣翔音乐学园": ["華戀", "光", "香子", "雙葉", "真晝", "純那", "克洛迪娜", "真矢", "奈奈"],
+     "凛明馆女子学校": ["珠緒", "壘", "文", "悠悠子", "一愛"],
+     "弗隆提亚艺术学校": ["艾露", "艾露露", "菈樂菲", "司", "靜羽"],
+     "西克菲尔特音乐学院": ["晶", "未知留", "八千代", "栞", "美帆"],
+ }
+
+ def get_net_g(model_path: str, device: str, hps):
+     net_g = SynthesizerTrn(
+         len(symbols),
+         hps.data.filter_length // 2 + 1,
+         hps.train.segment_size // hps.data.hop_length,
+         n_speakers=hps.data.n_speakers,
+         **hps.model,
+     ).to(device)
+     _ = net_g.eval()
+     _ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
+     return net_g
+
+ def get_text(text, language_str, hps, device, style_text=None, style_weight=0.7):
+     style_text = None if style_text == "" else style_text
+     norm_text, phone, tone, word2ph = clean_text(text, language_str)
+     phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
+
+     if hps.data.add_blank:
+         phone = commons.intersperse(phone, 0)
+         tone = commons.intersperse(tone, 0)
+         language = commons.intersperse(language, 0)
+         for i in range(len(word2ph)):
+             word2ph[i] = word2ph[i] * 2
+         word2ph[0] += 1
+     bert_ori = get_bert(
+         norm_text, word2ph, language_str, device, style_text, style_weight
+     )
+     del word2ph
+     assert bert_ori.shape[-1] == len(phone), phone
+
+     if language_str == "ZH":
+         bert = bert_ori
+         ja_bert = torch.randn(1024, len(phone))
+         en_bert = torch.randn(1024, len(phone))
+     elif language_str == "JP":
+         bert = torch.randn(1024, len(phone))
+         ja_bert = bert_ori
+         en_bert = torch.randn(1024, len(phone))
+     elif language_str == "EN":
+         bert = torch.randn(1024, len(phone))
+         ja_bert = torch.randn(1024, len(phone))
+         en_bert = bert_ori
+     else:
+         raise ValueError("language_str should be ZH, JP or EN")
+
+     assert bert.shape[-1] == len(
+         phone
+     ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"
+
+     phone = torch.LongTensor(phone)
+     tone = torch.LongTensor(tone)
+     language = torch.LongTensor(language)
+     return bert, ja_bert, en_bert, phone, tone, language
+
+
+ def infer(
+     text,
+     sdp_ratio,
+     noise_scale,
+     noise_scale_w,
+     length_scale,
+     sid,
+     style_text=None,
+     style_weight=0.7,
+     language="Auto",
+ ):
+     if language == "Auto":
+         language = "JP" if is_japanese(text) else "ZH"
+     bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
+         text,
+         language,
+         hps,
+         device,
+         style_text=style_text,
+         style_weight=style_weight,
+     )
+     with torch.no_grad():
+         x_tst = phones.to(device).unsqueeze(0)
+         tones = tones.to(device).unsqueeze(0)
+         lang_ids = lang_ids.to(device).unsqueeze(0)
+         bert = bert.to(device).unsqueeze(0)
+         ja_bert = ja_bert.to(device).unsqueeze(0)
+         en_bert = en_bert.to(device).unsqueeze(0)
+         x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
+         # emo = emo.to(device).unsqueeze(0)
+         del phones
+         speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
+         audio = (
+             net_g.infer(
+                 x_tst,
+                 x_tst_lengths,
+                 speakers,
+                 tones,
+                 lang_ids,
+                 bert,
+                 ja_bert,
+                 en_bert,
+                 sdp_ratio=sdp_ratio,
+                 noise_scale=noise_scale,
+                 noise_scale_w=noise_scale_w,
+                 length_scale=length_scale,
+             )[0][0, 0]
+             .data.cpu()
+             .float()
+             .numpy()
+         )
+         del (
+             x_tst,
+             tones,
+             lang_ids,
+             bert,
+             x_tst_lengths,
+             speakers,
+             ja_bert,
+             en_bert,
+         )  # , emo
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         return (hps.data.sampling_rate, gr.processing_utils.convert_to_16_bit_wav(audio))
+
+ def is_japanese(string):
+     # Overrides the is_japanese imported from tools.sentence:
+     # treats any kana character (U+3040–U+30FF) as Japanese.
+     for ch in string:
+         if 0x3040 < ord(ch) < 0x30FF:
+             return True
+     return False
+
+ def loadmodel(model):
+     _ = net_g.eval()
+     _ = utils.load_checkpoint(model, net_g, None, skip_optimizer=True)
+     return "success"
+
+ def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, spealerList, silenceTime):
+     audio_fin = []
+     ass_entries = []
+     start_time = 0
+     # speaker = random.choice(cara_list)
+     ass_header = """[Script Info]
+ ; 我没意见
+ Title: Audiobook
+ ScriptType: v4.00+
+ WrapStyle: 0
+ PlayResX: 640
+ PlayResY: 360
+ ScaledBorderAndShadow: yes
+ [V4+ Styles]
+ Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
+ Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,1
+ [Events]
+ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+ """
+
+     for sentence in group:
+         try:
+             FakeSpeaker = sentence.split("|")[0]
+             print(FakeSpeaker)
+             SpeakersList = re.split("\n", spealerList)
+             if FakeSpeaker in list(hps.data.spk2id.keys()):
+                 speaker = FakeSpeaker
+             for i in SpeakersList:
+                 if FakeSpeaker == i.split("|")[1]:
+                     speaker = i.split("|")[0]
+             if sentence != "\n":
+                 audio = infer_simple(
+                     (remove_annotations(sentence.split("|")[-1]).replace(" ", "") + "。").replace(",。", "。").replace("。。", "。"),
+                     sdp_ratio,
+                     noise_scale,
+                     noise_scale_w,
+                     length_scale,
+                     speaker,
+                 )
+                 # Pause between sentences, in samples at the output sampling rate.
+                 silence_frames = int(silenceTime * sampling_rate)
+                 silence_data = np.zeros((silence_frames,), dtype=audio.dtype)
+                 audio_fin.append(audio)
+                 audio_fin.append(silence_data)
+
+                 duration = len(audio) / sampling_rate
+                 print(duration)
+                 end_time = start_time + duration + silenceTime
+                 ass_entries.append("Dialogue: 0,{},{},".format(seconds_to_ass_time(start_time), seconds_to_ass_time(end_time)) + "Default,,0,0,0,,{}".format(sentence.replace("|", ":")))
+                 start_time = end_time
+         except Exception:
+             logger.exception("Failed to synthesize sentence: %s", sentence)
+     wav_filename = os.path.join(outputPath, f"audiobook_part_{group_index}.wav")
+     ass_filename = os.path.join(outputPath, f"audiobook_part_{group_index}.ass")
+
+     write(wav_filename, sampling_rate, np.concatenate(audio_fin))
+
+     with open(ass_filename, "w", encoding="utf-8") as f:
+         f.write(ass_header + "\n".join(ass_entries))
+     return (hps.data.sampling_rate, np.concatenate(audio_fin))
+
+ def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, spealerList, silenceTime, filepath, raw_text):
+     directory_path = filepath if torch.cuda.is_available() else "books"
+
+     if os.path.exists(directory_path):
+         shutil.rmtree(directory_path)
+     os.makedirs(directory_path)
+
+     if inputFile:
+         text = extract_text_from_file(inputFile.name)
+     else:
+         text = raw_text
+     sentences = extrac(extract_and_convert(text))
+     GROUP_SIZE = groupsize
+     for i in range(0, len(sentences), GROUP_SIZE):
+         group = sentences[i:i + GROUP_SIZE]
+         if spealerList == "":
+             spealerList = "无"
+         result = generate_audio_and_srt_for_group(group, directory_path, i // GROUP_SIZE + 1, 44100, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, spealerList, silenceTime)
+         if not torch.cuda.is_available():
+             # Without CUDA, only the first group is synthesized and returned.
+             return result
+     return result
+
+ def infer_simple(
+     text,
+     sdp_ratio,
+     noise_scale,
+     noise_scale_w,
+     length_scale,
+     sid,
+     style_text=None,
+     style_weight=0.7,
+ ):
+     if is_chinese(text) or is_japanese(text):
+         if len(text) > 1:
+             language = "JP" if is_japanese(text) else "ZH"
+             bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
+                 text,
+                 language,
+                 hps,
+                 device,
+                 style_text="",
+                 style_weight=0,
+             )
+             with torch.no_grad():
+                 x_tst = phones.to(device).unsqueeze(0)
+                 tones = tones.to(device).unsqueeze(0)
+                 lang_ids = lang_ids.to(device).unsqueeze(0)
+                 bert = bert.to(device).unsqueeze(0)
+                 ja_bert = ja_bert.to(device).unsqueeze(0)
+                 en_bert = en_bert.to(device).unsqueeze(0)
+                 x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
+                 # emo = emo.to(device).unsqueeze(0)
+                 del phones
+                 speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
+                 audio = (
+                     net_g.infer(
+                         x_tst,
+                         x_tst_lengths,
+                         speakers,
+                         tones,
+                         lang_ids,
+                         bert,
+                         ja_bert,
+                         en_bert,
+                         sdp_ratio=sdp_ratio,
+                         noise_scale=noise_scale,
+                         noise_scale_w=noise_scale_w,
+                         length_scale=length_scale,
+                     )[0][0, 0]
+                     .data.cpu()
+                     .float()
+                     .numpy()
+                 )
+                 del (
+                     x_tst,
+                     tones,
+                     lang_ids,
+                     bert,
+                     x_tst_lengths,
+                     speakers,
+                     ja_bert,
+                     en_bert,
+                 )  # , emo
+                 if torch.cuda.is_available():
+                     torch.cuda.empty_cache()
+                 return audio
+
+ if __name__ == "__main__":
+     languages = ["Auto", "ZH", "JP"]
+     modelPaths = []
+     for dirpath, dirnames, filenames in os.walk("Data/Chinese/models/"):
+         for filename in filenames:
+             modelPaths.append(os.path.join(dirpath, filename))
+     hps = utils.get_hparams_from_file("Data/Chinese/config.json")
+     net_g = get_net_g(
+         model_path="Data/Chinese/models/G_80000.pth", device=device, hps=hps
+     )
+     speaker_ids = hps.data.spk2id
+     speakers = list(speaker_ids.keys())
+     with gr.Blocks() as app:
+         gr.Markdown(value="""
+ [日语特化版(推荐)](https://huggingface.co/spaces/Mahiruoshi/BangStarlight),国内可用连接: https://mahiruoshi-BangStarlight.hf.space/\n
+ [假名标注版](https://huggingface.co/spaces/Mahiruoshi/MyGO_VIts-bert),国内可用连接: https://mahiruoshi-MyGO-VIts-bert.hf.space/\n
+ 该界面的真实链接(国内可用): https://mahiruoshi-bangdream-bert-vits2.hf.space/\n
+ ([Bert-Vits2](https://github.com/Stardust-minus/Bert-VITS2) V2.3)少歌邦邦全员在线语音合成\n
+ [好玩的](http://love.soyorin.top/)\n
+ API: https://mahiruoshi-bert-vits2-api.hf.space/ \n
+ 调用方式: https://mahiruoshi-bert-vits2-api.hf.space/?text={{speakText}}&speaker=chosen_speaker\n
+ 推荐搭配[Legado开源阅读](https://github.com/gedoor/legado)或[聊天bot](https://github.com/Paraworks/BangDreamAi)使用\n
+ 二创请标注作者:B站@Mahiroshi: https://space.bilibili.com/19874615\n
+ 训练数据集归属:BangDream及少歌手游,提取自BestDori,[数据集获取流程](https://nijigaku.top/2023/09/29/Bestbushiroad%E8%AE%A1%E5%88%92-vits-%E9%9F%B3%E9%A2%91%E6%8A%93%E5%8F%96%E5%8F%8A%E6%95%B0%E6%8D%AE%E9%9B%86%E5%AF%B9%E9%BD%90/)\n
+ BangDream数据集下载[链接](https://huggingface.co/spaces/Mahiruoshi/BangDream-Bert-VITS2/blob/main/%E7%88%AC%E8%99%AB/SortPathUrl.txt)\n
+ !!!注意:huggingface容器仅用作展示,建议在右上角更多选项中克隆本项目或Docker运行app.py/server.py,环境参考requirements.txt\n""")
+         for band in BandList:
+             with gr.TabItem(band):
+                 for name in BandList[band]:
+                     with gr.TabItem(name):
+                         with gr.Row():
+                             with gr.Column():
+                                 with gr.Row():
+                                     gr.Markdown(
+                                         '<div align="center">'
+                                         f'<img style="width:auto;height:400px;" src="https://mahiruoshi-bangdream-bert-vits2.hf.space/file/image/{name}.png">'
+                                         '</div>'
+                                     )
+                                 length_scale = gr.Slider(
+                                     minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
+                                 )
+                                 language = gr.Dropdown(
+                                     choices=languages, value="Auto", label="语言"
+                                 )
+                                 with gr.Accordion(label="参数设定", open=True):
+                                     sdp_ratio = gr.Slider(
+                                         minimum=0, maximum=1, value=0.5, step=0.01, label="SDP/DP混合比"
+                                     )
+                                     noise_scale = gr.Slider(
+                                         minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
+                                     )
+                                     noise_scale_w = gr.Slider(
+                                         minimum=0.1, maximum=2, value=0.667, step=0.01, label="音素长度"
+                                     )
+                                     speaker = gr.Dropdown(
+                                         choices=speakers, value=name, label="说话人"
+                                     )
+                                 with gr.Accordion(label="切换模型", open=False):
+                                     modelstrs = gr.Dropdown(label="模型", choices=modelPaths, value=modelPaths[0], type="value")
+                                     btnMod = gr.Button("载入模型")
+                                     statusa = gr.TextArea(label="模型加载状态")
+                                     btnMod.click(loadmodel, inputs=[modelstrs], outputs=[statusa])
+                             with gr.Column():
+                                 text = gr.TextArea(
+                                     label="文本输入",
+                                     info="输入纯日语或者中文",
+                                     value="我是来结束这个乐队的。",
+                                 )
+                                 style_text = gr.Textbox(
+                                     label="情感辅助文本",
+                                     info="语言保持跟主文本一致,文本可以参考训练集: https://huggingface.co/spaces/Mahiruoshi/BangDream-Bert-VITS2/blob/main/filelists/Mygo.list",
+                                     placeholder="使用辅助文本的语意来辅助生成对话(语言保持与主文本相同)\n\n"
+                                     "**注意**:不要使用**指令式文本**(如:开心),要使用**带有强烈情感的文本**(如:我好快乐!!!)",
+                                 )
+                                 style_weight = gr.Slider(
+                                     minimum=0,
+                                     maximum=1,
+                                     value=0.7,
+                                     step=0.1,
+                                     label="Weight",
+                                     info="主文本和辅助文本的bert混合比率,0表示仅主文本,1表示仅辅助文本",
+                                 )
+                                 btn = gr.Button("点击生成", variant="primary")
+                                 audio_output = gr.Audio(label="Output Audio")
+                                 btntran = gr.Button("快速中翻日")
+                                 translateResult = gr.TextArea(label="使用百度翻译", placeholder="从这里复制翻译后的文本")
+                                 btntran.click(translate, inputs=[text], outputs=[translateResult])
+
+                         btn.click(
+                             infer,
+                             inputs=[
+                                 text,
+                                 sdp_ratio,
+                                 noise_scale,
+                                 noise_scale_w,
+                                 length_scale,
+                                 speaker,
+                                 style_text,
+                                 style_weight,
+                                 language,
+                             ],
+                             outputs=[audio_output],
+                         )
+         with gr.Tab("拓展功能"):
+             with gr.Row():
+                 with gr.Column():
+                     gr.Markdown(
+                         "从 <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>我的博客站点</a> 查看自制galgame使用说明"
+                     )
+                     inputFile = gr.UploadButton(label="txt文件输入")
+                     raw_text = gr.TextArea(
+                         label="文本输入",
+                         info="输入纯日语或者中文",
+                         value="つくし|我是来结束这个乐队的。",
+                     )
+                     groupSize = gr.Slider(
+                         minimum=10, maximum=1000 if torch.cuda.is_available() else 50, value=50, step=1, label="单个音频文件包含的最大字数"
+                     )
+                     silenceTime = gr.Slider(
+                         minimum=0, maximum=1, value=0.5, step=0.01, label="句子的间隔"
+                     )
+                     filepath = gr.TextArea(
+                         label="本地合成时的音频存储文件夹(会清空文件夹)",
+                         value="D:/audiobook/book1",
+                     )
+                     spealerList = gr.TextArea(
+                         label="角色对应表,左边是你想要在每一句话合成中用到的speaker(见角色清单)右边是你上传文本时分隔符左边设置的说话人:{ChoseSpeakerFromConfigList}|{SeakerInUploadText}",
+                         placeholder="ましろ|真白\n七深|七深\n透子|透子\nつくし|筑紫\n瑠唯|瑠唯\nそよ|素世\n祥子|祥子",
+                     )
+                     speaker = gr.Dropdown(
+                         choices=speakers, value="ましろ", label="选择默认说话人"
+                     )
+                 with gr.Column():
+                     sdp_ratio = gr.Slider(
+                         minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
+                     )
+                     noise_scale = gr.Slider(
+                         minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
+                     )
+                     noise_scale_w = gr.Slider(
+                         minimum=0.1, maximum=2, value=0.667, step=0.01, label="音素长度"
+                     )
+                     length_scale = gr.Slider(
+                         minimum=0.1, maximum=2, value=1, step=0.01, label="生成长度"
+                     )
+                     LastAudioOutput = gr.Audio(label="当使用cuda时才能在本地文件夹浏览全部文件")
+                     btn2 = gr.Button("点击生成", variant="primary")
+             btn2.click(
+                 audiobook,
+                 inputs=[
+                     inputFile,
+                     groupSize,
+                     speaker,
+                     sdp_ratio,
+                     noise_scale,
+                     noise_scale_w,
+                     length_scale,
+                     spealerList,
+                     silenceTime,
+                     filepath,
+                     raw_text,
+                 ],
+                 outputs=[LastAudioOutput],
+             )
+     print("推理页面已开启!")
+     app.launch(share=True)