|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import sys

# Put the Matcha-TTS checkout on the import path; presumably needed by the
# cosyvoice imports further down, so it must run before them.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(f'{ROOT_DIR}/third_party/Matcha-TTS')

import argparse
import gradio as gr
import numpy as np
import torch

# Restrict torch to one intra-op thread as soon as torch is available.
torch.set_num_threads(1)

import torchaudio
import random
import librosa
from transformers import pipeline
import subprocess
from scipy.signal import resample

import logging

# Only let matplotlib log at WARNING and above.
logging.getLogger('matplotlib').setLevel(logging.WARNING)

from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav, speed_change
|
|
|
|
|
|
|
|
|
def generate_seed():
    """Draw a new random seed and package it as an update payload.

    Returns:
        dict: ``{"__type__": "update", "value": seed}`` where ``seed`` is an
        integer drawn with ``random.randint(1, 100000000)`` (both ends
        inclusive).
    """
    return {"__type__": "update", "value": random.randint(1, 100000000)}
|
|
|
def set_all_random_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Seed value. Anything ``int()`` accepts is allowed (for example
            ``42.0`` coming from a numeric UI field); it is converted once so
            that every library receives the same integer.

    Raises:
        TypeError, ValueError: If ``seed`` cannot be converted by ``int()``.
    """
    # np.random.seed() refuses float input, so normalise before fanning out.
    seed = int(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
|
|
|
# Upper bound for the absolute peak of the processed prompt audio.
max_val = 0.8


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim silence, cap the peak level and append a short silent tail.

    Args:
        speech: Audio to clean up. The code calls ``.abs()`` on the trimmed
            result and concatenates it along ``dim=1`` with a ``(1, N)`` zero
            tensor, so a one-row 2-D torch tensor is presumably expected;
            confirm against ``load_wav``.
        top_db: Passed to ``librosa.effects.trim`` as ``top_db``.
        hop_length: Passed to ``librosa.effects.trim`` as ``hop_length``.
        win_length: Passed to ``librosa.effects.trim`` as ``frame_length``.

    Returns:
        The trimmed audio, rescaled so its absolute peak does not exceed
        ``max_val``, followed by ``int(target_sr * 0.2)`` zero samples.
    """
    trimmed, _ = librosa.effects.trim(
        speech,
        top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length,
    )
    peak = trimmed.abs().max()
    if peak > max_val:
        trimmed = trimmed / peak * max_val
    # ``target_sr`` is a module-level global assigned by the script entry point.
    silent_tail = torch.zeros(1, int(target_sr * 0.2))
    return torch.concat([trimmed, silent_tail], dim=1)
|
|
|
def generate_audio(tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, select_which):
    """Synthesize ``tts_text`` using the selected prompt recording as voice reference.

    Args:
        tts_text: Text to synthesize.
        prompt_text: Transcript of the prompt recording.
        prompt_wav_upload: Uploaded prompt audio (a file path in the UI
            wiring), or ``None``.
        prompt_wav_record: Microphone prompt audio (a file path in the UI
            wiring), or ``None``.
        seed: Random seed applied right before inference.
        select_which: Chosen source: ``"上傳檔案"`` (upload) or ``"麥克風"``
            (microphone).

    Returns:
        tuple: ``(target_sr, audio_data)`` where ``audio_data`` is the
        flattened NumPy array taken from ``output['tts_speech']``.

    Raises:
        gr.Error: If the selected source has no audio (or no source is
            selected), instead of failing later inside ``load_wav``.
    """
    if select_which == "上傳檔案" and prompt_wav_upload is not None:
        prompt_wav = prompt_wav_upload
    elif select_which == "麥克風" and prompt_wav_record is not None:
        prompt_wav = prompt_wav_record
    else:
        prompt_wav = None

    if prompt_wav is None:
        # Surface a readable message in the UI rather than an opaque loader error.
        raise gr.Error("No valid prompt audio detected. Please upload or record a prompt first.")

    prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
    set_all_random_seed(seed)
    output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)

    # Playback speed was pinned to 1, so the former resampling branch could
    # never run; the synthesized waveform is returned unchanged.
    audio_data = output['tts_speech'].numpy().flatten()
    return (target_sr, audio_data)
|
|
|
|
|
def generate_text(prompt_wav_upload, prompt_wav_record, select_which):
    """Transcribe the selected prompt recording with the global ASR pipeline.

    Args:
        prompt_wav_upload: Uploaded prompt audio, or ``None``.
        prompt_wav_record: Microphone prompt audio, or ``None``.
        select_which: Chosen source: ``"上傳檔案"`` (upload) or ``"麥克風"``
            (microphone).

    Returns:
        str: The ``'text'`` field of the ASR result, or
        ``"No valid input detected."`` when the selected source has no audio
        (or no source is selected).
    """
    if select_which == "上傳檔案" and prompt_wav_upload is not None:
        prompt_wav = prompt_wav_upload
    elif select_which == "麥克風" and prompt_wav_record is not None:
        prompt_wav = prompt_wav_record
    else:
        prompt_wav = None
    print(select_which)

    if prompt_wav:
        results = asr_pipeline(prompt_wav)
        return results['text']
    return "No valid input detected."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def demo_get_audio(tts_text):
    """Load ``sample.wav`` (relative path) instead of synthesizing anything.

    Args:
        tts_text: Ignored; the function never reads it.

    Returns:
        tuple: ``(sample_rate, speech)``, i.e. the two values returned by
        ``torchaudio.load`` in swapped order.
    """
    waveform, rate = torchaudio.load('sample.wav')
    return rate, waveform
|
def main():
    """Build the BreezyVoice Gradio interface, wire its events and launch it.

    The page has a header with usage notes, then three steps: provide a
    prompt recording and its transcript, enter the text to synthesize, and
    generate the audio. Relies on the module-level callbacks
    ``generate_text``, ``generate_seed`` and ``generate_audio``.
    """
    with gr.Blocks(title="BreezyVoice 語音合成系統", theme="default") as demo:
        gr.Markdown("# BreezyVoice 語音合成系統")
        gr.Markdown(
            """### 僅需5秒語音樣本,就可輸出擬真人聲。"""
        )
        with gr.Row():
            gr.Image(value="https://huggingface.co/spaces/Splend1dchan/BreezyVoice-Playground/resolve/main/flowchart.png", interactive=False, scale=3)
            gr.Markdown(
                """#### 此沙盒使用 Huggingface CPU,請預期大於200 秒的推理時間,您可以考慮以下方法加速:
1. **強烈建議**複製這個 Space(Duplicate this space),以分散流量!
2. 複製至本地GPU執行(請參考[指南](https://huggingface.co/docs/hub/en/spaces-overview))或使用[kaggle](https://www.kaggle.com/code/a24998667/breezyvoice-playground)
3. 複製至本地CPU執行(請參考[指南](https://huggingface.co/docs/hub/en/spaces-overview))

為了加快推理速度,g2pw注音標註並未被啟動。

免責聲明:此沙盒在一次性容器地端執行,關閉後檔案將遭到刪除。此沙盒不屬於聯發創新基地,聯發創新基地無法獲得任何使用者輸入。"""
            )

        with gr.Column():
            # Step 1: prompt audio (upload or microphone) and its transcript.
            gr.Markdown("### 步驟 1. 音訊樣本輸入 & 音訊樣本文本輸入")
            gr.Markdown("選擇prompt音訊檔案或錄製prompt音訊 (5~15秒),並手動校對自動產生的音訊樣本文本。")
            prompt_wav_upload = gr.Audio(
                sources='upload',
                type='filepath',
                label='選擇prompt音訊檔案(確保取樣率不低於16khz)'
            )
            prompt_wav_record = gr.Audio(
                sources='microphone',
                type='filepath',
                label='錄製prompt音訊檔案'
            )

            with gr.Blocks():
                select_which = gr.Radio(["上傳檔案", "麥克風"], label="音訊來源", interactive=True)
            with gr.Blocks():
                prompt_text = gr.Textbox(
                    label="音訊樣本文本輸入(此欄位應與音檔內容完全相同)",
                    lines=2,
                    placeholder="音訊樣本文本"
                )

            # Touching either audio widget switches the source selector to it.
            # NOTE(review): if the selector already holds that value, its
            # change event may not fire again for a new file; confirm.
            prompt_wav_upload.change(
                fn=lambda upload: "上傳檔案",
                inputs=[prompt_wav_upload],
                outputs=select_which
            )
            prompt_wav_record.change(
                fn=lambda recording: "麥克風",
                inputs=[prompt_wav_record],
                outputs=select_which
            )

            # A change of source runs ASR and fills in the transcript box.
            select_which.change(
                fn=generate_text,
                inputs=[prompt_wav_upload, prompt_wav_record, select_which],
                outputs=prompt_text
            )

            # Step 2: text to synthesize.
            gr.Markdown("### 步驟 2.合成文本輸入")
            tts_text = gr.Textbox(
                label="輸入想要合成的文本",
                lines=2,
                placeholder="請輸入想要合成的文本...",
                value="你好,歡迎光臨"
            )

            # Step 3: synthesis controls and output.
            gr.Markdown("### 步驟 3. 合成音訊")

            with gr.Accordion("進階設定", open=False):
                seed = gr.Number(value=0, label="隨機推理種子")
                seed_button = gr.Button(value="\U0001F3B2生成隨機推理種子\U0001F3B2")

            generate_button = gr.Button("生成音訊")
            audio_output = gr.Audio(label="合成音訊")

            seed_button.click(fn=generate_seed, inputs=[], outputs=seed)
            generate_button.click(
                fn=generate_audio,
                inputs=[tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, select_which],
                outputs=audio_output
            )

    # Queue at most 10 requests, processing one at a time by default.
    demo.queue(max_size=10, default_concurrency_limit=1)
    demo.launch()
|
|
|
if __name__ == '__main__':
    # The names assigned here are module-level globals read by the callbacks
    # above (cosyvoice, asr_pipeline, prompt_sr, target_sr).
    cosyvoice = CosyVoice('Splend1dchan/BreezyVoice')
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",
        tokenizer="openai/whisper-tiny",
        # Use the first CUDA device when one exists; otherwise run on CPU
        # (-1) so CPU-only hosts do not fail at start-up.
        device=0 if torch.cuda.is_available() else -1,
    )
    sft_spk = cosyvoice.list_avaliable_spks()
    # Prompt audio is loaded at 16 kHz; synthesized audio is returned at 22.05 kHz.
    prompt_sr, target_sr = 16000, 22050
    default_data = np.zeros(target_sr)
    main()
|
|