import torch
from transformers import pipeline
import gradio as gr

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pipe_dict = {
    "original_pipe": pipeline(
        "text-to-speech", model="kakao-enterprise/vits-ljs", device=device
    ),
}

# vits-ljs is a single-speaker checkpoint, so only one output slot is needed.
max_speakers = 1


# Inference
def generate_audio(text):
    output = pipe_dict["original_pipe"](text)
    # Returning a gr.Audio component updates the (single) output slot,
    # making it visible and filling it with the generated waveform.
    return gr.Audio(
        value=(output["sampling_rate"], output["audio"].squeeze()),
        type="numpy",
        autoplay=False,
        label='Prediction from the original checkpoint "kakao-enterprise/vits-ljs"',
        show_label=True,
        visible=True,
    )


css = """
#container {
    margin: 0 auto;
    max-width: 80rem;
}
#intro {
    max-width: 100%;
    text-align: center;
    margin: 0 auto;
}
"""

# Gradio Blocks demo
with gr.Blocks(css=css) as demo_blocks:
    with gr.Row():
        with gr.Column():
            inp_text = gr.Textbox(
                label="Input Text",
                info="What sentence would you like to synthesise?",
            )
            btn = gr.Button("Generate Audio!")
        with gr.Column():
            # One hidden audio slot per speaker; generate_audio reveals it.
            outputs = []
            for i in range(max_speakers):
                out_audio = gr.Audio(
                    type="numpy",
                    autoplay=False,
                    label=f"Generated Audio - speaker {i}",
                    show_label=True,
                    visible=False,
                )
                outputs.append(out_audio)

    btn.click(generate_audio, [inp_text], outputs)

demo_blocks.queue().launch()
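
# Quick sanity check (a minimal sketch, run outside the Gradio app, since
# launch() blocks): the same pipeline can be exercised directly and the
# waveform written to disk with scipy. The file name "sample.wav" is an
# arbitrary choice, not part of the demo above.
#
#   import scipy.io.wavfile
#
#   out = pipe_dict["original_pipe"]("Hello from VITS!")
#   scipy.io.wavfile.write(
#       "sample.wav", rate=out["sampling_rate"], data=out["audio"].squeeze()
#   )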