# NOTE: page residue captured together with this file ("Spaces: Runtime error"); not part of the program.
from pathlib import Path | |
import torchaudio | |
import gradio as gr | |
import numpy as np | |
import torch | |
from hifigan.config import v1 | |
from hifigan.denoiser import Denoiser | |
from hifigan.env import AttrDict | |
from hifigan.models import Generator as HiFiGAN | |
from pflow.models.pflow_tts import pflowTTS | |
from pflow.text import text_to_sequence, sequence_to_text | |
from pflow.utils.utils import intersperse | |
from pflow.data.text_mel_datamodule import mel_spectrogram | |
from pflow.utils.model import normalize | |
# Checkpoint files used by this app: the pflow acoustic model and the HiFi-GAN vocoder.
PFLOW_MODEL_PATH = 'checkpoint_epoch=649.ckpt'
VOCODER_MODEL_PATH = 'g_00140000_m'

# Reference prompt: 'prompt.wav' is loaded once, passed through a -32 dB Vol
# transform and converted to a mel spectrogram that is handed to the model on
# every synthesis call.
# NOTE(review): `sr` is never used; the value 22050 passed below is hard-coded,
# so the file is presumably expected to already be 22.05 kHz - confirm.
wav, sr = torchaudio.load('prompt.wav')
transform = torchaudio.transforms.Vol(gain=-32, gain_type="db")

# Positional arguments are presumably n_fft, num_mels, sampling_rate, hop_size,
# win_size, fmin, fmax - confirm against pflow.data.text_mel_datamodule.
# The trailing slice keeps at most the first 264 entries along the last axis.
prompt = mel_spectrogram(
    transform(wav), 1024, 80, 22050, 256, 1024, 0, 8000, center=False
)[:, :, :264]
def process_text(text: str, device: torch.device):
    """Convert raw text into the tensors consumed by the acoustic model.

    The text is cleaned with the ``ukr_cleaners`` pipeline by
    ``text_to_sequence``, and the resulting ids are passed through
    ``intersperse(..., 0)`` before being turned into a tensor.

    Args:
        text: Text to synthesise.
        device: Device on which the tensors are created.

    Returns:
        A dict with the keys ``x_orig`` (the input text), ``x`` (long tensor
        of ids with a leading batch axis of size 1), ``x_lengths`` (long
        tensor holding the length of ``x`` along its last axis) and
        ``x_phones`` (``sequence_to_text`` applied to the ids in ``x``).
    """
    token_ids = intersperse(text_to_sequence(text, ["ukr_cleaners"]), 0)
    x = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)
    return {
        "x_orig": text,
        "x": x,
        "x_lengths": x_lengths,
        "x_phones": sequence_to_text(x[0].tolist()),
    }
def load_hifigan(checkpoint_path, device):
    """Build the HiFi-GAN generator and load its weights for inference.

    Args:
        checkpoint_path: Path to a checkpoint whose ``"generator"`` entry
            holds the generator state dict.
        device: Device the generator is moved to and the checkpoint is
            mapped onto.

    Returns:
        The generator in eval mode with weight normalisation removed.
    """
    generator = HiFiGAN(AttrDict(v1)).to(device)
    # NOTE: torch.load unpickles the file - only load trusted checkpoints.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    generator.load_state_dict(checkpoint["generator"])
    generator.eval()
    generator.remove_weight_norm()
    return generator
def to_waveform(mel, vocoder, denoiser=None):
    """Turn a mel spectrogram into a waveform tensor on the CPU.

    The vocoder is run without autograd tracking, so the returned tensor
    never requires grad and can be converted with ``.numpy()`` directly.

    Args:
        mel: Mel spectrogram passed to ``vocoder`` as-is.
        vocoder: Callable mapping ``mel`` to audio.
        denoiser: Optional callable invoked as
            ``denoiser(audio, strength=0.00025)`` on the squeezed audio.

    Returns:
        The audio clamped to ``[-1, 1]`` (before optional denoising), moved
        to the CPU, with all size-1 dimensions squeezed out.
    """
    # Inference only: without this the output would track gradients of the
    # vocoder parameters and `.numpy()` on it would raise.
    with torch.inference_mode():
        audio = vocoder(mel).clamp(-1, 1)
        if denoiser is not None:
            audio = denoiser(audio.squeeze(), strength=0.00025)
        return audio.cpu().squeeze()
def get_device(*, force_cpu=False):
    """Pick the torch device to run on and report the choice on stdout.

    Args:
        force_cpu: When true, use the CPU even if CUDA is available.

    Returns:
        ``torch.device("cuda")`` if CUDA is available and ``force_cpu`` is
        false, otherwise ``torch.device("cpu")``.
    """
    if torch.cuda.is_available() and not force_cpu:
        print("[+] GPU Available! Using GPU")
        return torch.device("cuda")

    print("[-] GPU not available or forced CPU run! Using CPU")
    return torch.device("cpu")
# Everything below is loaded once at import time and shared by all requests.
device = get_device()

# Acoustic model, restored from the checkpoint and switched to eval mode.
model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
model.eval()

# Vocoder plus the denoiser built on top of it.
vocoder = load_hifigan(VOCODER_MODEL_PATH, device)
denoiser = Denoiser(vocoder, mode="zeros")
@torch.inference_mode()
def synthesise(text, temperature, speed):
    """Synthesise speech for ``text``; this is the Gradio callback.

    Runs entirely in inference mode so no autograd state is kept and the
    resulting waveform can be converted to NumPy.

    Args:
        text: Text to synthesise; more than 1000 characters is rejected.
        temperature: Passed to ``model.synthesise`` as ``temperature``.
        speed: Speaking-rate factor; ``1 / speed`` is passed as
            ``length_scale``.

    Returns:
        A tuple ``(phones, (22050, waveform))``: ``phones`` is every second
        element of the phonemised text starting at index 1 (presumably
        dropping the symbols inserted by ``intersperse`` - confirm), and
        ``waveform`` is the generated audio as a NumPy array paired with the
        sample rate 22050.

    Raises:
        gr.Error: If ``text`` is longer than 1000 characters or contains
            nothing but whitespace.
    """
    if len(text) > 1000:
        raise gr.Error("Текст повинен бути коротшим за 1000 символів.")

    cleaned = text.strip()
    if not cleaned:
        # Nothing to synthesise; fail with a readable message instead of
        # handing an empty sequence to the model.
        raise gr.Error("Введіть текст для синтезу.")

    text_processed = process_text(cleaned, device)
    output = model.synthesise(
        text_processed["x"].to(device),
        text_processed["x_lengths"].to(device),
        n_timesteps=40,
        temperature=temperature,
        length_scale=1 / speed,
        prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
        guidance_scale=1.0,
    )
    waveform = to_waveform(output["mel"], vocoder, denoiser)

    return text_processed['x_phones'][1::2], (22050, waveform.numpy())
# Markdown shown in the Gradio interface; it names the two checkpoints in use.
description = f'''
# Експериментальна апка для генерації аудіо з тексту.
pflow checkpoint {PFLOW_MODEL_PATH}
vocoder: HIFIGAN(трейнутий на датасеті, з нуля) - {VOCODER_MODEL_PATH}
'''
if __name__ == "__main__":
    # Web UI: text + temperature + speed in, phonemised text + audio out.
    demo = gr.Interface(
        fn=synthesise,
        title='',
        description=description,
        inputs=[
            gr.Text(label='Текст для синтезу:', lines=5, max_lines=10),
            gr.Slider(minimum=0.0, maximum=1.0, label="Температура", value=0.4),
            gr.Slider(minimum=0.6, maximum=2.0, label="Швидкість", value=1.0),
        ],
        outputs=[
            gr.Text(label='Фонемізований текст:', lines=5),
            gr.Audio(
                label="Згенероване аудіо:",
                autoplay=False,
                streaming=False,
                type="numpy",
            ),
        ],
        # Manual flagging: one button that records the sample as "negative".
        allow_flagging='manual',
        flagging_options=[("Якщо дуже погане аудіо, тисни цю кнопку.", "negative")],
        # NOTE(review): no `examples` are passed, so this presumably has no
        # effect until examples are added - confirm.
        cache_examples=True,
    )
    # At most 20 queued requests; up to 4 handled concurrently by default.
    demo.queue(max_size=20, default_concurrency_limit=4)
    # Bind to all interfaces; no public share link.
    demo.launch(share=False, server_name="0.0.0.0")