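# Spaces: Runtime error
# (The line above is apparently page-status text captured along with the source; it is not part of the program.)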
from pathlib import Path | |
import torchaudio | |
import gradio as gr | |
import numpy as np | |
import torch | |
import json | |
from hifigan.config import v1 | |
from hifigan.denoiser import Denoiser | |
from hifigan.env import AttrDict | |
from hifigan.models import Generator as HiFiGAN | |
from pflow.models.pflow_tts import pflowTTS | |
from pflow.text import text_to_sequence, sequence_to_text | |
from pflow.utils.utils import intersperse | |
from pflow.data.text_mel_datamodule import mel_spectrogram | |
from pflow.utils.model import normalize | |
from vocos import Vocos | |
# Locations of the model weights used by this app.
# The pflow and HiFi-GAN entries are local checkpoint paths; the two Vocos
# entries are identifiers handed to ``Vocos.from_pretrained``.
PFLOW_MODEL_PATH = "checkpoints/checkpoint_epoch=649.ckpt"
# Alternative pflow checkpoint (disabled): 'checkpoint_m_epoch=054.ckpt'
VOCODER22_MODEL_PATH = "BSC-LT/vocos-mel-22khz"
VOCODER44_MODEL_PATH = "patriotyk/vocos-mel-hifigan-compat-44100khz"
HIFIGAN_MODEL_PATH = "checkpoints/g_00120000"
# Build the reference prompt once at import time: load the prompt clip,
# attenuate it by 32 dB, turn it into a mel spectrogram and keep the first
# 264 entries of the third axis (presumably time frames - TODO confirm).
transform = torchaudio.transforms.Vol(gain=-32, gain_type="db")
wav, sr = torchaudio.load('prompt22050.wav')

# Positional settings forwarded to ``mel_spectrogram`` after the waveform.
# NOTE(review): presumably n_fft, n_mels, sample rate, hop, window, fmin,
# fmax - confirm against ``mel_spectrogram``'s signature. ``sr`` returned by
# the loader is never compared with the 22050 passed here.
_PROMPT_MEL_ARGS = (1024, 80, 22050, 256, 1024, 0, 8000)

prompt = mel_spectrogram(
    transform(wav),
    *_PROMPT_MEL_ARGS,
    center=False,
)[:, :, :264]
def process_text(text: str, device: torch.device):
    """Convert raw text into the tensors passed to the acoustic model.

    The text is turned into an id sequence by ``text_to_sequence`` using the
    ``ukr_cleaners`` cleaner, ``intersperse`` is applied with the value ``0``,
    and the result is wrapped in a leading axis of size one.

    Args:
        text: Text to synthesise.
        device: Device on which the returned tensors are created.

    Returns:
        A dict with the keys
        ``x_orig`` (the text exactly as passed in),
        ``x`` (long tensor of ids with a leading axis of size one),
        ``x_lengths`` (long tensor holding the size of the last axis of ``x``),
        ``x_phones`` (``sequence_to_text`` applied to the ids in ``x``).
    """
    token_ids = intersperse(text_to_sequence(text, ["ukr_cleaners"]), 0)
    batch = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)
    lengths = torch.tensor([batch.shape[-1]], dtype=torch.long, device=device)
    phones = sequence_to_text(batch.squeeze(0).tolist())
    return {
        "x_orig": text,
        "x": batch,
        "x_lengths": lengths,
        'x_phones': phones,
    }
def load_hifigan(checkpoint_path, device):
    """Create a HiFi-GAN generator from the ``v1`` config and load its weights.

    Args:
        checkpoint_path: Path to a checkpoint whose ``"generator"`` entry
            holds the generator state dict.
        device: Device the model is moved to and the checkpoint is mapped to.

    Returns:
        The generator in eval mode with weight norm removed.
    """
    generator = HiFiGAN(AttrDict(v1)).to(device)
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    generator.load_state_dict(checkpoint["generator"])
    generator.eval()
    generator.remove_weight_norm()
    return generator
def load_vocos(checkpoint_path, config_path, device):
    """Build a Vocos model from a config file and load weights from a checkpoint.

    Args:
        checkpoint_path: Path to the checkpoint file. It may be a bare state
            dict or a mapping that stores the weights under ``'state_dict'``.
        config_path: Path passed to ``Vocos.from_hparams``.
        device: Device the model is moved to and the checkpoint is mapped to.

    Returns:
        The Vocos model in eval mode. Weights are loaded with
        ``strict=False``, so key mismatches do not raise.
    """
    vocos = Vocos.from_hparams(config_path).to(device)
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    if 'state_dict' in checkpoint:
        weights = checkpoint['state_dict']
    else:
        weights = checkpoint
    vocos.load_state_dict(weights, strict=False)
    vocos.eval()
    return vocos
def to_waveform(mel, vocoder, denoiser=None):
    """Decode a mel spectrogram into a waveform tensor on the CPU.

    Args:
        mel: Input handed to ``vocoder.decode``.
        vocoder: Object exposing a ``decode(mel)`` method that returns a tensor.
        denoiser: Accepted for call compatibility; currently not used.

    Returns:
        The decoded audio clamped to ``[-1, 1]``, moved to the CPU, with all
        size-one dimensions squeezed away.
    """
    audio = vocoder.decode(mel)
    audio = audio.clamp(-1, 1)
    return audio.cpu().squeeze()
def get_device():
    """Pick the torch device to run on and report the choice on stdout.

    Returns:
        ``torch.device("cuda")`` when CUDA is available, otherwise
        ``torch.device("cpu")``.
    """
    if not torch.cuda.is_available():
        print("[-] GPU not available or forced CPU run! Using CPU")
        return torch.device("cpu")
    print("[+] GPU Available! Using GPU")
    return torch.device("cuda")
# Everything below runs once at import time and loads all models onto the
# selected device.
device = get_device()

# Acoustic model, switched to eval mode.
model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
model.eval()

# Vocoders: one HiFi-GAN generator and two Vocos models (22.05 kHz / 44.1 kHz
# according to the sample rates they are paired with in ``synthesise``).
hifigan = load_hifigan(HIFIGAN_MODEL_PATH, device)
# NOTE(review): confirm that the installed vocos build's ``from_pretrained``
# accepts ``device_map``.
vocos_22050 = Vocos.from_pretrained(VOCODER22_MODEL_PATH, device_map=device)
# Disabled alternative for the 44.1 kHz vocoder, loading a local checkpoint:
# load_vocos('checkpoints/vocos_checkpoint_epoch=209_step=3924480_val_loss=3.7036_44100_11.ckpt', 'vocos.yaml', device)
vocos_44100 = Vocos.from_pretrained(VOCODER44_MODEL_PATH, device_map=device)

# No denoiser is configured (previously: Denoiser(vocoder, mode="zeros")).
denoiser = None
def synthesise(text, speed):
    """Synthesise speech for ``text`` with all three vocoders.

    Args:
        text: Text to synthesise. Inputs longer than 1000 characters are
            rejected; surrounding whitespace is stripped before processing.
        speed: Speaking-rate factor; the model receives ``1 / speed`` as its
            ``length_scale``.

    Returns:
        A 4-tuple matching the UI outputs:
        every second symbol of the phonemised text starting at index 1
        (presumably the symbols between the interspersed fillers - confirm
        against ``intersperse``), then ``(sample_rate, samples)`` pairs for
        Vocos 44100, Vocos 22050 and HiFi-GAN 22050. The HiFi-GAN audio is
        clamped to ``[-1, 1]`` and attenuated by 18 dB.

    Raises:
        gr.Error: If ``text`` is longer than 1000 characters.
    """
    if len(text) > 1000:
        raise gr.Error("Текст повинен бути коротшим за 1000 символів.")
    text_processed = process_text(text.strip(), device)

    # Run all inference with autograd disabled. The HiFi-GAN generator is
    # called directly and nothing here freezes its parameters, so without
    # this its output stays attached to the autograd graph and ``.numpy()``
    # raises; it also avoids building a graph for the other models.
    with torch.inference_mode():
        output = model.synthesise(
            text_processed["x"].to(device),
            text_processed["x_lengths"].to(device),
            n_timesteps=40,
            temperature=0.0,
            length_scale=1/speed,
            prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
            guidance_scale=2.0
        )
        mel = output["mel"]

        audio_vocos_22050 = vocos_22050.decode(mel).cpu().squeeze().numpy()
        audio_vocos_44100 = vocos_44100.decode(mel).cpu().squeeze().numpy()

        # Named ``attenuate`` so the module-level ``transform`` used for the
        # prompt is not shadowed.
        attenuate = torchaudio.transforms.Vol(gain=-18, gain_type="db")
        waveform_hifigan = hifigan(mel).clamp(-1, 1).cpu().squeeze()
        audio_hifigan = attenuate(waveform_hifigan).numpy()

    return (
        text_processed['x_phones'][1::2],
        (44100, audio_vocos_44100),
        (22050, audio_vocos_22050),
        (22050, audio_hifigan),
    )
# Markdown text shown in the UI; it lists which checkpoint backs each output.
description = f"""
# Експериментальна апка для генерації аудіо з тексту.
pflow checkpoint {PFLOW_MODEL_PATH}
Vocos 44100 аудіо - {VOCODER44_MODEL_PATH}
Vocos 22050 аудіо - {VOCODER22_MODEL_PATH}
HIFIGAN 22050 аудіо - {HIFIGAN_MODEL_PATH}
"""
if __name__ == "__main__":
    # Inputs: the text to synthesise and a speed slider (0.6 - 2.0, default 1.0).
    text_input = gr.Text(label='Текст для синтезу:', lines=5, max_lines=10)
    speed_input = gr.Slider(minimum=0.6, maximum=2.0, label="Швидкість", value=1.0)

    # Outputs, in the order ``synthesise`` returns them: the phonemised text
    # followed by one audio player per vocoder.
    phonemes_output = gr.Text(label='Фонемізований текст:', lines=5)
    audio_outputs = [
        gr.Audio(label=audio_label, autoplay=False, streaming=False, type="numpy")
        for audio_label in (
            "Vocos 44100 аудіо:",
            "Vocos 22050 аудіо:",
            "HIFIGAN 22050 аудіо:",
        )
    ]

    # flagging_options, article and examples are not set here.
    i = gr.Interface(
        fn=synthesise,
        description=description,
        inputs=[text_input, speed_input],
        outputs=[phonemes_output, *audio_outputs],
        allow_flagging='manual',
        cache_examples=True,
        title='',
    )
    # Queue at most 20 pending requests, with a default concurrency limit of 4.
    i.queue(max_size=20, default_concurrency_limit=4)
    # Listen on all interfaces without creating a public share link.
    i.launch(share=False, server_name="0.0.0.0")