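"""Gradio demo for voice conversion.

Converts a source speaker's voice into a target speaker's voice with one of
two models: FreeVC (WavLM content features plus a speaker embedding) or
YourTTS (zero-shot voice conversion with speaker-encoder d-vectors).
"""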
import os
import sys
import subprocess

import torch
import librosa
import gradio as gr
from scipy.io.wavfile import write

# Make the local TTS checkout importable; this must happen before the TTS
# imports below if TTS is not installed globally.
TTS_PATH = "TTS/"
sys.path.append(TTS_PATH)

from TTS.utils.audio import AudioProcessor
from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.utils.speakers import SpeakerManager

import utils
from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder

OUT_PATH = 'out/'
os.makedirs(OUT_PATH, exist_ok=True)
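# YourTTS setup: config, acoustic model and checkpoint.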
TTS_SPEAKERS = "yourTTS_config/speakers.json"
TTS_LANGUAGES = "yourTTS_config/language_ids.json"
CONFIG_PATH = 'yourTTS_config/config.json'
MODEL_PATH = 'yourTTS_config/best_model.pth.tar'

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

C = load_config(CONFIG_PATH)
ap = AudioProcessor(**C.audio)

C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
# Load the checkpoint on CPU and drop the speaker-encoder weights, which are
# only needed during training.
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
model_weights = cp['model'].copy()
for key in list(model_weights.keys()):
    if "speaker_encoder" in key:
        del model_weights[key]
model.load_state_dict(model_weights)
model.eval()
if USE_CUDA:
    model = model.cuda()
# Speaker encoder used to compute d-vectors for YourTTS.
CONFIG_SE_PATH = "yourTTS_config/config_se.json"
CHECKPOINT_SE_PATH = "yourTTS_config/SE_checkpoint.pth.tar"
SE_speaker_manager = SpeakerManager(encoder_model_path=CHECKPOINT_SE_PATH,
                                    encoder_config_path=CONFIG_SE_PATH,
                                    use_cuda=USE_CUDA)
def compute_spec(ref_file):
    # Load audio at the model's sample rate and compute a linear spectrogram.
    y, _ = librosa.load(ref_file, sr=ap.sample_rate)
    spec = ap.spectrogram(y)
    spec = torch.FloatTensor(spec).unsqueeze(0)
    return spec
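# FreeVC setup: synthesizer, speaker encoder and WavLM content model.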
print("Loading FreeVC...") | |
hps = utils.get_hparams_from_file("configs/freevc.json") | |
freevc = SynthesizerTrn( | |
hps.data.filter_length // 2 + 1, | |
hps.train.segment_size // hps.data.hop_length, | |
**hps.model).to(device) | |
_ = freevc.eval() | |
_ = utils.load_checkpoint("checkpoints/freevc.pth", freevc, None) | |
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt') | |
print("Loading WavLM for content...") | |
cmodel = utils.get_cmodel(device).to(device) | |
# cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device) | |
def voice_conversion_yourtts(da, ta):
    # Loudness-normalize both clips in place and resample them to 16 kHz.
    for file in [da, ta]:
        subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27",
                        "-o", file, "-ar", "16000", "-f"])
    # d-vectors for the driving (source) and target speakers.
    target_emb = SE_speaker_manager.compute_d_vector_from_clip([ta])
    target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
    driving_emb = SE_speaker_manager.compute_d_vector_from_clip([da])
    driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
    # Convert the voice.
    driving_spec = compute_spec(da)
    y_lengths = torch.tensor([driving_spec.size(-1)])
    if USE_CUDA:
        ref_wav_voc, _, _ = model.voice_conversion(
            driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
        ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
    else:
        ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths, driving_emb, target_emb)
        ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()
    return (ap.sample_rate, ref_wav_voc)
def voice_conversion_freevc(src, tgt):
    with torch.no_grad():
        # Target speaker embedding from the silence-trimmed reference clip.
        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
        g_tgt = smodel.embed_utterance(wav_tgt)
        g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
        # Content features of the source utterance, extracted with WavLM.
        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
        wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
        c = utils.get_content(cmodel, wav_src)
        # Synthesize with the target speaker embedding and write to disk.
        audio = freevc.infer(c, g=g_tgt)
        audio = audio[0][0].data.cpu().float().numpy()
        write("out.wav", hps.data.sampling_rate, audio)
    return "out.wav"
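# Gradio UI: two tabs, one taking an uploaded source file and one taking
# microphone input. Both return the converted audio for the target speaker.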
model1 = gr.Dropdown(choices=["FreeVC", "YourTTS"], value="FreeVC", type="value", label="Model")
model2 = gr.Dropdown(choices=["FreeVC", "YourTTS"], value="FreeVC", type="value", label="Model")
audio1 = gr.Audio(label="Source Speaker - Input Audio", type='filepath')
audio2 = gr.Audio(label="Target Speaker - Input Audio", type='filepath')
# type='filepath' is required here as well: both conversion functions expect
# paths. On Gradio 3.x, use source='microphone' instead of sources=[...].
microphone = gr.Audio(label="Source Speaker - Input Audio", sources=['microphone'], type='filepath')
audio3 = gr.Audio(label="Target Speaker - Input Audio", type='filepath')
inputs_1 = [model1, audio1, audio2]
inputs_2 = [model2, microphone, audio3]
outputs_1 = gr.Audio(label="Target Speaker - Output Audio", type='filepath')
outputs_2 = gr.Audio(label="Target Speaker - Output Audio", type='filepath')
def voice_conversion(mod, sa, ta):
    # Dispatch to the model selected in the dropdown.
    if mod == 'FreeVC':
        return voice_conversion_freevc(sa, ta)
    else:
        return voice_conversion_yourtts(sa, ta)
examples_1 = [['FreeVC', 'sample_inputs/ntr.wav', 'sample_inputs/timcast1.wav'],
              ['YourTTS', 'sample_inputs/ntr.wav', 'sample_inputs/timcast1.wav']]
vc_1 = gr.Interface(
    fn=voice_conversion,
    inputs=inputs_1,
    outputs=outputs_1,
    examples=examples_1,
    description="Convert your voice to another person's!\nUpload WAV files for the source speaker and the target speaker.\n\nThis demonstration was created by T B Ramkamal in partial fulfilment of a Dual Degree Project."
)
vc_2 = gr.Interface(
    fn=voice_conversion,
    inputs=inputs_2,
    outputs=outputs_2,
    description="Convert your voice to another person's!\nRecord the source speaker with the microphone and upload a WAV file for the target speaker.\n\nThis demonstration was created by T B Ramkamal in partial fulfilment of a Dual Degree Project."
)
demo = gr.TabbedInterface([vc_1, vc_2], ["WAV Input", "Microphone Input"], title="Voice Conversion")
demo.launch(debug=True)