SoniTranslate_translate_audio_of_a_video_content

Running

App Files Files Community

SoniTranslate_translate_audio_of_a_video_content / voice_main.py

r3gm

v0.5.0

b152010 6 months ago

raw

history blame contribute delete

21.7 kB

	from soni_translate.logging_setup import logger
	import torch
	import gc
	import numpy as np
	import os
	import shutil
	import warnings
	import threading
	from tqdm import tqdm
	from lib.infer_pack.models import (
	SynthesizerTrnMs256NSFsid,
	SynthesizerTrnMs256NSFsid_nono,
	SynthesizerTrnMs768NSFsid,
	SynthesizerTrnMs768NSFsid_nono,
	)
	from lib.audio import load_audio
	import soundfile as sf
	import edge_tts
	import asyncio
	from soni_translate.utils import remove_directory_contents, create_directories
	from scipy import signal
	from time import time as ttime
	import faiss
	from vci_pipeline import VC, change_rms, bh, ah
	import librosa

	warnings.filterwarnings("ignore")


	class Config:
	def __init__(self, only_cpu=False):
	self.device = "cuda:0"
	self.is_half = True
	self.n_cpu = 0
	self.gpu_name = None
	self.gpu_mem = None
	(
	self.x_pad,
	self.x_query,
	self.x_center,
	self.x_max
	) = self.device_config(only_cpu)

	def device_config(self, only_cpu) -> tuple:
	if torch.cuda.is_available() and not only_cpu:
	i_device = int(self.device.split(":")[-1])
	self.gpu_name = torch.cuda.get_device_name(i_device)
	if (
	("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
	or "P40" in self.gpu_name.upper()
	or "1060" in self.gpu_name
	or "1070" in self.gpu_name
	or "1080" in self.gpu_name
	):
	logger.info(
	"16/10 Series GPUs and P40 excel "
	"in single-precision tasks."
	)
	self.is_half = False
	else:
	self.gpu_name = None
	self.gpu_mem = int(
	torch.cuda.get_device_properties(i_device).total_memory
	/ 1024
	/ 1024
	/ 1024
	+ 0.4
	)
	elif torch.backends.mps.is_available() and not only_cpu:
	logger.info("Supported N-card not found, using MPS for inference")
	self.device = "mps"
	else:
	logger.info("No supported N-card found, using CPU for inference")
	self.device = "cpu"
	self.is_half = False

	if self.n_cpu == 0:
	self.n_cpu = os.cpu_count()

	if self.is_half:
	# 6GB VRAM configuration
	x_pad = 3
	x_query = 10
	x_center = 60
	x_max = 65
	else:
	# 5GB VRAM configuration
	x_pad = 1
	x_query = 6
	x_center = 38
	x_max = 41

	if self.gpu_mem is not None and self.gpu_mem <= 4:
	x_pad = 1
	x_query = 5
	x_center = 30
	x_max = 32

	logger.info(
	f"Config: Device is {self.device}, "
	f"half precision is {self.is_half}"
	)

	return x_pad, x_query, x_center, x_max


	BASE_DOWNLOAD_LINK = "https://huggingface.co/r3gm/sonitranslate_voice_models/resolve/main/"
	BASE_MODELS = [
	"hubert_base.pt",
	"rmvpe.pt"
	]
	BASE_DIR = "."


	def load_hu_bert(config):
	from fairseq import checkpoint_utils
	from soni_translate.utils import download_manager

	for id_model in BASE_MODELS:
	download_manager(
	os.path.join(BASE_DOWNLOAD_LINK, id_model), BASE_DIR
	)

	models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
	["hubert_base.pt"],
	suffix="",
	)
	hubert_model = models[0]
	hubert_model = hubert_model.to(config.device)
	if config.is_half:
	hubert_model = hubert_model.half()
	else:
	hubert_model = hubert_model.float()
	hubert_model.eval()

	return hubert_model


	def load_trained_model(model_path, config):

	if not model_path:
	raise ValueError("No model found")

	logger.info("Loading %s" % model_path)
	cpt = torch.load(model_path, map_location="cpu")
	tgt_sr = cpt["config"][-1]
	cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
	if_f0 = cpt.get("f0", 1)
	if if_f0 == 0:
	# protect to 0.5 need?
	pass

	version = cpt.get("version", "v1")
	if version == "v1":
	if if_f0 == 1:
	net_g = SynthesizerTrnMs256NSFsid(
	*cpt["config"], is_half=config.is_half
	)
	else:
	net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
	elif version == "v2":
	if if_f0 == 1:
	net_g = SynthesizerTrnMs768NSFsid(
	*cpt["config"], is_half=config.is_half
	)
	else:
	net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
	del net_g.enc_q

	net_g.load_state_dict(cpt["weight"], strict=False)
	net_g.eval().to(config.device)

	if config.is_half:
	net_g = net_g.half()
	else:
	net_g = net_g.float()

	vc = VC(tgt_sr, config)
	n_spk = cpt["config"][-3]

	return n_spk, tgt_sr, net_g, vc, cpt, version


	class ClassVoices:
	def __init__(self, only_cpu=False):
	self.model_config = {}
	self.config = None
	self.only_cpu = only_cpu

	def apply_conf(
	self,
	tag="base_model",
	file_model="",
	pitch_algo="pm",
	pitch_lvl=0,
	file_index="",
	index_influence=0.66,
	respiration_median_filtering=3,
	envelope_ratio=0.25,
	consonant_breath_protection=0.33,
	resample_sr=0,
	file_pitch_algo="",
	):

	if not file_model:
	raise ValueError("Model not found")

	if file_index is None:
	file_index = ""

	if file_pitch_algo is None:
	file_pitch_algo = ""

	if not self.config:
	self.config = Config(self.only_cpu)
	self.hu_bert_model = None
	self.model_pitch_estimator = None

	self.model_config[tag] = {
	"file_model": file_model,
	"pitch_algo": pitch_algo,
	"pitch_lvl": pitch_lvl, # no decimal
	"file_index": file_index,
	"index_influence": index_influence,
	"respiration_median_filtering": respiration_median_filtering,
	"envelope_ratio": envelope_ratio,
	"consonant_breath_protection": consonant_breath_protection,
	"resample_sr": resample_sr,
	"file_pitch_algo": file_pitch_algo,
	}
	return f"CONFIGURATION APPLIED FOR {tag}: {file_model}"

	def infer(
	self,
	task_id,
	params,
	# load model
	n_spk,
	tgt_sr,
	net_g,
	pipe,
	cpt,
	version,
	if_f0,
	# load index
	index_rate,
	index,
	big_npy,
	# load f0 file
	inp_f0,
	# audio file
	input_audio_path,
	overwrite,
	):

	f0_method = params["pitch_algo"]
	f0_up_key = params["pitch_lvl"]
	filter_radius = params["respiration_median_filtering"]
	resample_sr = params["resample_sr"]
	rms_mix_rate = params["envelope_ratio"]
	protect = params["consonant_breath_protection"]

	if not os.path.exists(input_audio_path):
	raise ValueError(
	"The audio file was not found or is not "
	f"a valid file: {input_audio_path}"
	)

	f0_up_key = int(f0_up_key)

	audio = load_audio(input_audio_path, 16000)

	# Normalize audio
	audio_max = np.abs(audio).max() / 0.95
	if audio_max > 1:
	audio /= audio_max

	times = [0, 0, 0]

	# filters audio signal, pads it, computes sliding window sums,
	# and extracts optimized time indices
	audio = signal.filtfilt(bh, ah, audio)
	audio_pad = np.pad(
	audio, (pipe.window // 2, pipe.window // 2), mode="reflect"
	)
	opt_ts = []
	if audio_pad.shape[0] > pipe.t_max:
	audio_sum = np.zeros_like(audio)
	for i in range(pipe.window):
	audio_sum += audio_pad[i:i - pipe.window]
	for t in range(pipe.t_center, audio.shape[0], pipe.t_center):
	opt_ts.append(
	t
	- pipe.t_query
	+ np.where(
	np.abs(audio_sum[t - pipe.t_query: t + pipe.t_query])
	== np.abs(audio_sum[t - pipe.t_query: t + pipe.t_query]).min()
	)[0][0]
	)

	s = 0
	audio_opt = []
	t = None
	t1 = ttime()

	sid_value = 0
	sid = torch.tensor(sid_value, device=pipe.device).unsqueeze(0).long()

	# Pads audio symmetrically, calculates length divided by window size.
	audio_pad = np.pad(audio, (pipe.t_pad, pipe.t_pad), mode="reflect")
	p_len = audio_pad.shape[0] // pipe.window

	# Estimates pitch from audio signal
	pitch, pitchf = None, None
	if if_f0 == 1:
	pitch, pitchf = pipe.get_f0(
	input_audio_path,
	audio_pad,
	p_len,
	f0_up_key,
	f0_method,
	filter_radius,
	inp_f0,
	)
	pitch = pitch[:p_len]
	pitchf = pitchf[:p_len]
	if pipe.device == "mps":
	pitchf = pitchf.astype(np.float32)
	pitch = torch.tensor(
	pitch, device=pipe.device
	).unsqueeze(0).long()
	pitchf = torch.tensor(
	pitchf, device=pipe.device
	).unsqueeze(0).float()

	t2 = ttime()
	times[1] += t2 - t1
	for t in opt_ts:
	t = t // pipe.window * pipe.window
	if if_f0 == 1:
	pitch_slice = pitch[
	:, s // pipe.window: (t + pipe.t_pad2) // pipe.window
	]
	pitchf_slice = pitchf[
	:, s // pipe.window: (t + pipe.t_pad2) // pipe.window
	]
	else:
	pitch_slice = None
	pitchf_slice = None

	audio_slice = audio_pad[s:t + pipe.t_pad2 + pipe.window]
	audio_opt.append(
	pipe.vc(
	self.hu_bert_model,
	net_g,
	sid,
	audio_slice,
	pitch_slice,
	pitchf_slice,
	times,
	index,
	big_npy,
	index_rate,
	version,
	protect,
	)[pipe.t_pad_tgt:-pipe.t_pad_tgt]
	)
	s = t

	pitch_end_slice = pitch[
	:, t // pipe.window:
	] if t is not None else pitch
	pitchf_end_slice = pitchf[
	:, t // pipe.window:
	] if t is not None else pitchf

	audio_opt.append(
	pipe.vc(
	self.hu_bert_model,
	net_g,
	sid,
	audio_pad[t:],
	pitch_end_slice,
	pitchf_end_slice,
	times,
	index,
	big_npy,
	index_rate,
	version,
	protect,
	)[pipe.t_pad_tgt:-pipe.t_pad_tgt]
	)

	audio_opt = np.concatenate(audio_opt)
	if rms_mix_rate != 1:
	audio_opt = change_rms(
	audio, 16000, audio_opt, tgt_sr, rms_mix_rate
	)
	if resample_sr >= 16000 and tgt_sr != resample_sr:
	audio_opt = librosa.resample(
	audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
	)
	audio_max = np.abs(audio_opt).max() / 0.99
	max_int16 = 32768
	if audio_max > 1:
	max_int16 /= audio_max
	audio_opt = (audio_opt * max_int16).astype(np.int16)
	del pitch, pitchf, sid
	if torch.cuda.is_available():
	torch.cuda.empty_cache()

	if tgt_sr != resample_sr >= 16000:
	final_sr = resample_sr
	else:
	final_sr = tgt_sr

	"""
	"Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
	times[0],
	times[1],
	times[2],
	), (final_sr, audio_opt)

	"""

	if overwrite:
	output_audio_path = input_audio_path # Overwrite
	else:
	basename = os.path.basename(input_audio_path)
	dirname = os.path.dirname(input_audio_path)

	new_basename = basename.split(
	'.')[0] + "_edited." + basename.split('.')[-1]
	new_path = os.path.join(dirname, new_basename)
	logger.info(str(new_path))

	output_audio_path = new_path

	# Save file
	sf.write(
	file=output_audio_path,
	samplerate=final_sr,
	data=audio_opt
	)

	self.model_config[task_id]["result"].append(output_audio_path)
	self.output_list.append(output_audio_path)

	def make_test(
	self,
	tts_text,
	tts_voice,
	model_path,
	index_path,
	transpose,
	f0_method,
	):

	folder_test = "test"
	tag = "test_edge"
	tts_file = "test/test.wav"
	tts_edited = "test/test_edited.wav"

	create_directories(folder_test)
	remove_directory_contents(folder_test)

	if "SET_LIMIT" == os.getenv("DEMO"):
	if len(tts_text) > 60:
	tts_text = tts_text[:60]
	logger.warning("DEMO; limit to 60 characters")

	try:
	asyncio.run(edge_tts.Communicate(
	tts_text, "-".join(tts_voice.split('-')[:-1])
	).save(tts_file))
	except Exception as e:
	raise ValueError(
	"No audio was received. Please change the "
	f"tts voice for {tts_voice}. Error: {str(e)}"
	)

	shutil.copy(tts_file, tts_edited)

	self.apply_conf(
	tag=tag,
	file_model=model_path,
	pitch_algo=f0_method,
	pitch_lvl=transpose,
	file_index=index_path,
	index_influence=0.66,
	respiration_median_filtering=3,
	envelope_ratio=0.25,
	consonant_breath_protection=0.33,
	)

	self(
	audio_files=tts_edited,
	tag_list=tag,
	overwrite=True
	)

	return tts_edited, tts_file

	def run_threads(self, threads):
	# Start threads
	for thread in threads:
	thread.start()

	# Wait for all threads to finish
	for thread in threads:
	thread.join()

	gc.collect()
	torch.cuda.empty_cache()

	def unload_models(self):
	self.hu_bert_model = None
	self.model_pitch_estimator = None
	gc.collect()
	torch.cuda.empty_cache()

	def __call__(
	self,
	audio_files=[],
	tag_list=[],
	overwrite=False,
	parallel_workers=1,
	):
	logger.info(f"Parallel workers: {str(parallel_workers)}")

	self.output_list = []

	if not self.model_config:
	raise ValueError("No model has been configured for inference")

	if isinstance(audio_files, str):
	audio_files = [audio_files]
	if isinstance(tag_list, str):
	tag_list = [tag_list]

	if not audio_files:
	raise ValueError("No audio found to convert")
	if not tag_list:
	tag_list = [list(self.model_config.keys())[-1]] * len(audio_files)

	if len(audio_files) > len(tag_list):
	logger.info("Extend tag list to match audio files")
	extend_number = len(audio_files) - len(tag_list)
	tag_list.extend([tag_list[0]] * extend_number)

	if len(audio_files) < len(tag_list):
	logger.info("Cut list tags")
	tag_list = tag_list[:len(audio_files)]

	tag_file_pairs = list(zip(tag_list, audio_files))
	sorted_tag_file = sorted(tag_file_pairs, key=lambda x: x[0])

	# Base params
	if not self.hu_bert_model:
	self.hu_bert_model = load_hu_bert(self.config)

	cache_params = None
	threads = []
	progress_bar = tqdm(total=len(tag_list), desc="Progress")
	for i, (id_tag, input_audio_path) in enumerate(sorted_tag_file):

	if id_tag not in self.model_config.keys():
	logger.info(
	f"No configured model for {id_tag} with {input_audio_path}"
	)
	continue

	if (
	len(threads) >= parallel_workers
	or cache_params != id_tag
	and cache_params is not None
	):

	self.run_threads(threads)
	progress_bar.update(len(threads))

	threads = []

	if cache_params != id_tag:

	self.model_config[id_tag]["result"] = []

	# Unload previous
	(
	n_spk,
	tgt_sr,
	net_g,
	pipe,
	cpt,
	version,
	if_f0,
	index_rate,
	index,
	big_npy,
	inp_f0,
	) = [None] * 11
	gc.collect()
	torch.cuda.empty_cache()

	# Model params
	params = self.model_config[id_tag]

	model_path = params["file_model"]
	f0_method = params["pitch_algo"]
	file_index = params["file_index"]
	index_rate = params["index_influence"]
	f0_file = params["file_pitch_algo"]

	# Load model
	(
	n_spk,
	tgt_sr,
	net_g,
	pipe,
	cpt,
	version
	) = load_trained_model(model_path, self.config)
	if_f0 = cpt.get("f0", 1) # pitch data

	# Load index
	if os.path.exists(file_index) and index_rate != 0:
	try:
	index = faiss.read_index(file_index)
	big_npy = index.reconstruct_n(0, index.ntotal)
	except Exception as error:
	logger.error(f"Index: {str(error)}")
	index_rate = 0
	index = big_npy = None
	else:
	logger.warning("File index not found")
	index_rate = 0
	index = big_npy = None

	# Load f0 file
	inp_f0 = None
	if os.path.exists(f0_file):
	try:
	with open(f0_file, "r") as f:
	lines = f.read().strip("\n").split("\n")
	inp_f0 = []
	for line in lines:
	inp_f0.append([float(i) for i in line.split(",")])
	inp_f0 = np.array(inp_f0, dtype="float32")
	except Exception as error:
	logger.error(f"f0 file: {str(error)}")

	if "rmvpe" in f0_method:
	if not self.model_pitch_estimator:
	from lib.rmvpe import RMVPE

	logger.info("Loading vocal pitch estimator model")
	self.model_pitch_estimator = RMVPE(
	"rmvpe.pt",
	is_half=self.config.is_half,
	device=self.config.device
	)

	pipe.model_rmvpe = self.model_pitch_estimator

	cache_params = id_tag

	# self.infer(
	# id_tag,
	# params,
	# # load model
	# n_spk,
	# tgt_sr,
	# net_g,
	# pipe,
	# cpt,
	# version,
	# if_f0,
	# # load index
	# index_rate,
	# index,
	# big_npy,
	# # load f0 file
	# inp_f0,
	# # output file
	# input_audio_path,
	# overwrite,
	# )

	thread = threading.Thread(
	target=self.infer,
	args=(
	id_tag,
	params,
	# loaded model
	n_spk,
	tgt_sr,
	net_g,
	pipe,
	cpt,
	version,
	if_f0,
	# loaded index
	index_rate,
	index,
	big_npy,
	# loaded f0 file
	inp_f0,
	# audio file
	input_audio_path,
	overwrite,
	)
	)

	threads.append(thread)

	# Run last
	if threads:
	self.run_threads(threads)

	progress_bar.update(len(threads))
	progress_bar.close()

	final_result = []
	valid_tags = set(tag_list)
	for tag in valid_tags:
	if (
	tag in self.model_config.keys()
	and "result" in self.model_config[tag].keys()
	):
	final_result.extend(self.model_config[tag]["result"])

	return final_result