Spaces:

liuhaozhe6788
/

CelebChat

Runtime error

CelebChat / rtvc /synthesizer /preprocess.py

lhzstar

initial commits

6bc94ac about 1 year ago

19.5 kB

	from multiprocessing.pool import Pool
	from synthesizer import audio
	from functools import partial
	from itertools import chain, groupby
	from encoder import inference as encoder_infer
	from pathlib import Path
	from utils import logmmse
	from tqdm import tqdm
	import numpy as np
	import librosa
	import random


	def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams,
	datasets_name: str, subfolders: str, no_alignments=False):

	# Gather the input directories of LibriSpeeech
	dataset_root = datasets_root.joinpath(datasets_name)
	input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
	print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
	assert all(input_dir.exists() for input_dir in input_dirs)

	train_input_dirs = input_dirs[: -1]
	dev_input_dirs = input_dirs[-1: ]

	# Create the output directories for each output file type
	train_out_dir = out_dir.joinpath("train")
	train_out_dir.mkdir(exist_ok=True)
	train_out_dir.joinpath("mels").mkdir(exist_ok=True)
	train_out_dir.joinpath("audio").mkdir(exist_ok=True)

	# Create a metadata file
	train_metadata_fpath = train_out_dir.joinpath("train.txt")
	train_metadata_file = train_metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")

	dev_out_dir = out_dir.joinpath("dev")
	dev_out_dir.mkdir(exist_ok=True)
	dev_out_dir.joinpath("mels").mkdir(exist_ok=True)
	dev_out_dir.joinpath("audio").mkdir(exist_ok=True)

	# Create a metadata file
	dev_metadata_fpath = dev_out_dir.joinpath("dev.txt")
	dev_metadata_file = dev_metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")

	# Preprocess the train dataset
	train_speaker_dirs = list(chain.from_iterable(train_input_dir.glob("*") for train_input_dir in train_input_dirs))
	func = partial(preprocess_speaker, out_dir=train_out_dir, skip_existing=skip_existing,
	hparams=hparams, no_alignments=no_alignments)
	job = Pool(n_processes).imap(func, train_speaker_dirs)
	for speaker_metadata in tqdm(job, datasets_name, len(train_speaker_dirs), unit="speakers"):
	for metadatum in speaker_metadata:
	train_metadata_file.write("\|".join(str(x) for x in metadatum) + "\n")
	train_metadata_file.close()

	# Verify the contents of the metadata file
	with train_metadata_fpath.open("r", encoding="utf-8") as train_metadata_file:
	metadata = [line.split("\|") for line in train_metadata_file]
	mel_frames = sum([int(m[4]) for m in metadata])
	timesteps = sum([int(m[3]) for m in metadata])
	sample_rate = hparams.sample_rate
	hours = (timesteps / sample_rate) / 3600
	print("The train dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
	(len(metadata), mel_frames, timesteps, hours))
	print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
	print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
	print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))

	# Preprocess the dev dataset
	dev_speaker_dirs = list(chain.from_iterable(dev_input_dir.glob("*") for dev_input_dir in dev_input_dirs))
	func = partial(preprocess_speaker, out_dir=dev_out_dir, skip_existing=skip_existing,
	hparams=hparams, no_alignments=no_alignments)
	job = Pool(n_processes).imap(func, dev_speaker_dirs)
	for speaker_metadata in tqdm(job, datasets_name, len(dev_speaker_dirs), unit="speakers"):
	for metadatum in speaker_metadata:
	dev_metadata_file.write("\|".join(str(x) for x in metadatum) + "\n")
	dev_metadata_file.close()

	# Verify the contents of the metadata file
	with dev_metadata_fpath.open("r", encoding="utf-8") as dev_metadata_file:
	metadata = [line.split("\|") for line in dev_metadata_file]
	mel_frames = sum([int(m[4]) for m in metadata])
	timesteps = sum([int(m[3]) for m in metadata])
	sample_rate = hparams.sample_rate
	hours = (timesteps / sample_rate) / 3600
	print("The dev dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
	(len(metadata), mel_frames, timesteps, hours))
	print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
	print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
	print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))


	def preprocess_vctk(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams,
	datasets_name: str, subfolders: str, no_alignments=True):
	# TODO:Gather the input directories of VCTK
	dataset_root = datasets_root.joinpath(datasets_name)
	input_dir = dataset_root.joinpath(subfolders)
	print("Using data from:" + str(input_dir))
	assert input_dir.exists()
	paths = [input_dir.rglob(".flac")]

	# train dev audio data split
	train_input_fpaths = []
	dev_input_fpaths = []

	pairs = sorted([(p.parts[-2].split('_')[0], p) for p in paths])
	del paths

	for _, group in groupby(pairs, lambda pair: pair[0]):
	paths = sorted([p for _, p in group if "mic1.flac" in str(p)]) # only get mic1 flac file
	random.seed(0)
	random.shuffle(paths)
	n = round(len(paths) * 0.9)
	train_input_fpaths.extend(paths[:n])
	# dev dataset has the same speakers as train dataset
	dev_input_fpaths.extend(paths[n:])

	# Create the output directories for each output file type
	train_out_dir = out_dir.joinpath("train")
	train_out_dir.mkdir(exist_ok=True)
	train_out_dir.joinpath("mels").mkdir(exist_ok=True)
	train_out_dir.joinpath("audio").mkdir(exist_ok=True)

	dev_out_dir = out_dir.joinpath("dev")
	dev_out_dir.mkdir(exist_ok=True)
	dev_out_dir.joinpath("mels").mkdir(exist_ok=True)
	dev_out_dir.joinpath("audio").mkdir(exist_ok=True)

	# Preprocess the train dataset
	preprocess_data(train_input_fpaths, mode="train", out_dir=train_out_dir, skip_existing=skip_existing, hparams=hparams, no_alignments=no_alignments)

	# Preprocess the dev dataset
	preprocess_data(dev_input_fpaths, mode="dev", out_dir=dev_out_dir, skip_existing=skip_existing, hparams=hparams, no_alignments=no_alignments)


	def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
	metadata = []
	for book_dir in speaker_dir.glob("*"):
	if no_alignments:
	# Gather the utterance audios and texts
	# LibriTTS uses .wav but we will include extensions for compatibility with other datasets
	extensions = [".wav", ".flac", "*.mp3"]
	for extension in extensions:
	wav_fpaths = book_dir.glob(extension)

	for wav_fpath in wav_fpaths:
	# Load the audio waveform
	wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
	if hparams.rescale:
	wav = wav / np.abs(wav).max() * hparams.rescaling_max

	# Get the corresponding text
	# Check for .txt (for compatibility with other datasets)
	text_fpath = wav_fpath.with_suffix(".txt")
	if not text_fpath.exists():
	# Check for .normalized.txt (LibriTTS)
	text_fpath = wav_fpath.with_suffix(".normalized.txt")
	assert text_fpath.exists()
	with text_fpath.open("r") as text_file:
	text = "".join([line for line in text_file])
	text = text.replace("\"", "")
	text = text.strip()

	# Process the utterance
	metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
	skip_existing, hparams))
	else:
	# Process alignment file (LibriSpeech support)
	# Gather the utterance audios and texts
	try:
	alignments_fpath = next(book_dir.glob("*.alignment.txt"))
	with alignments_fpath.open("r") as alignments_file:
	alignments = [line.rstrip().split(" ") for line in alignments_file]
	except StopIteration:
	# A few alignment files will be missing
	continue

	# Iterate over each entry in the alignments file
	for wav_fname, words, end_times in alignments:
	wav_fpath = book_dir.joinpath(wav_fname + ".flac")
	assert wav_fpath.exists()
	words = words.replace("\"", "").split(",")
	end_times = list(map(float, end_times.replace("\"", "").split(",")))

	# Process each sub-utterance
	wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
	for i, (wav, text) in enumerate(zip(wavs, texts)):
	sub_basename = "%s_%02d" % (wav_fname, i)
	metadata.append(process_utterance(wav, text, out_dir, sub_basename,
	skip_existing, hparams))

	return [m for m in metadata if m is not None]


	def preprocess_data(wav_fpaths, mode, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
	assert mode in ["train", "dev"]
	# Create a metadata file
	metadata_fpath = out_dir.joinpath(f"{mode}.txt")
	metadata_file = metadata_fpath.open("a", encoding="utf-8")
	if no_alignments:
	for wav_fpath in tqdm(wav_fpaths, desc=mode):
	# Load the audio waveform
	wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
	if hparams.rescale:
	wav = wav / np.abs(wav).max() * hparams.rescaling_max

	# Get the corresponding text
	# Check for .txt (for compatibility with other datasets)
	base_name = "_".join(wav_fpath.name.split(".")[0].split("_")[: -1]) + ".txt"
	text_fpath = wav_fpath.with_name(base_name)

	if not text_fpath.exists():
	continue
	with text_fpath.open("r") as text_file:
	text = "".join([line for line in text_file])
	text = text.replace("\"", "")
	text = text.strip()

	# Process the utterance
	metadata = process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name), skip_existing, hparams, trim_silence=False)

	if metadata is not None:
	metadata_file.write("\|".join(str(x) for x in metadata) + "\n")
	metadata_file.close()

	# Verify the contents of the metadata file
	with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
	metadata = [line.split("\|") for line in metadata_file]
	mel_frames = sum([int(m[4]) for m in metadata])
	timesteps = sum([int(m[3]) for m in metadata])
	sample_rate = hparams.sample_rate
	hours = (timesteps / sample_rate) / 3600
	print(f"The {mode} dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
	(len(metadata), mel_frames, timesteps, hours))
	print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
	print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
	print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))


	def split_on_silences(wav_fpath, words, end_times, hparams):
	# Load the audio waveform
	wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
	if hparams.rescale:
	wav = wav / np.abs(wav).max() * hparams.rescaling_max

	words = np.array(words)
	start_times = np.array([0.0] + end_times[:-1])
	end_times = np.array(end_times)
	assert len(words) == len(end_times) == len(start_times)
	assert words[0] == "" and words[-1] == ""

	# Find pauses that are too long
	mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
	mask[0] = mask[-1] = True
	breaks = np.where(mask)[0]

	# Profile the noise from the silences and perform noise reduction on the waveform
	silence_times = [[start_times[i], end_times[i]] for i in breaks]
	silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
	noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
	if len(noisy_wav) > hparams.sample_rate * 0.02:
	profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
	wav = logmmse.denoise(wav, profile, eta=0)

	# Re-attach segments that are too short
	segments = list(zip(breaks[:-1], breaks[1:]))
	segment_durations = [start_times[end] - end_times[start] for start, end in segments]
	i = 0
	while i < len(segments) and len(segments) > 1:
	if segment_durations[i] < hparams.utterance_min_duration:
	# See if the segment can be re-attached with the right or the left segment
	left_duration = float("inf") if i == 0 else segment_durations[i - 1]
	right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
	joined_duration = segment_durations[i] + min(left_duration, right_duration)

	# Do not re-attach if it causes the joined utterance to be too long
	if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
	i += 1
	continue

	# Re-attach the segment with the neighbour of shortest duration
	j = i - 1 if left_duration <= right_duration else i
	segments[j] = (segments[j][0], segments[j + 1][1])
	segment_durations[j] = joined_duration
	del segments[j + 1], segment_durations[j + 1]
	else:
	i += 1

	# Split the utterance
	segment_times = [[end_times[start], start_times[end]] for start, end in segments]
	segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
	wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
	texts = [" ".join(words[start + 1:end]).replace(" ", " ") for start, end in segments]

	# # DEBUG: play the audio segments (run with -n=1)
	# import sounddevice as sd
	# if len(wavs) > 1:
	# print("This sentence was split in %d segments:" % len(wavs))
	# else:
	# print("There are no silences long enough for this sentence to be split:")
	# for wav, text in zip(wavs, texts):
	# # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
	# # when playing them. You shouldn't need to do that in your parsers.
	# wav = np.concatenate((wav, [0] * 16000))
	# print("\t%s" % text)
	# sd.play(wav, 16000, blocking=True)
	# print("")

	return wavs, texts


	def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
	skip_existing: bool, hparams, trim_silence=True):
	## FOR REFERENCE:
	# For you not to lose your head if you ever wish to change things here or implement your own
	# synthesizer.
	# - Both the audios and the mel spectrograms are saved as numpy arrays
	# - There is no processing done to the audios that will be saved to disk beyond volume
	# normalization (in split_on_silences)
	# - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
	# is why we re-apply it on the audio on the side of the vocoder.
	# - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
	# without extra padding. This means that you won't have an exact relation between the length
	# of the wav and of the mel spectrogram. See the vocoder data loader.


	# Skip existing utterances if needed
	mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
	wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
	if skip_existing and mel_fpath.exists() and wav_fpath.exists():
	return None

	# Trim silence
	wav = encoder_infer.preprocess_wav(wav, normalize=False, trim_silence=trim_silence)

	# Skip utterances that are too short
	if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
	return None

	# Compute the mel spectrogram
	mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
	mel_frames = mel_spectrogram.shape[1]

	# Skip utterances that are too long
	if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
	return None

	# Write the spectrogram, embed and audio to disk
	np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
	np.save(wav_fpath, wav, allow_pickle=False)

	# Return a tuple describing this training example
	return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text


	def embed_utterance(fpaths, encoder_model_fpath):
	if not encoder_infer.is_loaded():
	encoder_infer.load_model(encoder_model_fpath)

	# Compute the speaker embedding of the utterance
	wav_fpath, embed_fpath = fpaths
	wav = np.load(wav_fpath)
	wav = encoder_infer.preprocess_wav(wav)
	embed = encoder_infer.embed_utterance(wav)
	np.save(embed_fpath, embed, allow_pickle=False)


	def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
	# create train embeddings
	train_wav_dir = synthesizer_root.joinpath("train/audio")
	train_metadata_fpath = synthesizer_root.joinpath("train/train.txt")
	assert train_wav_dir.exists() and train_metadata_fpath.exists()
	train_embed_dir = synthesizer_root.joinpath("train/embeds")
	train_embed_dir.mkdir(exist_ok=True)

	# Gather the input wave filepath and the target output embed filepath
	with train_metadata_fpath.open("r") as metadata_file:
	metadata = [line.split("\|") for line in metadata_file]
	fpaths = [(train_wav_dir.joinpath(m[0]), train_embed_dir.joinpath(m[2])) for m in metadata]

	# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
	# Embed the utterances in separate threads
	func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
	job = Pool(n_processes).imap(func, fpaths)
	list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))

	# create dev embeddings
	dev_wav_dir = synthesizer_root.joinpath("dev/audio")
	dev_metadata_fpath = synthesizer_root.joinpath("dev/dev.txt")
	assert dev_wav_dir.exists() and dev_metadata_fpath.exists()
	dev_embed_dir = synthesizer_root.joinpath("dev/embeds")
	dev_embed_dir.mkdir(exist_ok=True)

	# Gather the input wave filepath and the target output embed filepath
	with dev_metadata_fpath.open("r") as metadata_file:
	metadata = [line.split("\|") for line in metadata_file]
	fpaths = [(dev_wav_dir.joinpath(m[0]), dev_embed_dir.joinpath(m[2])) for m in metadata]

	# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
	# Embed the utterances in separate threads
	func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
	job = Pool(n_processes).imap(func, fpaths)
	list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))