"""Generate train/valid/test metadata JSON files for a singing dataset.

Recursively finds ``*.wav`` files under a data root, checks that the
matching WavLM-feature (``.pt``), pitch (``.npy``), and loudness
(``.npy``) files exist, splits the files by speaker, and writes
``train.json`` / ``valid.json`` / ``test.json`` into the data root.
"""
import argparse
import json
import os
import random
from pathlib import Path

from joblib import Parallel, delayed
from tqdm import tqdm


def GetMetaInfo(wav_path):
    """Build the metadata entry for one wav file.

    Relies on the module-level globals ``data_root``, ``wavlm_dir``,
    ``pitch_dir`` and ``ld_dir`` set by :func:`main`.

    Args:
        wav_path (Path): Path to a wav file located under ``data_root``.

    Returns:
        list[str]: ``[wav, wavlm, pitch, loudness]`` paths as strings.

    Raises:
        FileNotFoundError: If any of the derived feature files is missing.
    """
    relative_path = wav_path.relative_to(data_root)
    wavlm_path = (wavlm_dir / relative_path).with_suffix('.pt')
    pitch_path = (pitch_dir / relative_path).with_suffix('.npy')
    ld_path = (ld_dir / relative_path).with_suffix('.npy')
    # Raise instead of `assert` so the checks survive `python -O`.
    for feature_path in (wavlm_path, pitch_path, ld_path):
        if not feature_path.is_file():
            raise FileNotFoundError(f'{feature_path} does not exist.')
    return [str(wav_path), str(wavlm_path), str(pitch_path), str(ld_path)]


def SplitDataset(wav_list: list[Path], train_valid_ratio=0.9,
                 test_spk_list=['M26', 'M27', 'W46', 'W47']):
    '''
    Split the dataset into train set, valid set, and test set.

    By default, it considers the OpenSinger dataset's 26th and 27th male
    singers (M26, M27) and 46th and 47th female singers (W46, W47) as the
    test set. The remaining singers' audio files are randomly divided into
    the train set and the valid set in a 9:1 ratio.

    Args:
        wav_list (list[Path]): List of Path objects representing the paths
            to the wav files.
        train_valid_ratio (float, optional): Ratio of the dataset to be
            used for training and validation. Defaults to 0.9.
        test_spk_list (list[str], optional): List of speaker IDs to be
            included in the test set. Defaults to ['M26', 'M27', 'W46', 'W47'].

    Returns:
        Tuple[list[Path], list[Path], list[Path]]: Tuple containing the
        train set, valid set, and test set as lists of Path objects.
    '''
    train_list = []
    test_list = []
    for wav_file in wav_list:
        # Speaker ID = first letter of the gender folder ('M'/'W') plus the
        # singer index prefix of the filename, e.g. 'ManRaw'/'26_*' -> 'M26'.
        singer = wav_file.parent.parent.name[0] + wav_file.stem.split('_')[0]
        if singer not in test_spk_list:
            train_list.append(wav_file)
        else:
            test_list.append(wav_file)
    # Shuffle before the cut so the train/valid split is random.
    random.shuffle(train_list)
    train_valid_split = int(len(train_list) * train_valid_ratio)
    train_list, valid_list = (train_list[:train_valid_split],
                              train_list[train_valid_split:])
    return train_list, valid_list, test_list


def GenMetadata(data_root, wav_list, mode):
    '''Generate the metadata file for one dataset split.

    Args:
        data_root (Path): Directory the JSON file is written into.
        wav_list (list[Path]): Wav files belonging to this split.
        mode (str): Split name ('train', 'valid', or 'test'); used as the
            JSON filename stem.
    '''
    # Feature paths are checked in parallel; missing files raise in workers.
    results = Parallel(n_jobs=10)(
        delayed(GetMetaInfo)(wav_path) for wav_path in tqdm(wav_list)
    )
    with open(data_root / f'{mode}.json', 'w') as f:
        json.dump(results, f)
    return


def main(args):
    # GetMetaInfo reads these as globals, so they are published here.
    global data_root, wavlm_dir, pitch_dir, ld_dir
    data_root = Path(args.data_root)
    wavlm_dir = (Path(args.wavlm_dir) if args.wavlm_dir is not None
                 else data_root / 'wavlm_features')
    pitch_dir = (Path(args.pitch_dir) if args.pitch_dir is not None
                 else data_root / 'pitch')
    ld_dir = (Path(args.ld_dir) if args.ld_dir is not None
              else data_root / 'loudness')

    wav_list = list(data_root.rglob('*.wav'))
    train_list, valid_list, test_list = SplitDataset(wav_list)
    GenMetadata(data_root, train_list, 'train')
    GenMetadata(data_root, valid_list, 'valid')
    GenMetadata(data_root, test_list, 'test')
    return


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--data_root', required=True, type=str,
        help='Directory of audios for the dataset.'
    )
    parser.add_argument(
        '--wavlm_dir', type=str,
        help='Directory of wavlm features for the dataset.'
    )
    parser.add_argument(
        '--pitch_dir', type=str,
        help='Directory of pitch for the dataset.'
    )
    parser.add_argument(
        '--ld_dir', type=str,
        help='Directory of loudness for the dataset.'
    )
    args = parser.parse_args()
    main(args)