# NeuCoSVC-Colab / dataset / metadata.py
# Uploaded via huggingface_hub (commit cfdc687)
import os
import json
import random
import argparse
from joblib import Parallel, delayed
from tqdm import tqdm
from pathlib import Path
def GetMetaInfo(wav_path, root=None, wavlm=None, pitch=None, loudness=None):
    '''
    Collect the paths of the feature files that accompany one wav file.

    The directory arguments default to the module-level globals configured in
    main(), so the existing call sites (which pass only ``wav_path``) keep
    working; passing them explicitly makes the function usable standalone.

    Args:
        wav_path (Path): Path to the audio file, located under the data root.
        root (Path, optional): Dataset root. Defaults to global ``data_root``.
        wavlm (Path, optional): WavLM feature dir. Defaults to global ``wavlm_dir``.
        pitch (Path, optional): Pitch dir. Defaults to global ``pitch_dir``.
        loudness (Path, optional): Loudness dir. Defaults to global ``ld_dir``.

    Returns:
        list[str]: [wav path, wavlm .pt path, pitch .npy path, loudness .npy path].

    Raises:
        FileNotFoundError: If any companion feature file is missing.
    '''
    root = data_root if root is None else Path(root)
    wavlm = wavlm_dir if wavlm is None else Path(wavlm)
    pitch = pitch_dir if pitch is None else Path(pitch)
    loudness = ld_dir if loudness is None else Path(loudness)
    relative_path = Path(wav_path).relative_to(root)
    wavlm_path = (wavlm / relative_path).with_suffix('.pt')
    pitch_path = (pitch / relative_path).with_suffix('.npy')
    ld_path = (loudness / relative_path).with_suffix('.npy')
    # Raise an explicit error instead of `assert`: asserts are stripped under
    # `python -O`, which would let missing feature files pass silently.
    for feat_path in (wavlm_path, pitch_path, ld_path):
        if not feat_path.is_file():
            raise FileNotFoundError(f'{feat_path} does not exist.')
    return [str(wav_path), str(wavlm_path), str(pitch_path), str(ld_path)]
def SplitDataset(wav_list: list[Path], train_valid_ratio=0.9,
                 test_spk_list=('M26', 'M27', 'W46', 'W47')):
    '''
    Split the dataset into train set, valid set, and test set.

    By default, it considers the OpenSinger dataset's 26th and 27th male
    singers (M26, M27) and 46th and 47th female singers (W46, W47) as the
    test set. The remaining singers' audio files are randomly divided into
    the train set and the valid set in a 9:1 ratio.

    Note: the shuffle uses the global `random` state; seed it beforehand if
    a reproducible split is required.

    Args:
        wav_list (list[Path]): List of Path objects representing the paths to the wav files.
        train_valid_ratio (float, optional): Ratio of the dataset to be used for training and validation. Defaults to 0.9.
        test_spk_list (Sequence[str], optional): Speaker IDs to be included in the test set.
            Defaults to ('M26', 'M27', 'W46', 'W47'). A tuple default avoids the
            mutable-default-argument pitfall.

    Returns:
        Tuple[list[Path], list[Path], list[Path]]: Tuple containing the train set, valid set, and test set as lists of Path objects.
    '''
    train_list = []
    test_list = []
    for wav_file in wav_list:
        # Speaker id = gender initial of the grandparent dir ('M'/'W') plus
        # the leading number of the file stem, e.g. ManRaw/26_x/26_0.wav -> M26.
        singer = wav_file.parent.parent.name[0] + wav_file.stem.split('_')[0]
        if singer not in test_spk_list:
            train_list.append(wav_file)
        else:
            test_list.append(wav_file)
    random.shuffle(train_list)
    train_valid_split = int(len(train_list) * train_valid_ratio)
    train_list, valid_list = train_list[:train_valid_split], train_list[train_valid_split:]
    return train_list, valid_list, test_list
def GenMetadata(data_root, wav_list, mode):
    '''
    Write the metadata JSON file for one dataset split.

    Resolves the companion feature paths for every wav file in parallel
    (10 joblib workers, with a tqdm progress bar) and dumps the resulting
    list of path tuples to ``<data_root>/<mode>.json``.

    Args:
        data_root (Path): Dataset root; the output file is written here.
        wav_list (list[Path]): Wav files belonging to this split.
        mode (str): Split name ('train', 'valid' or 'test'); names the file.
    '''
    tasks = (delayed(GetMetaInfo)(wav) for wav in tqdm(wav_list))
    entries = Parallel(n_jobs=10)(tasks)
    out_path = data_root / f'{mode}.json'
    with open(out_path, 'w') as fp:
        json.dump(entries, fp)
    return
def main(args):
    '''
    Build train/valid/test metadata JSON files for the dataset.

    Publishes the data root and feature directories as module-level globals
    (read by GetMetaInfo), gathers every wav under the root, splits them by
    speaker, and writes one metadata file per split.

    Args:
        args (argparse.Namespace): Parsed CLI arguments with ``data_root``
            and optional ``wavlm_dir`` / ``pitch_dir`` / ``ld_dir``.
    '''
    global data_root, wavlm_dir, pitch_dir, ld_dir
    data_root = Path(args.data_root)
    # Fall back to conventional sub-directories under the data root whenever
    # a feature directory is not supplied explicitly.
    wavlm_dir = data_root / 'wavlm_features' if args.wavlm_dir is None else Path(args.wavlm_dir)
    pitch_dir = data_root / 'pitch' if args.pitch_dir is None else Path(args.pitch_dir)
    ld_dir = data_root / 'loudness' if args.ld_dir is None else Path(args.ld_dir)
    all_wavs = list(data_root.rglob('*.wav'))
    for split_name, split_files in zip(('train', 'valid', 'test'), SplitDataset(all_wavs)):
        GenMetadata(data_root, split_files, split_name)
    return
if __name__ == '__main__':
    # Command-line entry point: the data root is required; the three feature
    # directories are optional and default inside main().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--data_root',
        required=True, type=str, help='Directory of audios for the dataset.'
    )
    for flag, noun in (('wavlm_dir', 'wavlm features'),
                       ('pitch_dir', 'pitch'),
                       ('ld_dir', 'loudness')):
        parser.add_argument(
            f'--{flag}',
            type=str, help=f'Directory of {noun} for the dataset.'
        )
    main(parser.parse_args())