NeuCoSVC-Colab

Running

File size: 3,746 Bytes

cfdc687

import os
import json
import random
import argparse
from joblib import Parallel, delayed
from tqdm import tqdm
from pathlib import Path


def GetMetaInfo(wav_path):
    '''
    Build the metadata entry for a single wav file.

    The wav path is taken relative to the module-level `data_root`, and that
    relative path is re-rooted under `wavlm_dir`, `pitch_dir` and `ld_dir`
    with the suffixes '.pt', '.npy' and '.npy' respectively. Each of the
    three derived files is asserted to exist.

    Args:
        wav_path (Path): Path of the wav file; must be located under `data_root`.

    Returns:
        list[str]: [wav path, wavlm feature path, pitch path, loudness path].

    Raises:
        AssertionError: If one of the derived feature files is not a file.
    '''
    rel = wav_path.relative_to(data_root)

    # One (directory, suffix) pair per feature, in the order they are reported.
    feature_layout = ((wavlm_dir, '.pt'), (pitch_dir, '.npy'), (ld_dir, '.npy'))
    feature_paths = [(root/rel).with_suffix(suffix) for root, suffix in feature_layout]

    for candidate in feature_paths:
        assert os.path.isfile(candidate), f'{candidate} does not exist.'

    return [str(wav_path)] + [str(candidate) for candidate in feature_paths]


def SplitDataset(wav_list:list[Path], train_valid_ratio=0.9, test_spk_list=('M26','M27','W46','W47'), seed=None):
    '''
    Split the dataset into train set, valid set, and test set.
    By default, it considers the OpenSinger dataset's 26th and 27th male singers (M26, M27) and
    46th and 47th female singers (W46, W47) as the test set.
    The remaining singers' audio files are shuffled and divided into the train set and the valid set
    (9:1 with the default ratio).

    The singer ID of a file is the first character of its grandparent directory name followed by
    the part of the file stem before the first underscore.

    Args:
        wav_list (list[Path]): List of Path objects representing the paths to the wav files.
        train_valid_ratio (float, optional): Fraction of the non-test files assigned to the train set;
            the rest form the valid set. Defaults to 0.9.
        test_spk_list (Sequence[str], optional): Singer IDs whose files go to the test set.
            Defaults to ('M26', 'M27', 'W46', 'W47').
        seed (int, optional): If given, the shuffle uses a private random generator seeded with this
            value so the split is reproducible. If None (default), the global `random` state is used.

    Returns:
        Tuple[list[Path], list[Path], list[Path]]: Tuple containing the train set, valid set, and test set as lists of Path objects.

    '''
    train_list = []
    test_list = []

    for wav_file in wav_list:
        # Slice instead of indexing: a path without a grandparent directory name
        # yields '' here rather than raising IndexError.
        singer = wav_file.parent.parent.name[:1] + wav_file.stem.split('_')[0]
        if singer in test_spk_list:
            test_list.append(wav_file)
        else:
            train_list.append(wav_file)

    if seed is None:
        random.shuffle(train_list)
    else:
        # A dedicated generator keeps the global random state untouched.
        random.Random(seed).shuffle(train_list)

    train_valid_split = int(len(train_list) * train_valid_ratio)

    return train_list[:train_valid_split], train_list[train_valid_split:], test_list


def GenMetadata(data_root, wav_list, mode):
    '''
    Collect the metadata entry of every wav file and dump them to a json file.

    `GetMetaInfo` is run on each element of `wav_list` through a joblib
    `Parallel` runner with 10 jobs (progress shown via tqdm), and the
    collected entries are written to `<data_root>/<mode>.json`.

    Args:
        data_root (Path): Directory in which the json file is written.
        wav_list (list[Path]): Wav files to describe.
        mode (str): Name of the subset; used as the json file stem.
    '''
    runner = Parallel(n_jobs=10)
    entries = runner(delayed(GetMetaInfo)(wav_file) for wav_file in tqdm(wav_list))

    output_path = data_root/f'{mode}.json'
    with open(output_path, 'w') as handle:
        json.dump(entries, handle)


def main(args):
    '''
    Resolve the dataset directories, split the wav files and write the metadata files.

    Sets the module-level `data_root`, `wavlm_dir`, `pitch_dir` and `ld_dir`
    (read by `GetMetaInfo`). A feature directory that is not given on the
    command line falls back to a sub-directory of `data_root`:
    'wavlm_features', 'pitch' and 'loudness' respectively.

    Args:
        args: Parsed arguments providing `data_root`, `wavlm_dir`, `pitch_dir` and `ld_dir`.
    '''
    global data_root, wavlm_dir, pitch_dir, ld_dir

    data_root = Path(args.data_root)

    if args.wavlm_dir is None:
        wavlm_dir = data_root/'wavlm_features'
    else:
        wavlm_dir = Path(args.wavlm_dir)

    if args.pitch_dir is None:
        pitch_dir = data_root/'pitch'
    else:
        pitch_dir = Path(args.pitch_dir)

    if args.ld_dir is None:
        ld_dir = data_root/'loudness'
    else:
        ld_dir = Path(args.ld_dir)

    # Every wav file found recursively below data_root takes part in the split.
    all_wavs = list(data_root.rglob('*.wav'))
    train_list, valid_list, test_list = SplitDataset(all_wavs)

    for subset, mode in ((train_list, 'train'), (valid_list, 'valid'), (test_list, 'test')):
        GenMetadata(data_root, subset, mode)


if __name__ == '__main__':
    # Command-line entry point: only --data_root is mandatory; the feature
    # directories are optional and left as None when omitted.
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_root', type=str, required=True, help='Directory of audios for the dataset.')

    optional_dirs = (
        ('--wavlm_dir', 'Directory of wavlm features for the dataset.'),
        ('--pitch_dir', 'Directory of pitch for the dataset.'),
        ('--ld_dir', 'Directory of loudness for the dataset.'),
    )
    for flag, description in optional_dirs:
        parser.add_argument(flag, type=str, help=description)

    args = parser.parse_args()
    main(args)