# NeuCoSVC-Colab / dataset / dataset.py
# Uploaded by kevinwang676 via huggingface_hub (commit cfdc687).
import numpy as np
import torch
from torch.utils.data import Dataset
import json
import random
from pathlib import Path
import soundfile as sf
class SVCDataset(Dataset):
    """Singing-voice-conversion dataset pairing raw audio excerpts with
    pre-computed WavLM features, pitch and loudness tracks.

    Each metadata entry in ``{mode}.json`` is a 4-tuple of file paths:
    (audio ``.wav``, WavLM features ``.pt``, pitch ``.npy``, loudness ``.npy``).
    WavLM features use a 20 ms hop while pitch/loudness use a 10 ms hop,
    so one WavLM frame corresponds to two pitch/loudness frames.
    """

    def __init__(self, root, n_samples, sampling_rate, hop_size, mode):
        """
        Args:
            root: directory containing the ``{mode}.json`` metadata file.
            n_samples: number of audio samples per training excerpt.
            sampling_rate: expected sampling rate of every audio file (Hz).
            hop_size: hop size in samples of the pitch/loudness features.
            mode: dataset split name, e.g. ``'train'`` — selects the JSON file.
        """
        self.root = Path(root)
        self.n_samples = n_samples
        self.sampling_rate = sampling_rate
        self.hop_size = hop_size
        # Number of pitch/loudness frames covered by one excerpt.
        self.n_frames = int(n_samples / hop_size)
        with open(self.root / f"{mode}.json") as file:
            metadata = json.load(file)
        # Materialize each (audio, wavlm, pitch, loudness) path entry as a list.
        self.metadata = [
            [audio_path, wavlm_path, pitch_path, ld_path]
            for audio_path, wavlm_path, pitch_path, ld_path in metadata
        ]
        print(mode, 'n_samples:', n_samples, 'metadata:', len(self.metadata))
        random.shuffle(self.metadata)

    def load_wav(self, audio_path):
        """Read a wav file, verify its sampling rate, and peak-normalize it
        only when it would clip (|peak| > 1.0)."""
        wav, fs = sf.read(audio_path)
        assert fs == self.sampling_rate, f'Audio {audio_path} sampling rate is not {self.sampling_rate} Hz.'
        peak = np.abs(wav).max()
        if peak > 1.0:
            wav /= peak
        return wav

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, index):
        """Return ``((wavlm, pitch, loudness), audio)`` float32 tensors for one
        randomly positioned excerpt of ``n_samples`` samples.

        Shapes: wavlm ``(1024, n_frames/2)`` (per the original inline comment —
        confirm against the feature extractor), pitch/loudness ``(n_frames,)``,
        audio ``(n_samples,)``.
        """
        audio_path, wavlm_path, pitch_path, ld_path = self.metadata[index]
        audio = self.load_wav(audio_path)
        # map_location='cpu' so features saved from a GPU still load inside
        # CPU-only dataloader workers; detach+cpu handles tensors that are on
        # a device or carry autograd history (plain .numpy() would raise).
        wavlm = torch.load(wavlm_path, map_location='cpu')
        if isinstance(wavlm, torch.Tensor):
            wavlm = wavlm.detach().cpu().numpy().T  # (1024, T)
        else:
            wavlm = np.squeeze(wavlm)
        pitch = np.load(pitch_path)
        ld = np.load(ld_path)
        # WavLM frames per excerpt: half of n_frames (20 ms vs 10 ms hop).
        wavlm_frames = int(self.n_frames / 2)
        assert pitch.shape[0] == ld.shape[0], f'{audio_path}: Length Mismatch: pitch length ({pitch.shape[0]}), ld length ({ld.shape[0]})'
        # Align features, the hop size for wavlm is 20 ms, while the hop size for pitch/ld is 10 ms.
        seq_len = wavlm.shape[-1] * 2
        if seq_len > pitch.shape[0]:
            p = seq_len - pitch.shape[0]
            pitch = np.pad(pitch, (0, p), mode='edge')
            ld = np.pad(ld, (0, p), mode='edge')
        else:
            pitch = pitch[:seq_len]
            ld = ld[:seq_len]
        # To ensure upsampling/downsampling will be processed in a right way for full signals:
        # pad or trim the waveform to exactly seq_len * hop_size samples.
        p = seq_len * self.hop_size - audio.shape[-1]
        if p > 0:
            audio = np.pad(audio, (0, p), mode='reflect')
        else:
            audio = audio[:seq_len * self.hop_size]
        if audio.shape[0] >= self.n_samples:
            # Long enough: take a random excerpt, aligned on a WavLM frame
            # boundary so all three feature streams stay in sync.
            pos = random.randint(0, wavlm.shape[-1] - wavlm_frames)
            wavlm = wavlm[:, pos:pos + wavlm_frames]
            pitch = pitch[pos * 2:pos * 2 + self.n_frames]
            ld = ld[pos * 2:pos * 2 + self.n_frames]
            audio = audio[pos * 2 * self.hop_size:(pos * 2 + self.n_frames) * self.hop_size]
        else:
            # Too short: edge-pad every stream up to the excerpt length.
            wavlm = np.pad(wavlm, ((0, 0), (0, wavlm_frames - wavlm.shape[-1])), mode='edge')
            pitch = np.pad(pitch, (0, self.n_frames - pitch.shape[0]), mode='edge')
            ld = np.pad(ld, (0, self.n_frames - ld.shape[0]), mode='edge')
            audio = np.pad(audio, (0, self.n_samples - audio.shape[0]), mode='edge')
        assert audio.shape[0] == self.n_samples, f'{audio_path}: audio length is not enough, {wavlm.shape}, {audio.shape}, {p}'
        assert pitch.shape[0] == self.n_frames, f'{audio_path}: pitch length is not enough, {wavlm.shape}, {pitch.shape}, {self.n_frames}'
        assert ld.shape[0] == self.n_frames, f'{audio_path}: ld length is not enough, {wavlm.shape}, {ld.shape}, {self.n_frames}'
        assert wavlm.shape[-1] == wavlm_frames, f'{audio_path}: wavlm length is not enough, {wavlm.shape}, {self.n_frames}'
        return (torch.from_numpy(wavlm).to(dtype=torch.float), torch.from_numpy(pitch).to(dtype=torch.float), torch.from_numpy(ld).to(dtype=torch.float)), torch.from_numpy(audio).to(dtype=torch.float)