# NeuCoSVC-Colab / dataset / dataset.py
# Uploaded via huggingface_hub by kevinwang676 (commit cfdc687, 3.88 kB)
import numpy as np
import torch
from torch.utils.data import Dataset
import json
import random
from pathlib import Path
import soundfile as sf
class SVCDataset(Dataset):
    """Singing-voice-conversion dataset.

    Each item pairs a raw-audio crop with three pre-computed, frame-aligned
    feature tracks loaded from disk:

    * WavLM features — tensor/array of shape (1024, T), hop size 20 ms
    * pitch          — 1-D array, hop size 10 ms
    * loudness (ld)  — 1-D array, hop size 10 ms

    Because the WavLM hop (20 ms) is exactly twice the pitch/loudness hop
    (10 ms), one WavLM frame corresponds to two pitch/ld frames and to
    ``2 * hop_size`` audio samples.  ``__getitem__`` enforces this alignment,
    then returns a fixed-length random crop (or edge-padded full clip).

    NOTE(review): the code assumes ``n_samples`` is an even multiple of
    ``hop_size`` (so the final crop is exactly ``n_samples`` long — the
    asserts in ``__getitem__`` fire otherwise).
    """

    def __init__(self, root, n_samples, sampling_rate, hop_size, mode):
        """Load the ``{mode}.json`` manifest under ``root``.

        Args:
            root: dataset directory containing ``{mode}.json``.
            n_samples: crop length in audio samples returned per item.
            sampling_rate: expected sampling rate of every audio file (Hz).
            hop_size: hop size, in samples, of the pitch/loudness tracks.
            mode: manifest name, e.g. ``"train"`` or ``"valid"``.

        The manifest is a JSON list of 4-element rows:
        ``[audio_path, wavlm_path, pitch_path, ld_path]``.
        """
        self.root = Path(root)
        self.n_samples = n_samples
        self.sampling_rate = sampling_rate
        self.hop_size = hop_size
        # Integer division is exact; int(a/b) could drift for huge values
        # because it round-trips through a float.
        self.n_frames = n_samples // hop_size
        with open(self.root / f"{mode}.json") as file:
            metadata = json.load(file)
        # Unpacking validates that every row has exactly four entries,
        # exactly as the original per-row append loop did.
        self.metadata = [
            [audio_path, wavlm_path, pitch_path, ld_path]
            for audio_path, wavlm_path, pitch_path, ld_path in metadata
        ]
        print(mode, 'n_samples:', n_samples, 'metadata:', len(self.metadata))
        random.shuffle(self.metadata)

    def load_wav(self, audio_path):
        """Read an audio file, verify its rate, and peak-normalize if it clips.

        Returns the waveform as a float array in [-1, 1] (only rescaled when
        the absolute peak exceeds 1.0).
        """
        wav, fs = sf.read(audio_path)
        assert fs == self.sampling_rate, f'Audio {audio_path} sampling rate is not {self.sampling_rate} Hz.'
        peak = np.abs(wav).max()
        if peak > 1.0:
            wav /= peak
        return wav

    def __len__(self):
        """Number of manifest rows."""
        return len(self.metadata)

    def __getitem__(self, index):
        """Return ``((wavlm, pitch, ld), audio)`` float tensors for one row.

        Shapes: wavlm (1024, n_frames // 2), pitch/ld (n_frames,),
        audio (n_samples,).
        """
        audio_path, wavlm_path, pitch_path, ld_path = self.metadata[index]
        audio = self.load_wav(audio_path)
        # map_location keeps loading CPU-safe even if the features were
        # serialized from a CUDA tensor.
        wavlm = torch.load(wavlm_path, map_location='cpu')
        if isinstance(wavlm, torch.Tensor):
            wavlm = wavlm.numpy().T  # (1024, T)
        else:
            wavlm = np.squeeze(wavlm)
        pitch = np.load(pitch_path)
        ld = np.load(ld_path)
        wavlm_frames = int(self.n_frames / 2)
        assert pitch.shape[0] == ld.shape[0], f'{audio_path}: Length Mismatch: pitch length ({pitch.shape[0]}), ld length ({ld.shape[0]})'
        # Align features: the hop size for wavlm is 20 ms, while the hop size
        # for pitch/ld is 10 ms, so pitch/ld must be exactly twice as long.
        seq_len = wavlm.shape[-1] * 2
        if seq_len > pitch.shape[0]:
            p = seq_len - pitch.shape[0]
            pitch = np.pad(pitch, (0, p), mode='edge')
            ld = np.pad(ld, (0, p), mode='edge')
        else:
            pitch = pitch[:seq_len]
            ld = ld[:seq_len]
        # Pad or trim the audio so its length is exactly seq_len * hop_size,
        # ensuring up/downsampling lines up with the feature tracks.
        p = seq_len * self.hop_size - audio.shape[-1]
        if p > 0:
            # NOTE(review): reflect-padding requires p < len(audio) — assumed
            # to hold for real clips; confirm for very short files.
            audio = np.pad(audio, (0, p), mode='reflect')
        else:
            audio = audio[:seq_len * self.hop_size]
        if audio.shape[0] >= self.n_samples:
            # Clip is long enough: take a random aligned crop.  `pos` indexes
            # WavLM frames; pitch/ld use 2*pos, audio uses 2*pos*hop_size.
            pos = random.randint(0, wavlm.shape[-1] - wavlm_frames)
            wavlm = wavlm[:, pos:pos+wavlm_frames]
            pitch = pitch[pos*2:pos*2+self.n_frames]
            ld = ld[pos*2:pos*2+self.n_frames]
            audio = audio[pos*2*self.hop_size:(pos*2+self.n_frames)*self.hop_size]
        else:
            # Clip is too short: edge-pad every track up to the crop length.
            wavlm = np.pad(wavlm, ((0, 0), (0, wavlm_frames - wavlm.shape[-1])), mode='edge')
            pitch = np.pad(pitch, (0, self.n_frames-pitch.shape[0]), mode='edge')
            ld = np.pad(ld, (0, self.n_frames-ld.shape[0]), mode='edge')
            audio = np.pad(audio, (0, self.n_samples-audio.shape[0]), mode='edge')
        assert audio.shape[0] == self.n_samples, f'{audio_path}: audio length is not enough, {wavlm.shape}, {audio.shape}, {p}'
        assert pitch.shape[0] == self.n_frames, f'{audio_path}: pitch length is not enough, {wavlm.shape}, {pitch.shape}, {self.n_frames}'
        assert ld.shape[0] == self.n_frames, f'{audio_path}: ld length is not enough, {wavlm.shape}, {ld.shape}, {self.n_frames}'
        assert wavlm.shape[-1] == wavlm_frames, f'{audio_path}: wavlm length is not enough, {wavlm.shape}, {self.n_frames}'
        return (torch.from_numpy(wavlm).to(dtype=torch.float), torch.from_numpy(pitch).to(dtype=torch.float), torch.from_numpy(ld).to(dtype=torch.float)), torch.from_numpy(audio).to(dtype=torch.float)