import json
import random
from pathlib import Path

import numpy as np
import soundfile as sf
import torch
from torch.utils.data import Dataset
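
# Note: the layout of each {mode}.json is inferred from the unpacking in
# __init__ below (an assumption, not documented in the source): a list of
# four-element entries,
#   [[audio_path, wavlm_path, pitch_path, ld_path], ...]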


class SVCDataset(Dataset):
    def __init__(self, root, n_samples, sampling_rate, hop_size, mode):
        self.root = Path(root)
        self.n_samples = n_samples
        self.sampling_rate = sampling_rate
        self.hop_size = hop_size
        self.n_frames = n_samples // hop_size
        with open(self.root / f"{mode}.json") as file:
            metadata = json.load(file)
        # Each entry lists the paths to the raw audio and its precomputed
        # WavLM, pitch, and loudness features.
        self.metadata = [
            [audio_path, wavlm_path, pitch_path, ld_path]
            for audio_path, wavlm_path, pitch_path, ld_path in metadata
        ]
        print(mode, 'n_samples:', n_samples, 'metadata:', len(self.metadata))
        random.shuffle(self.metadata)

    def load_wav(self, audio_path):
        wav, fs = sf.read(audio_path)
        assert fs == self.sampling_rate, f'Audio {audio_path} sampling rate is not {self.sampling_rate} Hz.'
        # Normalize only when the signal would otherwise clip.
        peak = np.abs(wav).max()
        if peak > 1.0:
            wav /= peak
        return wav

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, index):
        audio_path, wavlm_path, pitch_path, ld_path = self.metadata[index]
        audio = self.load_wav(audio_path)
        wavlm = torch.load(wavlm_path)
        if isinstance(wavlm, torch.Tensor):
            wavlm = wavlm.numpy().T  # (1024, T)
        else:
            wavlm = np.squeeze(wavlm)
        pitch = np.load(pitch_path)
        ld = np.load(ld_path)
        wavlm_frames = self.n_frames // 2
        assert pitch.shape[0] == ld.shape[0], f'{audio_path}: length mismatch: pitch ({pitch.shape[0]}) vs ld ({ld.shape[0]})'
        # Align features: the WavLM hop size is 20 ms, while the pitch/loudness
        # hop size is 10 ms, so each WavLM frame spans two pitch/ld frames.
        seq_len = wavlm.shape[-1] * 2
        if seq_len > pitch.shape[0]:
            p = seq_len - pitch.shape[0]
            pitch = np.pad(pitch, (0, p), mode='edge')
            ld = np.pad(ld, (0, p), mode='edge')
        else:
            pitch = pitch[:seq_len]
            ld = ld[:seq_len]
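        # Worked example (values assumed, not from the source): at 16 kHz with
        # hop_size = 160 (10 ms), a clip with 50 WavLM frames gives seq_len = 100,
        # so pitch/ld are padded or trimmed to 100 frames and the audio (below)
        # to 100 * 160 = 16000 samples.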
        # Pad or trim the audio to exactly seq_len * hop_size samples so that
        # up/downsampling inside the model stays aligned with the feature frames.
        p = seq_len * self.hop_size - audio.shape[-1]
        if p > 0:
            audio = np.pad(audio, (0, p), mode='reflect')
        else:
            audio = audio[:seq_len * self.hop_size]
        if audio.shape[0] >= self.n_samples:
            # Long enough: take a random crop of wavlm_frames WavLM frames and
            # the matching pitch/loudness frames and audio samples.
            pos = random.randint(0, wavlm.shape[-1] - wavlm_frames)
            wavlm = wavlm[:, pos:pos + wavlm_frames]
            pitch = pitch[pos * 2:pos * 2 + self.n_frames]
            ld = ld[pos * 2:pos * 2 + self.n_frames]
            audio = audio[pos * 2 * self.hop_size:(pos * 2 + self.n_frames) * self.hop_size]
        else:
            # Too short: edge-pad every stream up to the target length.
            wavlm = np.pad(wavlm, ((0, 0), (0, wavlm_frames - wavlm.shape[-1])), mode='edge')
            pitch = np.pad(pitch, (0, self.n_frames - pitch.shape[0]), mode='edge')
            ld = np.pad(ld, (0, self.n_frames - ld.shape[0]), mode='edge')
            audio = np.pad(audio, (0, self.n_samples - audio.shape[0]), mode='edge')
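        # Crop example (values assumed): with n_frames = 100 (wavlm_frames = 50)
        # and pos = 10, the crop covers wavlm[:, 10:60], pitch[20:120], ld[20:120],
        # and audio[20 * hop_size : 120 * hop_size], i.e. one aligned window.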
        assert audio.shape[0] == self.n_samples, f'{audio_path}: audio length mismatch, {wavlm.shape}, {audio.shape}, {p}'
        assert pitch.shape[0] == self.n_frames, f'{audio_path}: pitch length mismatch, {wavlm.shape}, {pitch.shape}, {self.n_frames}'
        assert ld.shape[0] == self.n_frames, f'{audio_path}: ld length mismatch, {wavlm.shape}, {ld.shape}, {self.n_frames}'
        assert wavlm.shape[-1] == wavlm_frames, f'{audio_path}: wavlm length mismatch, {wavlm.shape}, {self.n_frames}'
        return (
            torch.from_numpy(wavlm).to(dtype=torch.float),
            torch.from_numpy(pitch).to(dtype=torch.float),
            torch.from_numpy(ld).to(dtype=torch.float),
        ), torch.from_numpy(audio).to(dtype=torch.float)
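

# --- Usage sketch (not part of the original file) ---
# A minimal smoke test, assuming a dataset root "data/" that contains
# train.json and the feature files it references; n_samples, sampling_rate,
# and hop_size below are illustrative values, not taken from the source.
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    dataset = SVCDataset(root="data", n_samples=16000, sampling_rate=16000,
                         hop_size=160, mode="train")
    loader = DataLoader(dataset, batch_size=4, shuffle=True)
    (wavlm, pitch, ld), audio = next(iter(loader))
    # With these values: wavlm (4, 1024, 50), pitch/ld (4, 100), audio (4, 16000).
    print(wavlm.shape, pitch.shape, ld.shape, audio.shape)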