import numpy as np
import torch
import torch.nn.functional as F

from modules.base import BaseModule
from modules.layers import Conv1dWithInitialization
from modules.upsampling import UpsamplingBlock as UBlock
from modules.downsampling import DownsamplingBlock as DBlock
from modules.linear_modulation import FeatureWiseLinearModulation as FiLM
from modules.nhv import NeuralHomomorphicVocoder

device_str = 'cuda' if torch.cuda.is_available() else 'cpu'


class SVCNN(BaseModule):
    """
    SVCNN is a fully-convolutional conditional waveform generator for singing
    voice conversion. Its upsampling/downsampling/FiLM backbone follows the
    WaveGrad architecture ("WaveGrad: Estimating Gradients for Waveform
    Generation", https://arxiv.org/pdf/2009.00713.pdf). Instead of WaveGrad's
    noised-signal conditioning, the upsampling stream is driven by WavLM
    content features and modulated by FiLM statistics computed from loudness
    and from harmonic source signals produced by a Neural Homomorphic
    Vocoder (NHV) module.
    """
    def __init__(self, config):
        super(SVCNN, self).__init__()

        # Construct NHV module (harmonic source generator).
        self.hop_size = config.data_config.hop_size
        self.noise_std = config.model_config.nhv_noise_std
        self.nhv_cat_type = config.model_config.nhv_cat_type
        self.harmonic_type = config.model_config.harmonic_type
        self.nhv = NeuralHomomorphicVocoder(
            fs=config.data_config.sampling_rate,
            hop_size=self.hop_size,
            in_channels=config.model_config.nhv_inchannels,
            fmin=80,
            fmax=7600
        )

        # Build the upsampling branch (conditioning features -> signal).
        self.ublock_preconv = Conv1dWithInitialization(
            in_channels=config.model_config.nhv_inchannels - 1,
            out_channels=config.model_config.upsampling_preconv_out_channels,
            kernel_size=3,
            stride=1,
            padding=1
        )
        upsampling_in_sizes = [config.model_config.upsampling_preconv_out_channels] \
            + config.model_config.upsampling_out_channels[:-1]
        self.ublocks = torch.nn.ModuleList([
            UBlock(
                in_channels=in_size,
                out_channels=out_size,
                factor=factor,
                dilations=dilations
            ) for in_size, out_size, factor, dilations in zip(
                upsampling_in_sizes,
                config.model_config.upsampling_out_channels,
                config.model_config.factors,
                config.model_config.upsampling_dilations
            )
        ])
        self.ublock_postconv = Conv1dWithInitialization(
            in_channels=config.model_config.upsampling_out_channels[-1],
            out_channels=1,
            kernel_size=3,
            stride=1,
            padding=1
        )

        # Build the downsampling branches (starting from audio-rate signals).
        self.ld_dblock_preconv = Conv1dWithInitialization(
            in_channels=1,
            out_channels=config.model_config.downsampling_preconv_out_channels,
            kernel_size=5,
            stride=1,
            padding=2
        )
        # num_harmonic must match the harmonic_type selection in forward():
        # 1 channel for types 0/1, 2 channels for type 2.
        self.pitch_dblock_preconv = Conv1dWithInitialization(
            in_channels=config.model_config.num_harmonic,
            out_channels=config.model_config.downsampling_preconv_out_channels,
            kernel_size=5,
            stride=1,
            padding=2
        )
        downsampling_in_sizes = [config.model_config.downsampling_preconv_out_channels] \
            + config.model_config.downsampling_out_channels[:-1]
        self.ld_dblocks = torch.nn.ModuleList([
            DBlock(
                in_channels=in_size,
                out_channels=out_size,
                factor=factor,
                dilations=dilations
            ) for in_size, out_size, factor, dilations in zip(
                downsampling_in_sizes,
                config.model_config.downsampling_out_channels,
                config.model_config.factors[1:][::-1],
                config.model_config.downsampling_dilations
            )
        ])
        self.pitch_dblocks = torch.nn.ModuleList([
            DBlock(
                in_channels=in_size,
                out_channels=out_size,
                factor=factor,
                dilations=dilations
            ) for in_size, out_size, factor, dilations in zip(
                downsampling_in_sizes,
                config.model_config.downsampling_out_channels,
                config.model_config.factors[1:][::-1],
                config.model_config.downsampling_dilations
            )
        ])

        # Build FiLM connections (in order of the downscaling stream).
        # The first in-size (24) must equal downsampling_preconv_out_channels,
        # since the first FiLM consumes the preconv output directly.
        film_in_sizes = [24] + config.model_config.downsampling_out_channels
        film_out_sizes = config.model_config.upsampling_out_channels[::-1]
        film_factors = [1] + config.model_config.factors[1:][::-1]
        self.ld_films = torch.nn.ModuleList([
            FiLM(
                in_channels=in_size,
                out_channels=out_size,
                # Cumulative downscaling factor, for proper positional
                # encoding initialization.
                input_dscaled_by=np.prod(film_factors[:i + 1])
            ) for i, (in_size, out_size) in enumerate(
                zip(film_in_sizes, film_out_sizes)
            )
        ])
        self.pitch_films = torch.nn.ModuleList([
            FiLM(
                in_channels=in_size,
                out_channels=out_size,
                input_dscaled_by=np.prod(film_factors[:i + 1])
            ) for i, (in_size, out_size) in enumerate(
                zip(film_in_sizes, film_out_sizes)
            )
        ])

    def forward(self, wavlm, pitch, ld):
        """
        Computes the forward pass of the network.
        :param wavlm (torch.Tensor): WavLM content features of shape [B, 1024, T]
        :param pitch (torch.Tensor): frame-level pitch (F0) of shape [B, T]
        :param ld (torch.Tensor): frame-level loudness of shape [B, T]
        :return (torch.Tensor): generated waveform of shape [B, T * hop_size]
        """
        # Prepare inputs.
        # wavlm: [B, 1024, T], pitch: [B, T], ld: [B, T]
        assert len(wavlm.shape) == 3   # B, n_emb, T
        pitch = pitch.unsqueeze(1)
        ld = ld.unsqueeze(1)
        assert len(pitch.shape) == 3   # B, 1, T
        assert len(ld.shape) == 3      # B, 1, T

        # Generate NHV conditions.
        if self.nhv_cat_type == 'PLS':
            nhv_ld = ld
            nhv_wavlm = F.interpolate(wavlm, size=nhv_ld.shape[2], mode='nearest')
            nhv_conditions = torch.cat((nhv_ld, nhv_wavlm), dim=1)  # B, (1+1024), T
        else:
            raise NameError(f'Unknown nhv cat type: {self.nhv_cat_type}')
        nhv_conditions = nhv_conditions.transpose(1, 2)  # B, T, n_emb

        # Generate NHV harmonic signals from noise, conditions and pitch.
        nhv_noise = torch.normal(
            0.0, self.noise_std,
            (nhv_conditions.size(0), 1, nhv_conditions.size(1) * self.hop_size)
        ).to(nhv_conditions.device)
        nhv_pitch = pitch.transpose(1, 2)  # B, T, 1
        raw_harmonic, filtered_harmonic = self.nhv(nhv_noise, nhv_conditions, nhv_pitch)

        # Linearly interpolate loudness up to the audio rate.
        upsampled_ld = F.interpolate(ld, scale_factor=self.hop_size, mode='linear')

        # Select which harmonic signal(s) condition the pitch stream.
        if self.harmonic_type == 0:
            upsampled_pitch = raw_harmonic
        elif self.harmonic_type == 1:
            upsampled_pitch = filtered_harmonic
        elif self.harmonic_type == 2:
            upsampled_pitch = torch.cat((raw_harmonic, filtered_harmonic), dim=1)
        else:
            raise NameError(f'Unknown harmonic type: {self.harmonic_type}')

        # Downsampling stream + FiLM statistics calculation (loudness).
        ld_statistics = []
        dblock_outputs = self.ld_dblock_preconv(upsampled_ld)
        scale, shift = self.ld_films[0](x=dblock_outputs)
        ld_statistics.append([scale, shift])
        for dblock, film in zip(self.ld_dblocks, self.ld_films[1:]):
            dblock_outputs = dblock(dblock_outputs)
            scale, shift = film(x=dblock_outputs)
            ld_statistics.append([scale, shift])
        ld_statistics = ld_statistics[::-1]

        # Downsampling stream + FiLM statistics calculation (harmonics).
        pitch_statistics = []
        dblock_outputs = self.pitch_dblock_preconv(upsampled_pitch)
        scale, shift = self.pitch_films[0](x=dblock_outputs)
        pitch_statistics.append([scale, shift])
        for dblock, film in zip(self.pitch_dblocks, self.pitch_films[1:]):
            dblock_outputs = dblock(dblock_outputs)
            scale, shift = film(x=dblock_outputs)
            pitch_statistics.append([scale, shift])
        pitch_statistics = pitch_statistics[::-1]

        # Upsampling stream: WavLM features modulated by the summed FiLM
        # statistics of the loudness and harmonic streams.
        condition = wavlm
        ublock_outputs = self.ublock_preconv(condition)
        for i, ublock in enumerate(self.ublocks):
            ld_scale, ld_shift = ld_statistics[i]
            pitch_scale, pitch_shift = pitch_statistics[i]
            ublock_outputs = ublock(
                x=ublock_outputs,
                scale=ld_scale + pitch_scale,
                shift=ld_shift + pitch_shift
            )
        outputs = self.ublock_postconv(ublock_outputs)
        return outputs.squeeze(1)
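

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). The config below is a hypothetical
# SimpleNamespace stand-in for the project's real config files; every value
# in it (channel sizes, factors, dilations, noise std) is an assumption made
# for illustration and should be replaced by the repository's actual
# settings. The only hard constraints encoded here are the ones the model
# itself imposes: np.prod(factors) == hop_size, nhv_inchannels == 1 + 1024,
# the first FiLM in-size (24) == downsampling_preconv_out_channels, and
# num_harmonic matching the harmonic_type selection.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from types import SimpleNamespace

    model_config = SimpleNamespace(
        nhv_noise_std=0.003,                                 # assumed
        nhv_cat_type='PLS',
        harmonic_type=2,                                     # raw + filtered
        nhv_inchannels=1025,                                 # 1 loudness + 1024 WavLM dims
        num_harmonic=2,                                      # matches harmonic_type == 2
        upsampling_preconv_out_channels=768,                 # assumed
        upsampling_out_channels=[512, 512, 256, 128, 128],   # assumed
        upsampling_dilations=[[1, 2, 1, 2]] * 5,             # assumed
        downsampling_preconv_out_channels=24,                # must equal film_in_sizes[0]
        downsampling_out_channels=[128, 128, 256, 512],      # assumed
        downsampling_dilations=[[1, 2, 4]] * 4,              # assumed
        factors=[5, 5, 3, 2, 2],                             # prod == hop_size == 300
    )
    data_config = SimpleNamespace(sampling_rate=24000, hop_size=300)
    config = SimpleNamespace(model_config=model_config, data_config=data_config)

    model = SVCNN(config).to(device_str)
    B, T = 2, 32                                             # batch size, frame count
    wavlm = torch.randn(B, 1024, T, device=device_str)       # WavLM features
    pitch = 100.0 + 300.0 * torch.rand(B, T, device=device_str)  # F0 in Hz
    ld = torch.rand(B, T, device=device_str)                 # loudness
    audio = model(wavlm, pitch, ld)
    print(audio.shape)                                       # expected: [B, T * hop_size]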