Clonar-voz-guaratuba / TTS /vocoder /models /melgan_discriminator.py
Shadhil's picture
voice-clone with single audio sample input
9b2107c
import numpy as np
from torch import nn
from torch.nn.utils.parametrizations import weight_norm
class MelganDiscriminator(nn.Module):
def __init__(
self,
in_channels=1,
out_channels=1,
kernel_sizes=(5, 3),
base_channels=16,
max_channels=1024,
downsample_factors=(4, 4, 4, 4),
groups_denominator=4,
):
super().__init__()
self.layers = nn.ModuleList()
layer_kernel_size = np.prod(kernel_sizes)
layer_padding = (layer_kernel_size - 1) // 2
# initial layer
self.layers += [
nn.Sequential(
nn.ReflectionPad1d(layer_padding),
weight_norm(nn.Conv1d(in_channels, base_channels, layer_kernel_size, stride=1)),
nn.LeakyReLU(0.2, inplace=True),
)
]
# downsampling layers
layer_in_channels = base_channels
for downsample_factor in downsample_factors:
layer_out_channels = min(layer_in_channels * downsample_factor, max_channels)
layer_kernel_size = downsample_factor * 10 + 1
layer_padding = (layer_kernel_size - 1) // 2
layer_groups = layer_in_channels // groups_denominator
self.layers += [
nn.Sequential(
weight_norm(
nn.Conv1d(
layer_in_channels,
layer_out_channels,
kernel_size=layer_kernel_size,
stride=downsample_factor,
padding=layer_padding,
groups=layer_groups,
)
),
nn.LeakyReLU(0.2, inplace=True),
)
]
layer_in_channels = layer_out_channels
# last 2 layers
layer_padding1 = (kernel_sizes[0] - 1) // 2
layer_padding2 = (kernel_sizes[1] - 1) // 2
self.layers += [
nn.Sequential(
weight_norm(
nn.Conv1d(
layer_out_channels,
layer_out_channels,
kernel_size=kernel_sizes[0],
stride=1,
padding=layer_padding1,
)
),
nn.LeakyReLU(0.2, inplace=True),
),
weight_norm(
nn.Conv1d(
layer_out_channels, out_channels, kernel_size=kernel_sizes[1], stride=1, padding=layer_padding2
)
),
]
def forward(self, x):
feats = []
for layer in self.layers:
x = layer(x)
feats.append(x)
return x, feats