Shadhil's picture
voice-clone with single audio sample input
9b2107c
import torch
from torch import nn
from torch.nn import functional as F
class Linear(nn.Module):
"""Linear layer with a specific initialization.
Args:
in_features (int): number of channels in the input tensor.
out_features (int): number of channels in the output tensor.
bias (bool, optional): enable/disable bias in the layer. Defaults to True.
init_gain (str, optional): method to compute the gain in the weight initializtion based on the nonlinear activation used afterwards. Defaults to 'linear'.
"""
def __init__(self, in_features, out_features, bias=True, init_gain="linear"):
super().__init__()
self.linear_layer = torch.nn.Linear(in_features, out_features, bias=bias)
self._init_w(init_gain)
def _init_w(self, init_gain):
torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=torch.nn.init.calculate_gain(init_gain))
def forward(self, x):
return self.linear_layer(x)
class LinearBN(nn.Module):
"""Linear layer with Batch Normalization.
x -> linear -> BN -> o
Args:
in_features (int): number of channels in the input tensor.
out_features (int ): number of channels in the output tensor.
bias (bool, optional): enable/disable bias in the linear layer. Defaults to True.
init_gain (str, optional): method to set the gain for weight initialization. Defaults to 'linear'.
"""
def __init__(self, in_features, out_features, bias=True, init_gain="linear"):
super().__init__()
self.linear_layer = torch.nn.Linear(in_features, out_features, bias=bias)
self.batch_normalization = nn.BatchNorm1d(out_features, momentum=0.1, eps=1e-5)
self._init_w(init_gain)
def _init_w(self, init_gain):
torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=torch.nn.init.calculate_gain(init_gain))
def forward(self, x):
"""
Shapes:
x: [T, B, C] or [B, C]
"""
out = self.linear_layer(x)
if len(out.shape) == 3:
out = out.permute(1, 2, 0)
out = self.batch_normalization(out)
if len(out.shape) == 3:
out = out.permute(2, 0, 1)
return out
class Prenet(nn.Module):
"""Tacotron specific Prenet with an optional Batch Normalization.
Note:
Prenet with BN improves the model performance significantly especially
if it is enabled after learning a diagonal attention alignment with the original
prenet. However, if the target dataset is high quality then it also works from
the start. It is also suggested to disable dropout if BN is in use.
prenet_type == "original"
x -> [linear -> ReLU -> Dropout]xN -> o
prenet_type == "bn"
x -> [linear -> BN -> ReLU -> Dropout]xN -> o
Args:
in_features (int): number of channels in the input tensor and the inner layers.
prenet_type (str, optional): prenet type "original" or "bn". Defaults to "original".
prenet_dropout (bool, optional): dropout rate. Defaults to True.
dropout_at_inference (bool, optional): use dropout at inference. It leads to a better quality for some models.
out_features (list, optional): List of output channels for each prenet block.
It also defines number of the prenet blocks based on the length of argument list.
Defaults to [256, 256].
bias (bool, optional): enable/disable bias in prenet linear layers. Defaults to True.
"""
# pylint: disable=dangerous-default-value
def __init__(
self,
in_features,
prenet_type="original",
prenet_dropout=True,
dropout_at_inference=False,
out_features=[256, 256],
bias=True,
):
super().__init__()
self.prenet_type = prenet_type
self.prenet_dropout = prenet_dropout
self.dropout_at_inference = dropout_at_inference
in_features = [in_features] + out_features[:-1]
if prenet_type == "bn":
self.linear_layers = nn.ModuleList(
[LinearBN(in_size, out_size, bias=bias) for (in_size, out_size) in zip(in_features, out_features)]
)
elif prenet_type == "original":
self.linear_layers = nn.ModuleList(
[Linear(in_size, out_size, bias=bias) for (in_size, out_size) in zip(in_features, out_features)]
)
def forward(self, x):
for linear in self.linear_layers:
if self.prenet_dropout:
x = F.dropout(F.relu(linear(x)), p=0.5, training=self.training or self.dropout_at_inference)
else:
x = F.relu(linear(x))
return x