# spkrec-ecapa-voxceleb-kha/custom_model.py
"""
This file contains a very simple TDNN module to use for speaker-id.
To replace this model, change the `!new:` tag in the hyperparameter file
to refer to a built-in SpeechBrain model or another file containing
a custom PyTorch module.
Authors
* Nauman Dawalatabad 2020
* Mirco Ravanelli 2020
"""
import torch # noqa: F401
import torch.nn as nn
import speechbrain as sb
from speechbrain.nnet.CNN import Conv1d
from speechbrain.nnet.linear import Linear
from speechbrain.nnet.normalization import BatchNorm1d
from speechbrain.nnet.pooling import StatisticsPooling


class Xvector(torch.nn.Module):
    """This model extracts x-vectors for speaker recognition.

    Arguments
    ---------
    device : str
        The device to place this model on (e.g. "cpu" or "cuda").
    activation : torch class
        A class for constructing the activation layers.
    tdnn_blocks : int
        Number of time-delay neural network (TDNN) layers.
    tdnn_channels : list of ints
        Output channels for each TDNN layer.
    tdnn_kernel_sizes : list of ints
        List of kernel sizes for each TDNN layer.
    tdnn_dilations : list of ints
        List of dilations for the kernels in each TDNN layer.
    lin_neurons : int
        Number of neurons in the linear layers.
    in_channels : int
        Number of channels expected in the input.

    Example
    -------
    >>> compute_xvect = Xvector()
    >>> input_feats = torch.rand([5, 10, 40])
    >>> outputs = compute_xvect(input_feats)
    >>> outputs.shape
    torch.Size([5, 1, 512])
    """

def __init__(
self,
device="cpu",
activation=torch.nn.LeakyReLU,
tdnn_blocks=5,
tdnn_channels=[512, 512, 512, 512, 1500],
tdnn_kernel_sizes=[5, 3, 3, 1, 1],
tdnn_dilations=[1, 2, 3, 1, 1],
lin_neurons=512,
in_channels=40,
):
super().__init__()
self.blocks = nn.ModuleList()
        # The TDNN is a stack of convolutional layers with the given kernel
        # sizes and dilation factors. Here we loop over all the convolutional
        # blocks we want to add. Note that batch normalization is applied
        # after the activation function in this case; this slightly improves
        # speaker-id performance.
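        # With the default arguments, the channel dimension evolves as
        # 40 -> 512 -> 512 -> 512 -> 512 -> 1500 across the five blocks.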
for block_index in range(tdnn_blocks):
out_channels = tdnn_channels[block_index]
self.blocks.extend(
[
Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=tdnn_kernel_sizes[block_index],
dilation=tdnn_dilations[block_index],
),
activation(),
BatchNorm1d(input_size=out_channels),
]
)
in_channels = tdnn_channels[block_index]

        # Statistics pooling. It converts a variable-length tensor into a
        # fixed-length one by computing the mean and the standard deviation
        # over the time dimension.
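        # With the default channels, an input of shape (batch, time, 1500)
        # becomes (batch, 1, 3000): the mean and std concatenated.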
self.blocks.append(StatisticsPooling())
# Final linear transformation.
self.blocks.append(
Linear(
                input_size=out_channels * 2,  # mean + std
n_neurons=lin_neurons,
bias=True,
combine_dims=False,
)
)

    def forward(self, x, lens=None):
        """Returns the x-vectors.

        Arguments
        ---------
        x : torch.Tensor
            The input features for computation.
        lens : torch.Tensor
            The lengths of the corresponding inputs.

        Returns
        -------
        x : torch.Tensor
            The computed x-vectors.
        """
        for layer in self.blocks:
            try:
                # Layers that accept a `lengths` argument (e.g. pooling)
                # use it to handle padded frames.
                x = layer(x, lengths=lens)
            except TypeError:
                # Layers without a `lengths` argument are called as usual.
                x = layer(x)
return x


class Classifier(sb.nnet.containers.Sequential):
    """This class implements the final MLP on top of the x-vector features.

    Arguments
    ---------
    input_shape : tuple
        Expected shape of an example input.
    activation : torch class
        A class for constructing the activation layers.
    lin_blocks : int
        Number of linear layers.
    lin_neurons : int
        Number of neurons in the linear layers.
    out_neurons : int
        Number of output neurons.

    Example
    -------
    >>> input_feats = torch.rand([5, 10, 40])
    >>> compute_xvect = Xvector()
    >>> xvects = compute_xvect(input_feats)
    >>> classify = Classifier(input_shape=xvects.shape)
    >>> output = classify(xvects)
    >>> output.shape
    torch.Size([5, 1, 1211])
    """

def __init__(
self,
input_shape,
activation=torch.nn.LeakyReLU,
lin_blocks=1,
lin_neurons=512,
out_neurons=1211,
):
super().__init__(input_shape=input_shape)
self.append(activation(), layer_name="act")
self.append(sb.nnet.normalization.BatchNorm1d, layer_name="norm")
if lin_blocks > 0:
self.append(sb.nnet.containers.Sequential, layer_name="DNN")
# Adding fully-connected layers
for block_index in range(lin_blocks):
block_name = f"block_{block_index}"
self.DNN.append(
sb.nnet.containers.Sequential, layer_name=block_name
)
self.DNN[block_name].append(
sb.nnet.linear.Linear,
n_neurons=lin_neurons,
bias=True,
layer_name="linear",
)
self.DNN[block_name].append(activation(), layer_name="act")
self.DNN[block_name].append(
sb.nnet.normalization.BatchNorm1d, layer_name="norm"
)
# Final Softmax classifier
self.append(
sb.nnet.linear.Linear, n_neurons=out_neurons, layer_name="out"
)
self.append(
sb.nnet.activations.Softmax(apply_log=True), layer_name="softmax"
)
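

# Minimal usage sketch: runs the x-vector extractor and the classifier on
# random 40-dimensional features (a batch of 5 utterances, 10 frames each)
# with the default 1211 output classes, matching the docstring examples above.
if __name__ == "__main__":
    input_feats = torch.rand([5, 10, 40])      # (batch, time, features)
    compute_xvect = Xvector(in_channels=40)
    xvects = compute_xvect(input_feats)        # expected shape: (5, 1, 512)
    classify = Classifier(input_shape=xvects.shape, out_neurons=1211)
    log_posteriors = classify(xvects)          # expected shape: (5, 1, 1211)
    print(xvects.shape, log_posteriors.shape)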