"""This file contains a very simple TDNN module to use for speaker-id.

To replace this model, change the `!new:` tag in the hyperparameter file
to refer to a built-in SpeechBrain model or another file containing
a custom PyTorch module.
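
For example, the recipe's hyperparameter YAML can point at this file with
something like the following (illustrative only; the key name, module path,
and argument values depend on the recipe):

    embedding_model: !new:custom_model.Xvector
        in_channels: 40
        lin_neurons: 512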

Authors
 * Nauman Dawalatabad 2020
 * Mirco Ravanelli 2020
"""

import torch
import torch.nn as nn

import speechbrain as sb
from speechbrain.nnet.CNN import Conv1d
from speechbrain.nnet.linear import Linear
from speechbrain.nnet.normalization import BatchNorm1d
from speechbrain.nnet.pooling import StatisticsPooling


class Xvector(torch.nn.Module):
    """This model extracts X-vectors for speaker recognition.

    Arguments
    ---------
    device : str
        The device to place this model on (e.g. "cpu" or "cuda").
    activation : torch class
        A class for constructing the activation layers.
    tdnn_blocks : int
        Number of time-delay neural network (TDNN) layers.
    tdnn_channels : list of ints
        Output channels for each TDNN layer.
    tdnn_kernel_sizes : list of ints
        List of kernel sizes for each TDNN layer.
    tdnn_dilations : list of ints
        List of dilations for the kernels in each TDNN layer.
    lin_neurons : int
        Number of neurons in the linear layers.
    in_channels : int
        Number of channels expected in the input.

    Example
    -------
    >>> compute_xvect = Xvector()
    >>> input_feats = torch.rand([5, 10, 40])
    >>> outputs = compute_xvect(input_feats)
    >>> outputs.shape
    torch.Size([5, 1, 512])
    """

    def __init__(
        self,
        device="cpu",
        activation=torch.nn.LeakyReLU,
        tdnn_blocks=5,
        tdnn_channels=[512, 512, 512, 512, 1500],
        tdnn_kernel_sizes=[5, 3, 3, 1, 1],
        tdnn_dilations=[1, 2, 3, 1, 1],
        lin_neurons=512,
        in_channels=40,
    ):
        super().__init__()
        self.blocks = nn.ModuleList()
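
        # TDNN blocks: each block applies Conv1d, an activation, and BatchNorm1d.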
        for block_index in range(tdnn_blocks):
            out_channels = tdnn_channels[block_index]
            self.blocks.extend(
                [
                    Conv1d(
                        in_channels=in_channels,
                        out_channels=out_channels,
                        kernel_size=tdnn_kernel_sizes[block_index],
                        dilation=tdnn_dilations[block_index],
                    ),
                    activation(),
                    BatchNorm1d(input_size=out_channels),
                ]
            )
            in_channels = tdnn_channels[block_index]
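
        # Statistics pooling aggregates the frame-level features over time.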
        self.blocks.append(StatisticsPooling())
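
        # Final linear layer; the pooled output has 2 * out_channels features
        # (mean and standard deviation are concatenated).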
        self.blocks.append(
            Linear(
                input_size=out_channels * 2,
                n_neurons=lin_neurons,
                bias=True,
                combine_dims=False,
            )
        )

    def forward(self, x, lens=None):
        """Returns the x-vectors.

        Arguments
        ---------
        x : torch.Tensor
            The input features for computation.
        lens : torch.Tensor
            The lengths of the corresponding inputs.

        Returns
        -------
        The computed x-vectors.
        """
        for layer in self.blocks:
            try:
                x = layer(x, lengths=lens)
            except TypeError:
                x = layer(x)
        return x


class Classifier(sb.nnet.containers.Sequential):
    """This class implements the last MLP on top of the xvector features.

    Arguments
    ---------
    input_shape : tuple
        Expected shape of an example input.
    activation : torch class
        A class for constructing the activation layers.
    lin_blocks : int
        Number of linear layers.
    lin_neurons : int
        Number of neurons in the linear layers.
    out_neurons : int
        Number of output neurons.

    Example
    -------
    >>> input_feats = torch.rand([5, 10, 40])
    >>> compute_xvect = Xvector()
    >>> xvects = compute_xvect(input_feats)
    >>> classify = Classifier(input_shape=xvects.shape)
    >>> output = classify(xvects)
    >>> output.shape
    torch.Size([5, 1, 1211])
    """

    def __init__(
        self,
        input_shape,
        activation=torch.nn.LeakyReLU,
        lin_blocks=1,
        lin_neurons=512,
        out_neurons=1211,
    ):
        super().__init__(input_shape=input_shape)

        self.append(activation(), layer_name="act")
        self.append(sb.nnet.normalization.BatchNorm1d, layer_name="norm")

        if lin_blocks > 0:
            self.append(sb.nnet.containers.Sequential, layer_name="DNN")
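
        # Fully-connected blocks: linear -> activation -> batch norm.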
        for block_index in range(lin_blocks):
            block_name = f"block_{block_index}"
            self.DNN.append(
                sb.nnet.containers.Sequential, layer_name=block_name
            )
            self.DNN[block_name].append(
                sb.nnet.linear.Linear,
                n_neurons=lin_neurons,
                bias=True,
                layer_name="linear",
            )
            self.DNN[block_name].append(activation(), layer_name="act")
            self.DNN[block_name].append(
                sb.nnet.normalization.BatchNorm1d, layer_name="norm"
            )
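
        # Final output layer followed by a log-softmax over the classes.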
        self.append(
            sb.nnet.linear.Linear, n_neurons=out_neurons, layer_name="out"
        )
        self.append(
            sb.nnet.activations.Softmax(apply_log=True), layer_name="softmax"
        )