"""This file contains a very simple TDNN module to use for speaker-id.

To replace this model, change the `!new:` tag in the hyperparameter file
to refer to a built-in SpeechBrain model or another file containing
a custom PyTorch module.
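
For example, the recipe's hyperparameter YAML can point at this file with
something like the following (illustrative only; the key name, module path,
and argument values depend on the recipe):

    embedding_model: !new:custom_model.Xvector
        in_channels: 40
        lin_neurons: 512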

Authors
 * Nauman Dawalatabad 2020
 * Mirco Ravanelli 2020
"""

import torch
import torch.nn as nn

import speechbrain as sb
from speechbrain.nnet.CNN import Conv1d
from speechbrain.nnet.linear import Linear
from speechbrain.nnet.normalization import BatchNorm1d
from speechbrain.nnet.pooling import StatisticsPooling


class Xvector(torch.nn.Module):
    """This model extracts X-vectors for speaker recognition.

    Arguments
    ---------
    device : str
        The device to place this model on (e.g. "cpu" or "cuda").
    activation : torch class
        A class for constructing the activation layers.
    tdnn_blocks : int
        Number of time-delay neural network (TDNN) layers.
    tdnn_channels : list of ints
        Output channels for each TDNN layer.
    tdnn_kernel_sizes : list of ints
        List of kernel sizes for each TDNN layer.
    tdnn_dilations : list of ints
        List of dilations for the kernels in each TDNN layer.
    lin_neurons : int
        Number of neurons in the linear layers.
    in_channels : int
        Number of channels expected in the input.

    Example
    -------
    >>> compute_xvect = Xvector()
    >>> input_feats = torch.rand([5, 10, 40])
    >>> outputs = compute_xvect(input_feats)
    >>> outputs.shape
    torch.Size([5, 1, 512])
    """

    def __init__(
        self,
        device="cpu",
        activation=torch.nn.LeakyReLU,
        tdnn_blocks=5,
        tdnn_channels=[512, 512, 512, 512, 1500],
        tdnn_kernel_sizes=[5, 3, 3, 1, 1],
        tdnn_dilations=[1, 2, 3, 1, 1],
        lin_neurons=512,
        in_channels=40,
    ):
        super().__init__()
        self.blocks = nn.ModuleList()
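
        # TDNN blocks: each block applies Conv1d, an activation, and BatchNorm1d.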
        for block_index in range(tdnn_blocks):
            out_channels = tdnn_channels[block_index]
            self.blocks.extend(
                [
                    Conv1d(
                        in_channels=in_channels,
                        out_channels=out_channels,
                        kernel_size=tdnn_kernel_sizes[block_index],
                        dilation=tdnn_dilations[block_index],
                    ),
                    activation(),
                    BatchNorm1d(input_size=out_channels),
                ]
            )
            in_channels = tdnn_channels[block_index]
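
        # Statistics pooling aggregates the frame-level features over time.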
        self.blocks.append(StatisticsPooling())
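
        # Final linear layer; the pooled output has 2 * out_channels features
        # (mean and standard deviation are concatenated).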
        self.blocks.append(
            Linear(
                input_size=out_channels * 2,
                n_neurons=lin_neurons,
                bias=True,
                combine_dims=False,
            )
        )

    def forward(self, x, lens=None):
        """Returns the x-vectors.

        Arguments
        ---------
        x : torch.Tensor
            The input features for computation.
        lens : torch.Tensor
            The lengths of the corresponding inputs.

        Returns
        -------
        The computed x-vectors.
        """
        for layer in self.blocks:
            try:
                x = layer(x, lengths=lens)
            except TypeError:
                x = layer(x)
        return x


class Classifier(sb.nnet.containers.Sequential):
    """This class implements the last MLP on top of the xvector features.

    Arguments
    ---------
    input_shape : tuple
        Expected shape of an example input.
    activation : torch class
        A class for constructing the activation layers.
    lin_blocks : int
        Number of linear layers.
    lin_neurons : int
        Number of neurons in the linear layers.
    out_neurons : int
        Number of output neurons.

    Example
    -------
    >>> input_feats = torch.rand([5, 10, 40])
    >>> compute_xvect = Xvector()
    >>> xvects = compute_xvect(input_feats)
    >>> classify = Classifier(input_shape=xvects.shape)
    >>> output = classify(xvects)
    >>> output.shape
    torch.Size([5, 1, 1211])
    """

    def __init__(
        self,
        input_shape,
        activation=torch.nn.LeakyReLU,
        lin_blocks=1,
        lin_neurons=512,
        out_neurons=1211,
    ):
        super().__init__(input_shape=input_shape)

        self.append(activation(), layer_name="act")
        self.append(sb.nnet.normalization.BatchNorm1d, layer_name="norm")

        if lin_blocks > 0:
            self.append(sb.nnet.containers.Sequential, layer_name="DNN")
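
        # Fully-connected blocks: linear -> activation -> batch norm.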
        for block_index in range(lin_blocks):
            block_name = f"block_{block_index}"
            self.DNN.append(
                sb.nnet.containers.Sequential, layer_name=block_name
            )
            self.DNN[block_name].append(
                sb.nnet.linear.Linear,
                n_neurons=lin_neurons,
                bias=True,
                layer_name="linear",
            )
            self.DNN[block_name].append(activation(), layer_name="act")
            self.DNN[block_name].append(
                sb.nnet.normalization.BatchNorm1d, layer_name="norm"
            )
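
        # Final output layer followed by a log-softmax over the classes.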
        self.append(
            sb.nnet.linear.Linear, n_neurons=out_neurons, layer_name="out"
        )
        self.append(
            sb.nnet.activations.Softmax(apply_log=True), layer_name="softmax"
        )