jefson08
/

spkrec-ecapa-voxceleb-kha

Model card Files Files and versions Community

jefson08 committed on Aug 12, 2024

Commit

2eb2aec

verified ·

1 Parent(s): ccfb195

Upload 12 files

Browse files

Files changed (12) hide show

CKPT.yaml +5 -0
brain.ckpt +3 -0
classifier.ckpt +3 -0
counter.ckpt +3 -0
custom_model.py +192 -0
dataloader-TRAIN.ckpt +3 -0
embedding_model.ckpt +3 -0
hparams_inference.yaml +66 -0
label_encoder.ckpt +3 -0
label_encoder.txt +3 -0
normalizer.ckpt +3 -0
optimizer.ckpt +3 -0

CKPT.yaml ADDED Viewed

	@@ -0,0 +1,5 @@

+# yamllint disable
+end-of-epoch: true
+error: 0.0
+loss: 0.0
+unixtime: 1723354256.8637311

brain.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9b3e2d4fc38492317f6e0fbed726028b0547525b2b81982af8620a014b0f4f55
+size 50

classifier.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e715f9889c050197540458f09e0f44ec87ff200b84b3aa3782411180919b4b0f
+size 1129762

counter.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1a6562590ef19d1045d06c4055742d38288e9e6dcd71ccde5cee80f1d5a774eb
+size 2

custom_model.py ADDED Viewed

	@@ -0,0 +1,192 @@

+"""
+This file contains a very simple TDNN module to use for speaker-id.
+To replace this model, change the `!new:` tag in the hyperparameter file
+to refer to a built-in SpeechBrain model or another file containing
+a custom PyTorch module.
+Authors
+ * Nauman Dawalatabad 2020
+ * Mirco Ravanelli 2020
+"""
+import torch  # noqa: F401
+import torch.nn as nn
+import speechbrain as sb
+from speechbrain.nnet.CNN import Conv1d
+from speechbrain.nnet.linear import Linear
+from speechbrain.nnet.normalization import BatchNorm1d
+from speechbrain.nnet.pooling import StatisticsPooling
+class Xvector(torch.nn.Module):
+    """This model extracts X-vectors for speaker recognition
+    Arguments
+    ---------
+    device : str
+        The device to place this model on (e.g. "cpu" or "cuda")
+    activation : torch class
+        A class for constructing the activation layers.
+    tdnn_blocks : int
+        Number of time-delay neural (TDNN) layers.
+    tdnn_channels : list of ints
+        Output channels for TDNN layer.
+    tdnn_kernel_sizes : list of ints
+        List of kernel sizes for each TDNN layer.
+    tdnn_dilations : list of ints
+        List of dilations for kernels in each TDNN layer.
+    lin_neurons : int
+        Number of neurons in linear layers.
+    in_channels : int
+        Number of channels expected in the input.
+    Example
+    -------
+    >>> compute_xvect = Xvector()
+    >>> input_feats = torch.rand([5, 10, 40])
+    >>> outputs = compute_xvect(input_feats)
+    >>> outputs.shape
+    torch.Size([5, 1, 512])
+    """
+    def __init__(
+        self,
+        device="cpu",
+        activation=torch.nn.LeakyReLU,
+        tdnn_blocks=5,
+        tdnn_channels=[512, 512, 512, 512, 1500],
+        tdnn_kernel_sizes=[5, 3, 3, 1, 1],
+        tdnn_dilations=[1, 2, 3, 1, 1],
+        lin_neurons=512,
+        in_channels=40,
+    ):
+        super().__init__()
+        self.blocks = nn.ModuleList()
+        # TDNN has convolutional layers with the given dilation factors
+        # and kernel sizes. We here loop over all the convolutional layers
+        # that we wanna add. Note that batch normalization is used after
+        # the activations function in this case. This improves the
+        # speaker-id performance a bit.
+        for block_index in range(tdnn_blocks):
+            out_channels = tdnn_channels[block_index]
+            self.blocks.extend(
+                [
+                    Conv1d(
+                        in_channels=in_channels,
+                        out_channels=out_channels,
+                        kernel_size=tdnn_kernel_sizes[block_index],
+                        dilation=tdnn_dilations[block_index],
+                    ),
+                    activation(),
+                    BatchNorm1d(input_size=out_channels),
+                ]
+            )
+            in_channels = tdnn_channels[block_index]
+        # Statistical pooling. It converts a tensor of variable length
+        # into a fixed-length tensor. The statistical pooling returns the
+        # mean and the standard deviation.
+        self.blocks.append(StatisticsPooling())
+        # Final linear transformation.
+        self.blocks.append(
+            Linear(
+                input_size=out_channels * 2,  # mean + std,
+                n_neurons=lin_neurons,
+                bias=True,
+                combine_dims=False,
+            )
+        )
+    def forward(self, x, lens=None):
+        """Returns the x-vectors.
+        Arguments
+        ---------
+        x : torch.Tensor
+            The input features for computation.
+        lens : torch.Tensor
+            The length of the corresponding inputs.
+        Returns
+        -------
+        The computed x-vectors
+        """
+        for layer in self.blocks:
+            try:
+                x = layer(x, lengths=lens)
+            except TypeError:
+                x = layer(x)
+        return x
+class Classifier(sb.nnet.containers.Sequential):
+    """This class implements the last MLP on the top of xvector features.
+    Arguments
+    ---------
+    input_shape : tuple
+        Expected shape of an example input.
+    activation : torch class
+        A class for constructing the activation layers.
+    lin_blocks : int
+        Number of linear layers.
+    lin_neurons : int
+        Number of neurons in linear layers.
+    out_neurons : int
+        Number of output neurons.
+    Example
+    -------
+    >>> input_feats = torch.rand([5, 10, 40])
+    >>> compute_xvect = Xvector()
+    >>> xvects = compute_xvect(input_feats)
+    >>> classify = Classifier(input_shape=xvects.shape)
+    >>> output = classify(xvects)
+    >>> output.shape
+    torch.Size([5, 1, 1211])
+    """
+    def __init__(
+        self,
+        input_shape,
+        activation=torch.nn.LeakyReLU,
+        lin_blocks=1,
+        lin_neurons=512,
+        out_neurons=1211,
+    ):
+        super().__init__(input_shape=input_shape)
+        self.append(activation(), layer_name="act")
+        self.append(sb.nnet.normalization.BatchNorm1d, layer_name="norm")
+        if lin_blocks > 0:
+            self.append(sb.nnet.containers.Sequential, layer_name="DNN")
+        # Adding fully-connected layers
+        for block_index in range(lin_blocks):
+            block_name = f"block_{block_index}"
+            self.DNN.append(
+                sb.nnet.containers.Sequential, layer_name=block_name
+            )
+            self.DNN[block_name].append(
+                sb.nnet.linear.Linear,
+                n_neurons=lin_neurons,
+                bias=True,
+                layer_name="linear",
+            )
+            self.DNN[block_name].append(activation(), layer_name="act")
+            self.DNN[block_name].append(
+                sb.nnet.normalization.BatchNorm1d, layer_name="norm"
+            )
+        # Final Softmax classifier
+        self.append(
+            sb.nnet.linear.Linear, n_neurons=out_neurons, layer_name="out"
+        )
+        self.append(
+            sb.nnet.activations.Softmax(apply_log=True), layer_name="softmax"
+        )

dataloader-TRAIN.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:188c1fdca79d927f6e812133173fc41d3a4e57074de521020274caa9bb29af7d
+size 3

embedding_model.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7fa25e49c7ef01eaaa3729d3747cc7e0c65488182adf6665a775325426290c7b
+size 16877295

hparams_inference.yaml ADDED Viewed

	@@ -0,0 +1,66 @@

+# #################################
+# Basic inference parameters for speaker-id. First, a network computes
+# the embeddings; on top of that, a classifier is applied.
+#
+# Author:
+#  * Mirco Ravanelli 2021
+# #################################
+# pretrain folders:
+pretrained_path: best_model/
+# Model parameters
+n_mels: 23
+sample_rate: 16000
+n_classes: 28 # In this case, we have 28 speakers
+emb_dim: 512 # dimensionality of the embeddings
+# Feature extraction
+compute_features: !new:speechbrain.lobes.features.Fbank
+    n_mels: !ref <n_mels>
+# Mean and std normalization of the input features
+mean_var_norm: !new:speechbrain.processing.features.InputNormalization
+    norm_type: sentence
+    std_norm: False
+# To design a custom model, either edit the Xvector class in
+# custom_model.py that is referenced here, or replace this `!new` call
+# with one pointing to a different file you've defined.
+embedding_model: !new:custom_model.Xvector
+    in_channels: !ref <n_mels>
+    activation: !name:torch.nn.LeakyReLU
+    tdnn_blocks: 5
+    tdnn_channels: [512, 512, 512, 512, 1500]
+    tdnn_kernel_sizes: [5, 3, 3, 1, 1]
+    tdnn_dilations: [1, 2, 3, 1, 1]
+    lin_neurons: !ref <emb_dim>
+classifier: !new:custom_model.Classifier
+    input_shape: [null, null, !ref <emb_dim>]
+    activation: !name:torch.nn.LeakyReLU
+    lin_blocks: 1
+    lin_neurons: !ref <emb_dim>
+    out_neurons: !ref <n_classes>
+label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
+# Objects in "modules" dict will have their parameters moved to the correct
+# device, as well as having train()/eval() called on them by the Brain class.
+modules:
+    compute_features: !ref <compute_features>
+    embedding_model: !ref <embedding_model>
+    classifier: !ref <classifier>
+    mean_var_norm: !ref <mean_var_norm>
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+    loadables:
+        embedding_model: !ref <embedding_model>
+        classifier: !ref <classifier>
+        label_encoder: !ref <label_encoder>
+    paths:
+        embedding_model: !ref <pretrained_path>/embedding_model.ckpt
+        classifier: !ref <pretrained_path>/classifier.ckpt
+        label_encoder: !ref <pretrained_path>/label_encoder.txt

label_encoder.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d79427b98a078b81f68aa4fdf2bd3f12fc9383513a6d444170ad5e2123ab7ff0
+size 49

label_encoder.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+'19' => 0
+================
+'starting_index' => 0

normalizer.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b98d47768bfba78eaa8a052f7f3e864308f5fff7e34051c8cb2adfef9f451948
+size 1578

optimizer.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a9273e52743e2bb142bb2f89cdfa0fce5e509c3d0efaa42a34770b3a3fb8c653
+size 35929860