Upload 12 files
Browse files- CKPT.yaml +5 -0
- brain.ckpt +3 -0
- classifier.ckpt +3 -0
- counter.ckpt +3 -0
- custom_model.py +192 -0
- dataloader-TRAIN.ckpt +3 -0
- embedding_model.ckpt +3 -0
- hparams_inference.yaml +66 -0
- label_encoder.ckpt +3 -0
- label_encoder.txt +3 -0
- normalizer.ckpt +3 -0
- optimizer.ckpt +3 -0
CKPT.yaml
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# yamllint disable
|
2 |
+
end-of-epoch: true
|
3 |
+
error: 0.0
|
4 |
+
loss: 0.0
|
5 |
+
unixtime: 1723354256.8637311
|
brain.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9b3e2d4fc38492317f6e0fbed726028b0547525b2b81982af8620a014b0f4f55
|
3 |
+
size 50
|
classifier.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e715f9889c050197540458f09e0f44ec87ff200b84b3aa3782411180919b4b0f
|
3 |
+
size 1129762
|
counter.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1a6562590ef19d1045d06c4055742d38288e9e6dcd71ccde5cee80f1d5a774eb
|
3 |
+
size 2
|
custom_model.py
ADDED
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This file contains a very simple TDNN module to use for speaker-id.
|
3 |
+
|
4 |
+
To replace this model, change the `!new:` tag in the hyperparameter file
|
5 |
+
to refer to a built-in SpeechBrain model or another file containing
|
6 |
+
a custom PyTorch module.
|
7 |
+
|
8 |
+
Authors
|
9 |
+
* Nauman Dawalatabad 2020
|
10 |
+
* Mirco Ravanelli 2020
|
11 |
+
"""
|
12 |
+
|
13 |
+
import torch # noqa: F401
|
14 |
+
import torch.nn as nn
|
15 |
+
|
16 |
+
import speechbrain as sb
|
17 |
+
from speechbrain.nnet.CNN import Conv1d
|
18 |
+
from speechbrain.nnet.linear import Linear
|
19 |
+
from speechbrain.nnet.normalization import BatchNorm1d
|
20 |
+
from speechbrain.nnet.pooling import StatisticsPooling
|
21 |
+
|
22 |
+
|
23 |
+
class Xvector(torch.nn.Module):
    """This model extracts X-vectors for speaker recognition.

    The network is a stack of TDNN blocks (Conv1d -> activation ->
    BatchNorm1d), followed by statistics pooling and a final linear layer.

    Arguments
    ---------
    device : str
        Kept for backward compatibility (e.g. "cpu" or "cuda"). The
        constructor does not use this value.
    activation : torch class
        A class for constructing the activation layers.
    tdnn_blocks : int
        Number of time-delay neural (TDNN) layers.
    tdnn_channels : sequence of ints
        Output channels for each TDNN layer.
    tdnn_kernel_sizes : sequence of ints
        Kernel size for each TDNN layer.
    tdnn_dilations : sequence of ints
        Dilation of the kernel in each TDNN layer.
    lin_neurons : int
        Number of neurons in the final linear layer.
    in_channels : int
        Number of channels expected in the input.

    Example
    -------
    >>> compute_xvect = Xvector()
    >>> input_feats = torch.rand([5, 10, 40])
    >>> outputs = compute_xvect(input_feats)
    >>> outputs.shape
    torch.Size([5, 1, 512])
    """

    def __init__(
        self,
        device="cpu",
        activation=torch.nn.LeakyReLU,
        tdnn_blocks=5,
        # Tuples instead of lists: the values are only indexed, and immutable
        # defaults cannot be shared-and-mutated across instances.
        tdnn_channels=(512, 512, 512, 512, 1500),
        tdnn_kernel_sizes=(5, 3, 3, 1, 1),
        tdnn_dilations=(1, 2, 3, 1, 1),
        lin_neurons=512,
        in_channels=40,
    ):
        super().__init__()
        self.blocks = nn.ModuleList()

        # TDNN has convolutional layers with the given dilation factors
        # and kernel sizes. We loop over all the convolutional layers that
        # we want to add. Note that batch normalization is applied after
        # the activation function in this case.
        for block_index in range(tdnn_blocks):
            out_channels = tdnn_channels[block_index]
            self.blocks.extend(
                [
                    Conv1d(
                        in_channels=in_channels,
                        out_channels=out_channels,
                        kernel_size=tdnn_kernel_sizes[block_index],
                        dilation=tdnn_dilations[block_index],
                    ),
                    activation(),
                    BatchNorm1d(input_size=out_channels),
                ]
            )
            # The next block consumes what this block produced.
            in_channels = out_channels

        # Statistical pooling. It converts a tensor of variable length
        # into a fixed-length tensor. The statistical pooling returns the
        # mean and the standard deviation.
        self.blocks.append(StatisticsPooling())

        # Final linear transformation. `in_channels` now holds the width of
        # the last TDNN block (or the raw input width when tdnn_blocks == 0,
        # a case where the previous `out_channels` reference was unbound).
        self.blocks.append(
            Linear(
                input_size=in_channels * 2,  # mean + std
                n_neurons=lin_neurons,
                bias=True,
                combine_dims=False,
            )
        )

    @staticmethod
    def _accepts_lengths(layer):
        """Tell whether ``layer.forward`` can be called with ``lengths=``."""
        import inspect  # local import so the file-level imports stay as-is

        try:
            params = inspect.signature(layer.forward).parameters
        except (TypeError, ValueError):
            # No introspectable signature: fall back to the plain call.
            return False
        if "lengths" in params:
            return True
        return any(
            p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()
        )

    def forward(self, x, lens=None):
        """Returns the x-vectors.

        Arguments
        ---------
        x : torch.Tensor
            The input features for computation.
        lens : torch.Tensor
            The length of the corresponding inputs. It is forwarded, as
            ``lengths``, only to the layers whose ``forward`` accepts it.

        Returns
        -------
        The computed x-vectors
        """
        for layer in self.blocks:
            # Decide by signature rather than by catching TypeError, so a
            # genuine TypeError raised inside a layer is not silently
            # swallowed and retried without the lengths.
            if self._accepts_lengths(layer):
                x = layer(x, lengths=lens)
            else:
                x = layer(x)
        return x
|
125 |
+
|
126 |
+
|
127 |
+
class Classifier(sb.nnet.containers.Sequential):
    """MLP head placed on top of the x-vector features.

    The layers are registered in this order: an activation (``act``), a
    batch normalization (``norm``), an optional ``DNN`` container holding
    ``lin_blocks`` sub-blocks (``linear`` -> ``act`` -> ``norm``), an output
    linear layer (``out``) and a log-softmax (``softmax``).

    Arguments
    ---------
    input_shape : tuple
        Expected shape of an example input.
    activation : torch class
        A class for constructing the activation layers.
    lin_blocks : int
        Number of linear layers.
    lin_neurons : int
        Number of neurons in linear layers.
    out_neurons : int
        Number of output neurons.

    Example
    -------
    >>> input_feats = torch.rand([5, 10, 40])
    >>> compute_xvect = Xvector()
    >>> xvects = compute_xvect(input_feats)
    >>> classify = Classifier(input_shape=xvects.shape)
    >>> output = classify(xvects)
    >>> output.shape
    torch.Size([5, 1, 1211])
    """

    def __init__(
        self,
        input_shape,
        activation=torch.nn.LeakyReLU,
        lin_blocks=1,
        lin_neurons=512,
        out_neurons=1211,
    ):
        super().__init__(input_shape=input_shape)

        # Short local handles for the SpeechBrain building blocks used below.
        # The layer classes are handed to `append` uninstantiated, together
        # with their constructor keyword arguments.
        container_cls = sb.nnet.containers.Sequential
        norm_cls = sb.nnet.normalization.BatchNorm1d
        linear_cls = sb.nnet.linear.Linear

        # Input stage: activation followed by batch normalization.
        self.append(activation(), layer_name="act")
        self.append(norm_cls, layer_name="norm")

        # The hidden container only exists when at least one block is asked for.
        if lin_blocks > 0:
            self.append(container_cls, layer_name="DNN")

        # Fully-connected hidden blocks: linear -> activation -> batch norm.
        for idx in range(lin_blocks):
            name = f"block_{idx}"
            self.DNN.append(container_cls, layer_name=name)
            hidden = self.DNN[name]
            hidden.append(
                linear_cls,
                n_neurons=lin_neurons,
                bias=True,
                layer_name="linear",
            )
            hidden.append(activation(), layer_name="act")
            hidden.append(norm_cls, layer_name="norm")

        # Output projection followed by the log-softmax classifier.
        self.append(linear_cls, n_neurons=out_neurons, layer_name="out")
        self.append(
            sb.nnet.activations.Softmax(apply_log=True),
            layer_name="softmax",
        )
|
dataloader-TRAIN.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:188c1fdca79d927f6e812133173fc41d3a4e57074de521020274caa9bb29af7d
|
3 |
+
size 3
|
embedding_model.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7fa25e49c7ef01eaaa3729d3747cc7e0c65488182adf6665a775325426290c7b
|
3 |
+
size 16877295
|
hparams_inference.yaml
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# #################################
|
3 |
+
# Basic inference parameters for speaker-id. We have first a network that
|
4 |
+
# computes some embeddings. On the top of that, we employ a classifier.
|
5 |
+
#
|
6 |
+
# Author:
|
7 |
+
# * Mirco Ravanelli 2021
|
8 |
+
# #################################
|
9 |
+
|
10 |
+
# pretrain folders:
|
11 |
+
pretrained_path: best_model/
|
12 |
+
|
13 |
+
|
14 |
+
# Model parameters
|
15 |
+
n_mels: 23
|
16 |
+
sample_rate: 16000
|
17 |
+
n_classes: 28 # In this case, we have 28 speakers
|
18 |
+
emb_dim: 512 # dimensionality of the embeddings
|
19 |
+
|
20 |
+
# Feature extraction
|
21 |
+
compute_features: !new:speechbrain.lobes.features.Fbank
|
22 |
+
n_mels: !ref <n_mels>
|
23 |
+
|
24 |
+
# Mean and std normalization of the input features
|
25 |
+
mean_var_norm: !new:speechbrain.processing.features.InputNormalization
|
26 |
+
norm_type: sentence
|
27 |
+
std_norm: False
|
28 |
+
|
29 |
+
# To design a custom model, either just edit the simple Xvector
|
30 |
+
# class that's listed here, or replace this `!new` call with a line
|
31 |
+
# pointing to a different file you've defined.
|
32 |
+
embedding_model: !new:custom_model.Xvector
|
33 |
+
in_channels: !ref <n_mels>
|
34 |
+
activation: !name:torch.nn.LeakyReLU
|
35 |
+
tdnn_blocks: 5
|
36 |
+
tdnn_channels: [512, 512, 512, 512, 1500]
|
37 |
+
tdnn_kernel_sizes: [5, 3, 3, 1, 1]
|
38 |
+
tdnn_dilations: [1, 2, 3, 1, 1]
|
39 |
+
lin_neurons: !ref <emb_dim>
|
40 |
+
|
41 |
+
classifier: !new:custom_model.Classifier
|
42 |
+
input_shape: [null, null, !ref <emb_dim>]
|
43 |
+
activation: !name:torch.nn.LeakyReLU
|
44 |
+
lin_blocks: 1
|
45 |
+
lin_neurons: !ref <emb_dim>
|
46 |
+
out_neurons: !ref <n_classes>
|
47 |
+
|
48 |
+
label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
|
49 |
+
|
50 |
+
# Objects in "modules" dict will have their parameters moved to the correct
|
51 |
+
# device, as well as having train()/eval() called on them by the Brain class.
|
52 |
+
modules:
|
53 |
+
compute_features: !ref <compute_features>
|
54 |
+
embedding_model: !ref <embedding_model>
|
55 |
+
classifier: !ref <classifier>
|
56 |
+
mean_var_norm: !ref <mean_var_norm>
|
57 |
+
|
58 |
+
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
|
59 |
+
loadables:
|
60 |
+
embedding_model: !ref <embedding_model>
|
61 |
+
classifier: !ref <classifier>
|
62 |
+
label_encoder: !ref <label_encoder>
|
63 |
+
paths:
|
64 |
+
embedding_model: !ref <pretrained_path>/embedding_model.ckpt
|
65 |
+
classifier: !ref <pretrained_path>/classifier.ckpt
|
66 |
+
label_encoder: !ref <pretrained_path>/label_encoder.txt
|
label_encoder.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d79427b98a078b81f68aa4fdf2bd3f12fc9383513a6d444170ad5e2123ab7ff0
|
3 |
+
size 49
|
label_encoder.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
'19' => 0
|
2 |
+
================
|
3 |
+
'starting_index' => 0
|
normalizer.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b98d47768bfba78eaa8a052f7f3e864308f5fff7e34051c8cb2adfef9f451948
|
3 |
+
size 1578
|
optimizer.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a9273e52743e2bb142bb2f89cdfa0fce5e509c3d0efaa42a34770b3a3fb8c653
|
3 |
+
size 35929860
|