jefson08 committed
Commit 2eb2aec · verified · 1 Parent(s): ccfb195

Upload 12 files

CKPT.yaml ADDED
@@ -0,0 +1,5 @@
+# yamllint disable
+end-of-epoch: true
+error: 0.0
+loss: 0.0
+unixtime: 1723354256.8637311
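CKPT.yaml is the small metadata file SpeechBrain's checkpointer writes next to the saved tensors: an end-of-epoch flag, the recorded error and loss, and the save time as a Unix timestamp. Since it is plain YAML, it can be inspected directly; a minimal sketch:

```python
import yaml

# Read the checkpoint metadata shown above (any YAML parser works,
# since the file is just key-value pairs).
with open("CKPT.yaml") as f:
    meta = yaml.safe_load(f)

print(meta["end-of-epoch"])  # True
print(meta["unixtime"])      # 1723354256.8637311 (save time, seconds since epoch)
```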
brain.ckpt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b3e2d4fc38492317f6e0fbed726028b0547525b2b81982af8620a014b0f4f55
+size 50
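This and the other `.ckpt` entries below are Git LFS pointer files, not the checkpoints themselves: each records the pointer-spec version, the SHA-256 of the real blob, and its size in bytes, and `git lfs pull` swaps in the actual data. A sketch of reading such a pointer (the `parse_lfs_pointer` helper is hypothetical, not a library API):

```python
def parse_lfs_pointer(path: str) -> dict:
    # Illustrative helper: a Git LFS pointer is just "key value" lines,
    # e.g. "oid sha256:<hex>" and "size <bytes>".
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

ptr = parse_lfs_pointer("brain.ckpt")
print(ptr["oid"])   # sha256:9b3e2d4f...
print(ptr["size"])  # 50 (size in bytes of the real file)
```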
classifier.ckpt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e715f9889c050197540458f09e0f44ec87ff200b84b3aa3782411180919b4b0f
+size 1129762
counter.ckpt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a6562590ef19d1045d06c4055742d38288e9e6dcd71ccde5cee80f1d5a774eb
+size 2
custom_model.py ADDED
@@ -0,0 +1,192 @@
+"""
+This file contains a very simple TDNN module to use for speaker-id.
+
+To replace this model, change the `!new:` tag in the hyperparameter file
+to refer to a built-in SpeechBrain model or another file containing
+a custom PyTorch module.
+
+Authors
+ * Nauman Dawalatabad 2020
+ * Mirco Ravanelli 2020
+"""
+
+import torch
+import torch.nn as nn
+
+import speechbrain as sb
+from speechbrain.nnet.CNN import Conv1d
+from speechbrain.nnet.linear import Linear
+from speechbrain.nnet.normalization import BatchNorm1d
+from speechbrain.nnet.pooling import StatisticsPooling
+
+
+class Xvector(torch.nn.Module):
+    """This model extracts X-vectors for speaker recognition.
+
+    Arguments
+    ---------
+    device : str
+        The device to place this model on (e.g. "cpu" or "cuda").
+    activation : torch class
+        A class for constructing the activation layers.
+    tdnn_blocks : int
+        Number of time-delay neural (TDNN) layers.
+    tdnn_channels : list of ints
+        Output channels for each TDNN layer.
+    tdnn_kernel_sizes : list of ints
+        List of kernel sizes for each TDNN layer.
+    tdnn_dilations : list of ints
+        List of dilations for kernels in each TDNN layer.
+    lin_neurons : int
+        Number of neurons in linear layers.
+    in_channels : int
+        Number of channels expected in the input.
+
+    Example
+    -------
+    >>> compute_xvect = Xvector()
+    >>> input_feats = torch.rand([5, 10, 40])
+    >>> outputs = compute_xvect(input_feats)
+    >>> outputs.shape
+    torch.Size([5, 1, 512])
+    """
+
+    def __init__(
+        self,
+        device="cpu",
+        activation=torch.nn.LeakyReLU,
+        tdnn_blocks=5,
+        tdnn_channels=[512, 512, 512, 512, 1500],
+        tdnn_kernel_sizes=[5, 3, 3, 1, 1],
+        tdnn_dilations=[1, 2, 3, 1, 1],
+        lin_neurons=512,
+        in_channels=40,
+    ):
+
+        super().__init__()
+        self.blocks = nn.ModuleList()
+
+        # TDNN has convolutional layers with the given dilation factors
+        # and kernel sizes. Here we loop over all the convolutional layers
+        # that we want to add. Note that batch normalization is applied after
+        # the activation function in this case. This improves the
+        # speaker-id performance a bit.
+        for block_index in range(tdnn_blocks):
+            out_channels = tdnn_channels[block_index]
+            self.blocks.extend(
+                [
+                    Conv1d(
+                        in_channels=in_channels,
+                        out_channels=out_channels,
+                        kernel_size=tdnn_kernel_sizes[block_index],
+                        dilation=tdnn_dilations[block_index],
+                    ),
+                    activation(),
+                    BatchNorm1d(input_size=out_channels),
+                ]
+            )
+            in_channels = tdnn_channels[block_index]
+
+        # Statistical pooling. It converts a tensor of variable length
+        # into a fixed-length tensor. The statistical pooling returns the
+        # mean and the standard deviation.
+        self.blocks.append(StatisticsPooling())
+
+        # Final linear transformation.
+        self.blocks.append(
+            Linear(
+                input_size=out_channels * 2,  # mean + std
+                n_neurons=lin_neurons,
+                bias=True,
+                combine_dims=False,
+            )
+        )
+
+    def forward(self, x, lens=None):
+        """Returns the x-vectors.
+
+        Arguments
+        ---------
+        x : torch.Tensor
+            The input features for computation.
+        lens : torch.Tensor
+            The length of the corresponding inputs.
+
+        Returns
+        -------
+        The computed x-vectors.
+        """
+        for layer in self.blocks:
+            try:
+                x = layer(x, lengths=lens)
+            except TypeError:
+                x = layer(x)
+        return x
+
+
+class Classifier(sb.nnet.containers.Sequential):
+    """This class implements the final MLP on top of the xvector features.
+    Arguments
+    ---------
+    input_shape : tuple
+        Expected shape of an example input.
+    activation : torch class
+        A class for constructing the activation layers.
+    lin_blocks : int
+        Number of linear layers.
+    lin_neurons : int
+        Number of neurons in linear layers.
+    out_neurons : int
+        Number of output neurons.
+
+    Example
+    -------
+    >>> input_feats = torch.rand([5, 10, 40])
+    >>> compute_xvect = Xvector()
+    >>> xvects = compute_xvect(input_feats)
+    >>> classify = Classifier(input_shape=xvects.shape)
+    >>> output = classify(xvects)
+    >>> output.shape
+    torch.Size([5, 1, 1211])
+    """
+
+    def __init__(
+        self,
+        input_shape,
+        activation=torch.nn.LeakyReLU,
+        lin_blocks=1,
+        lin_neurons=512,
+        out_neurons=1211,
+    ):
+        super().__init__(input_shape=input_shape)
+
+        self.append(activation(), layer_name="act")
+        self.append(sb.nnet.normalization.BatchNorm1d, layer_name="norm")
+
+        if lin_blocks > 0:
+            self.append(sb.nnet.containers.Sequential, layer_name="DNN")
+
+        # Adding fully-connected layers
+        for block_index in range(lin_blocks):
+            block_name = f"block_{block_index}"
+            self.DNN.append(
+                sb.nnet.containers.Sequential, layer_name=block_name
+            )
+            self.DNN[block_name].append(
+                sb.nnet.linear.Linear,
+                n_neurons=lin_neurons,
+                bias=True,
+                layer_name="linear",
+            )
+            self.DNN[block_name].append(activation(), layer_name="act")
+            self.DNN[block_name].append(
+                sb.nnet.normalization.BatchNorm1d, layer_name="norm"
+            )
+
+        # Final Softmax classifier
+        self.append(
+            sb.nnet.linear.Linear, n_neurons=out_neurons, layer_name="out"
+        )
+        self.append(
+            sb.nnet.activations.Softmax(apply_log=True), layer_name="softmax"
+        )
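Taken together, the two classes form the embedding/classification pipeline used by the hparams file below. A minimal usage sketch that mirrors the doctest examples above (random features stand in for real Fbank output):

```python
import torch
from custom_model import Xvector, Classifier

# Fake batch: 5 utterances, 10 frames, 40 features (matches in_channels=40).
feats = torch.rand([5, 10, 40])

embedder = Xvector()
xvects = embedder(feats)                    # -> [5, 1, 512] x-vectors

classifier = Classifier(input_shape=xvects.shape)
log_probs = classifier(xvects)              # log-softmax over 1211 classes
print(log_probs.shape)                      # torch.Size([5, 1, 1211])
```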
dataloader-TRAIN.ckpt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:188c1fdca79d927f6e812133173fc41d3a4e57074de521020274caa9bb29af7d
+size 3
embedding_model.ckpt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7fa25e49c7ef01eaaa3729d3747cc7e0c65488182adf6665a775325426290c7b
+size 16877295
hparams_inference.yaml ADDED
@@ -0,0 +1,66 @@
+
+# #################################
+# Basic inference parameters for speaker-id. We first have a network that
+# computes some embeddings. On top of that, we employ a classifier.
+#
+# Author:
+# * Mirco Ravanelli 2021
+# #################################
+
+# pretrain folders:
+pretrained_path: best_model/
+
+
+# Model parameters
+n_mels: 23
+sample_rate: 16000
+n_classes: 28 # In this case, we have 28 speakers
+emb_dim: 512 # dimensionality of the embeddings
+
+# Feature extraction
+compute_features: !new:speechbrain.lobes.features.Fbank
+    n_mels: !ref <n_mels>
+
+# Mean and std normalization of the input features
+mean_var_norm: !new:speechbrain.processing.features.InputNormalization
+    norm_type: sentence
+    std_norm: False
+
+# To design a custom model, either just edit the simple Xvector
+# class defined in custom_model.py, or replace this `!new` call with a line
+# pointing to a different file you've defined.
+embedding_model: !new:custom_model.Xvector
+    in_channels: !ref <n_mels>
+    activation: !name:torch.nn.LeakyReLU
+    tdnn_blocks: 5
+    tdnn_channels: [512, 512, 512, 512, 1500]
+    tdnn_kernel_sizes: [5, 3, 3, 1, 1]
+    tdnn_dilations: [1, 2, 3, 1, 1]
+    lin_neurons: !ref <emb_dim>
+
+classifier: !new:custom_model.Classifier
+    input_shape: [null, null, !ref <emb_dim>]
+    activation: !name:torch.nn.LeakyReLU
+    lin_blocks: 1
+    lin_neurons: !ref <emb_dim>
+    out_neurons: !ref <n_classes>
+
+label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
+
+# Objects in the "modules" dict will have their parameters moved to the correct
+# device, as well as having train()/eval() called on them by the Brain class.
+modules:
+    compute_features: !ref <compute_features>
+    embedding_model: !ref <embedding_model>
+    classifier: !ref <classifier>
+    mean_var_norm: !ref <mean_var_norm>
+
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+    loadables:
+        embedding_model: !ref <embedding_model>
+        classifier: !ref <classifier>
+        label_encoder: !ref <label_encoder>
+    paths:
+        embedding_model: !ref <pretrained_path>/embedding_model.ckpt
+        classifier: !ref <pretrained_path>/classifier.ckpt
+        label_encoder: !ref <pretrained_path>/label_encoder.txt
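The YAML above is everything needed for standalone inference. A minimal sketch of wiring it up with `hyperpyyaml` (the file name `example.wav` and the assumption of a 16 kHz mono recording are illustrative; the Pretrainer calls follow the usual SpeechBrain pattern):

```python
import torch
import torchaudio
from hyperpyyaml import load_hyperpyyaml

# Load the inference hyperparameters; the !new: tags instantiate the modules.
with open("hparams_inference.yaml") as f:
    hparams = load_hyperpyyaml(f)

# Fetch and load the pretrained weights listed under `pretrainer`.
hparams["pretrainer"].collect_files()
hparams["pretrainer"].load_collected()

# Inference mode, so BatchNorm uses its running statistics.
hparams["embedding_model"].eval()
hparams["classifier"].eval()

# Classify one utterance: features -> normalization -> x-vector -> classes.
signal, fs = torchaudio.load("example.wav")  # hypothetical 16 kHz mono file
feats = hparams["compute_features"](signal)
feats = hparams["mean_var_norm"](feats, torch.ones([1]))
embeddings = hparams["embedding_model"](feats)
log_probs = hparams["classifier"](embeddings)  # log-softmax over 28 speakers
print(log_probs.argmax(dim=-1))
```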
label_encoder.ckpt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d79427b98a078b81f68aa4fdf2bd3f12fc9383513a6d444170ad5e2123ab7ff0
+size 49
label_encoder.txt ADDED
@@ -0,0 +1,3 @@
+'19' => 0
+================
+'starting_index' => 0
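label_encoder.txt is the saved state of the `CategoricalEncoder` declared in hparams_inference.yaml: one `'label' => index` pair per line, with encoder metadata such as `starting_index` after the `================` separator. A sketch of restoring it (method names follow SpeechBrain's `CategoricalEncoder`; treat the exact calls as assumptions):

```python
from speechbrain.dataio.encoder import CategoricalEncoder

encoder = CategoricalEncoder()
encoder.load("label_encoder.txt")  # restores the label <-> index mapping

print(encoder.encode_label("19"))  # 0
print(encoder.decode_ndim([0]))    # ['19']
```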
normalizer.ckpt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b98d47768bfba78eaa8a052f7f3e864308f5fff7e34051c8cb2adfef9f451948
+size 1578
optimizer.ckpt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9273e52743e2bb142bb2f89cdfa0fce5e509c3d0efaa42a34770b3a3fb8c653
+size 35929860