poonehmousavi committed on
Commit
81569b0
1 Parent(s): 70b2a7b

Update hyperparams.yaml

Browse files
Files changed (1) hide show
  1. hyperparams.yaml +76 -90
hyperparams.yaml CHANGED
@@ -1,5 +1,5 @@
1
  # ################################
2
- # Model: Transducer ASR
3
  # Augmentation: SpecAugment
4
  # Authors: Pooneh Mousavi 2023
5
  # ################################
@@ -9,41 +9,32 @@ n_fft: 400
9
  n_mels: 80
10
 
11
  # Model parameters
12
- activation: !name:torch.nn.LeakyReLU
13
- dropout: 0.15
14
- cnn_blocks: 3
15
- cnn_channels: (128, 200, 256)
16
- inter_layer_pooling_size: (2, 2, 2)
17
- cnn_kernelsize: (3, 3)
18
- time_pooling_size: 4
19
- rnn_class: !name:speechbrain.nnet.RNN.LSTM
20
- rnn_layers: 5
21
- rnn_neurons: 1024
22
- rnn_bidirectional: True
23
- dnn_blocks: 2
24
- dnn_neurons: 1024
25
- dec_neurons: 1024
26
- joint_dim: 1024
27
 
28
  # Outputs
29
- output_neurons: 1000 # BPE size, index(blank/eos/bos) = 0
30
- # transducer_beam_search : True
31
- # Decoding parameters
32
- # Be sure that the bos and eos index match with the BPEs ones
33
  blank_index: 0
34
- bos_index: 0
35
- eos_index: 0
 
 
36
 
 
37
  min_decode_ratio: 0.0
38
  max_decode_ratio: 1.0
39
- beam_size: 4
40
- nbest: 1
41
- # by default {state,expand}_beam = 2.3 as mention in paper
42
- # https://arxiv.org/abs/1904.02619
43
- state_beam: 2.3
44
- expand_beam: 2.3
45
- transducer_beam_search: True
46
-
47
 
48
  normalizer: !new:speechbrain.processing.features.InputNormalization
49
  norm_type: global
@@ -53,61 +44,53 @@ compute_features: !new:speechbrain.lobes.features.Fbank
53
  n_fft: !ref <n_fft>
54
  n_mels: !ref <n_mels>
55
 
56
- enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
57
- input_shape: [null, null, !ref <n_mels>]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  activation: !ref <activation>
59
- dropout: !ref <dropout>
60
- cnn_blocks: !ref <cnn_blocks>
61
- cnn_channels: !ref <cnn_channels>
62
- cnn_kernelsize: !ref <cnn_kernelsize>
63
- inter_layer_pooling_size: !ref <inter_layer_pooling_size>
64
- time_pooling: True
65
- using_2d_pooling: False
66
- time_pooling_size: !ref <time_pooling_size>
67
- rnn_class: !ref <rnn_class>
68
- rnn_layers: !ref <rnn_layers>
69
- rnn_neurons: !ref <rnn_neurons>
70
- rnn_bidirectional: !ref <rnn_bidirectional>
71
- rnn_re_init: True
72
- dnn_blocks: !ref <dnn_blocks>
73
- dnn_neurons: !ref <dnn_neurons>
74
-
75
- enc_lin: !new:speechbrain.nnet.linear.Linear
76
- input_size: !ref <dnn_neurons>
77
- n_neurons: !ref <joint_dim>
78
-
79
- emb: !new:speechbrain.nnet.embedding.Embedding
80
- num_embeddings: !ref <output_neurons>
81
- consider_as_one_hot: True
82
- blank_id: !ref <blank_index>
83
-
84
- dec: !new:speechbrain.nnet.RNN.GRU
85
- input_shape: [null, null, !ref <output_neurons> - 1]
86
- hidden_size: !ref <dec_neurons>
87
- num_layers: 1
88
- re_init: True
89
-
90
- # For MTL with LM over the decoder
91
- dec_lin: !new:speechbrain.nnet.linear.Linear
92
- input_size: !ref <dec_neurons>
93
- n_neurons: !ref <joint_dim>
94
- bias: False
95
-
96
- Tjoint: !new:speechbrain.nnet.transducer.transducer_joint.Transducer_joint
97
- joint: sum # joint [sum | concat]
98
- nonlinearity: !ref <activation>
99
-
100
- transducer_lin: !new:speechbrain.nnet.linear.Linear
101
- input_size: !ref <joint_dim>
102
- n_neurons: !ref <output_neurons>
103
- bias: False
104
 
105
  log_softmax: !new:speechbrain.nnet.activations.Softmax
106
  apply_log: True
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  asr_model: !new:torch.nn.ModuleList
109
- - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <transducer_lin>]
110
-
111
 
112
 
113
  tokenizer: !new:sentencepiece.SentencePieceProcessor
@@ -116,21 +99,24 @@ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
116
  input_shape: [null, null, !ref <n_mels>]
117
  compute_features: !ref <compute_features>
118
  normalize: !ref <normalizer>
119
- model: !ref <enc>
120
-
121
- decoder: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
122
- decode_network_lst: [!ref <emb>, !ref <dec>]
123
- tjoint: !ref <Tjoint>
124
- classifier_network: [!ref <transducer_lin>]
125
- blank_id: !ref <blank_index>
126
- beam_size: !ref <beam_size>
127
- nbest: !ref <nbest>
128
- state_beam: !ref <state_beam>
129
- expand_beam: !ref <expand_beam>
 
 
130
 
131
  modules:
132
  normalizer: !ref <normalizer>
133
  encoder: !ref <encoder>
 
134
  decoder: !ref <decoder>
135
 
136
  pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
 
1
  # ################################
2
+ # Model: Transformer ASR
3
  # Augmentation: SpecAugment
4
  # Authors: Pooneh Mousavi 2023
5
  # ################################
 
9
  n_mels: 80
10
 
11
  # Model parameters
12
+ # Transformer
13
+ d_model: 768
14
+ nhead: 8
15
+ num_encoder_layers: 12
16
+ num_decoder_layers: 6
17
+ d_ffn: 3072
18
+ transformer_dropout: 0.0
19
+ activation: !name:torch.nn.GELU
20
+ output_neurons: 500
 
 
 
 
 
 
21
 
22
  # Outputs
 
 
 
 
23
  blank_index: 0
24
+ label_smoothing: 0.1
25
+ pad_index: 0
26
+ bos_index: 1
27
+ eos_index: 2
28
 
29
+ # Decoding parameters
30
  min_decode_ratio: 0.0
31
  max_decode_ratio: 1.0
32
+ valid_search_interval: 5
33
+ valid_beam_size: 10
34
+ test_beam_size: 80
35
+ ctc_weight_decode: 0.3
36
+ scorer_beam_scale: 0.3
37
+ transformer_beam_search: True
 
 
38
 
39
  normalizer: !new:speechbrain.processing.features.InputNormalization
40
  norm_type: global
 
44
  n_fft: !ref <n_fft>
45
  n_mels: !ref <n_mels>
46
 
47
+ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
48
+ input_shape: (8, 10, 80)
49
+ num_blocks: 3
50
+ num_layers_per_block: 1
51
+ out_channels: (128, 200, 256)
52
+ kernel_sizes: (3, 3, 1)
53
+ strides: (2, 2, 1)
54
+ residuals: (False, False, False)
55
+
56
+ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
57
+ input_size: 5120
58
+ tgt_vocab: !ref <output_neurons>
59
+ d_model: !ref <d_model>
60
+ nhead: !ref <nhead>
61
+ num_encoder_layers: !ref <num_encoder_layers>
62
+ num_decoder_layers: !ref <num_decoder_layers>
63
+ d_ffn: !ref <d_ffn>
64
+ dropout: !ref <transformer_dropout>
65
  activation: !ref <activation>
66
+ normalize_before: False
67
+
68
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
69
+ input_size: !ref <d_model>
70
+ n_neurons: !ref <output_neurons>
71
+
72
+ seq_lin: !new:speechbrain.nnet.linear.Linear
73
+ input_size: !ref <d_model>
74
+ n_neurons: !ref <output_neurons>
75
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  log_softmax: !new:speechbrain.nnet.activations.Softmax
78
  apply_log: True
79
+
80
+ # Scorer
81
+ ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
82
+ eos_index: !ref <eos_index>
83
+ blank_index: !ref <blank_index>
84
+ ctc_fc: !ref <ctc_lin>
85
+
86
+ scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
87
+ full_scorers: [!ref <ctc_scorer>]
88
+ weights:
89
+ ctc: !ref <ctc_weight_decode>
90
+ scorer_beam_scale: !ref <scorer_beam_scale>
91
 
92
  asr_model: !new:torch.nn.ModuleList
93
+ - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
 
94
 
95
 
96
  tokenizer: !new:sentencepiece.SentencePieceProcessor
 
99
  input_shape: [null, null, !ref <n_mels>]
100
  compute_features: !ref <compute_features>
101
  normalize: !ref <normalizer>
102
+ CNN: !ref <CNN>
103
+
104
+
105
+ decoder: !new:speechbrain.decoders.S2STransformerBeamSearcher
106
+ modules: [!ref <Transformer>, !ref <seq_lin>]
107
+ bos_index: !ref <bos_index>
108
+ eos_index: !ref <eos_index>
109
+ min_decode_ratio: !ref <min_decode_ratio>
110
+ max_decode_ratio: !ref <max_decode_ratio>
111
+ beam_size: !ref <test_beam_size>
112
+ temperature: 1.15
113
+ using_eos_threshold: True
114
+ scorer: !ref <scorer>
115
 
116
  modules:
117
  normalizer: !ref <normalizer>
118
  encoder: !ref <encoder>
119
+ transformer: !ref <Transformer>
120
  decoder: !ref <decoder>
121
 
122
  pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer