File size: 4,134 Bytes
b0587d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# ############################################################################
# Model: E2E ASR with attention-based ASR
# Encoder: CRDNN model
# Decoder: GRU + beamsearch
# Tokens: BPE with unigram
# Losses: CTC + NLL
# Training: Switchboard
# Authors: Ju-Chieh Chou, Mirco Ravanelli, Abdel Heba, Peter Plantinga,
# Samuele Cornell 2020, Dominik Wagner 2022
# ############################################################################

# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 40

# Model parameters
activation: !name:torch.nn.LeakyReLU
dropout: 0.15
cnn_blocks: 2
cnn_channels: (128, 256)
inter_layer_pooling_size: (2, 2)
cnn_kernelsize: (3, 3)
time_pooling_size: 4
rnn_class: !name:speechbrain.nnet.RNN.LSTM
rnn_layers: 4
rnn_neurons: 1024
rnn_bidirectional: True
dnn_blocks: 2
dnn_neurons: 512
emb_size: 128
dec_neurons: 1024
output_neurons: 2000  # Number of tokens used for tokenizer
blank_index: 0
bos_index: 1
eos_index: 2

# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_beam_size: 60
test_beam_size: 80
eos_threshold: 1.5
using_max_attn_shift: True
max_attn_shift: 240
ctc_weight_decode: 0.3
coverage_penalty: 1.8
temperature: 1.25

normalizer: !new:speechbrain.processing.features.InputNormalization
   norm_type: global

compute_features: !new:speechbrain.lobes.features.Fbank
   sample_rate: !ref <sample_rate>
   n_fft: !ref <n_fft>
   n_mels: !ref <n_mels>

enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
   input_shape: [null, null, !ref <n_mels>]
   activation: !ref <activation>
   dropout: !ref <dropout>
   cnn_blocks: !ref <cnn_blocks>
   cnn_channels: !ref <cnn_channels>
   cnn_kernelsize: !ref <cnn_kernelsize>
   inter_layer_pooling_size: !ref <inter_layer_pooling_size>
   time_pooling: True
   using_2d_pooling: False
   time_pooling_size: !ref <time_pooling_size>
   rnn_class: !ref <rnn_class>
   rnn_layers: !ref <rnn_layers>
   rnn_neurons: !ref <rnn_neurons>
   rnn_bidirectional: !ref <rnn_bidirectional>
   rnn_re_init: True
   dnn_blocks: !ref <dnn_blocks>
   dnn_neurons: !ref <dnn_neurons>
   use_rnnp: False

emb: !new:speechbrain.nnet.embedding.Embedding
   num_embeddings: !ref <output_neurons>
   embedding_dim: !ref <emb_size>

dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
   enc_dim: !ref <dnn_neurons>
   input_size: !ref <emb_size>
   rnn_type: gru
   attn_type: location
   hidden_size: !ref <dec_neurons>
   attn_dim: 1024
   num_layers: 1
   scaling: 1.0
   channels: 10
   kernel_size: 100
   re_init: True
   dropout: !ref <dropout>

ctc_lin: !new:speechbrain.nnet.linear.Linear
   input_size: !ref <dnn_neurons>
   n_neurons: !ref <output_neurons>

seq_lin: !new:speechbrain.nnet.linear.Linear
   input_size: !ref <dec_neurons>
   n_neurons: !ref <output_neurons>

log_softmax: !new:speechbrain.nnet.activations.Softmax
   apply_log: True

tokenizer: !new:sentencepiece.SentencePieceProcessor

asr_model: !new:torch.nn.ModuleList
   - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>]

encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalizer>
    model: !ref <enc>

decoder: !new:speechbrain.decoders.S2SRNNBeamSearcher
   embedding: !ref <emb>
   decoder: !ref <dec>
   linear: !ref <seq_lin>
   ctc_linear: !ref <ctc_lin>
   bos_index: !ref <bos_index>
   eos_index: !ref <eos_index>
   blank_index: !ref <blank_index>
   min_decode_ratio: !ref <min_decode_ratio>
   max_decode_ratio: !ref <max_decode_ratio>
   beam_size: !ref <test_beam_size>
   eos_threshold: !ref <eos_threshold>
   using_max_attn_shift: !ref <using_max_attn_shift>
   max_attn_shift: !ref <max_attn_shift>
   coverage_penalty: !ref <coverage_penalty>
   ctc_weight: !ref <ctc_weight_decode>
   temperature: !ref <temperature>

modules:
    normalizer: !ref <normalizer>
    encoder: !ref <encoder>
    decoder: !ref <decoder>

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
   loadables:
      normalizer: !ref <normalizer>
      asr: !ref <asr_model>
      tokenizer: !ref <tokenizer>