File size: 4,134 Bytes
b0587d0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
# ############################################################################
# Model: E2E ASR with attention-based ASR
# Encoder: CRDNN model
# Decoder: GRU + beamsearch
# Tokens: BPE with unigram
# Losses: CTC + NLL
# Training: Switchboard
# Authors: Ju-Chieh Chou, Mirco Ravanelli, Abdel Heba, Peter Plantinga,
# Samuele Cornell 2020, Dominik Wagner 2022
# ############################################################################
# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 40
# Model parameters
activation: !name:torch.nn.LeakyReLU
dropout: 0.15
cnn_blocks: 2
cnn_channels: (128, 256)
inter_layer_pooling_size: (2, 2)
cnn_kernelsize: (3, 3)
time_pooling_size: 4
rnn_class: !name:speechbrain.nnet.RNN.LSTM
rnn_layers: 4
rnn_neurons: 1024
rnn_bidirectional: True
dnn_blocks: 2
dnn_neurons: 512
emb_size: 128
dec_neurons: 1024
output_neurons: 2000 # Number of tokens used for tokenizer
blank_index: 0
bos_index: 1
eos_index: 2
# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_beam_size: 60
test_beam_size: 80
eos_threshold: 1.5
using_max_attn_shift: True
max_attn_shift: 240
ctc_weight_decode: 0.3
coverage_penalty: 1.8
temperature: 1.25
normalizer: !new:speechbrain.processing.features.InputNormalization
norm_type: global
compute_features: !new:speechbrain.lobes.features.Fbank
sample_rate: !ref <sample_rate>
n_fft: !ref <n_fft>
n_mels: !ref <n_mels>
enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
input_shape: [null, null, !ref <n_mels>]
activation: !ref <activation>
dropout: !ref <dropout>
cnn_blocks: !ref <cnn_blocks>
cnn_channels: !ref <cnn_channels>
cnn_kernelsize: !ref <cnn_kernelsize>
inter_layer_pooling_size: !ref <inter_layer_pooling_size>
time_pooling: True
using_2d_pooling: False
time_pooling_size: !ref <time_pooling_size>
rnn_class: !ref <rnn_class>
rnn_layers: !ref <rnn_layers>
rnn_neurons: !ref <rnn_neurons>
rnn_bidirectional: !ref <rnn_bidirectional>
rnn_re_init: True
dnn_blocks: !ref <dnn_blocks>
dnn_neurons: !ref <dnn_neurons>
use_rnnp: False
emb: !new:speechbrain.nnet.embedding.Embedding
num_embeddings: !ref <output_neurons>
embedding_dim: !ref <emb_size>
dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
enc_dim: !ref <dnn_neurons>
input_size: !ref <emb_size>
rnn_type: gru
attn_type: location
hidden_size: !ref <dec_neurons>
attn_dim: 1024
num_layers: 1
scaling: 1.0
channels: 10
kernel_size: 100
re_init: True
dropout: !ref <dropout>
ctc_lin: !new:speechbrain.nnet.linear.Linear
input_size: !ref <dnn_neurons>
n_neurons: !ref <output_neurons>
seq_lin: !new:speechbrain.nnet.linear.Linear
input_size: !ref <dec_neurons>
n_neurons: !ref <output_neurons>
log_softmax: !new:speechbrain.nnet.activations.Softmax
apply_log: True
tokenizer: !new:sentencepiece.SentencePieceProcessor
asr_model: !new:torch.nn.ModuleList
- [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>]
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
input_shape: [null, null, !ref <n_mels>]
compute_features: !ref <compute_features>
normalize: !ref <normalizer>
model: !ref <enc>
decoder: !new:speechbrain.decoders.S2SRNNBeamSearcher
embedding: !ref <emb>
decoder: !ref <dec>
linear: !ref <seq_lin>
ctc_linear: !ref <ctc_lin>
bos_index: !ref <bos_index>
eos_index: !ref <eos_index>
blank_index: !ref <blank_index>
min_decode_ratio: !ref <min_decode_ratio>
max_decode_ratio: !ref <max_decode_ratio>
beam_size: !ref <test_beam_size>
eos_threshold: !ref <eos_threshold>
using_max_attn_shift: !ref <using_max_attn_shift>
max_attn_shift: !ref <max_attn_shift>
coverage_penalty: !ref <coverage_penalty>
ctc_weight: !ref <ctc_weight_decode>
temperature: !ref <temperature>
modules:
normalizer: !ref <normalizer>
encoder: !ref <encoder>
decoder: !ref <decoder>
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
loadables:
normalizer: !ref <normalizer>
asr: !ref <asr_model>
tokenizer: !ref <tokenizer>
|