File size: 4,514 Bytes
e8b63e8 9e5941e e8b63e8 9e5941e 0a69b53 da3c41d e8b63e8 da3c41d e8b63e8 bf677fa e8b63e8 9e5941e e8b63e8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
# ############################################################################
# Model: E2E ASR with Transformer
# Encoder: Transformer Encoder
# Decoder: Transformer Decoder + (CTC/ATT joint) beamsearch + TransformerLM
# Tokens: unigram
# losses: CTC + KLdiv (Label Smoothing loss)
# Training: Librispeech 960h
# Authors: Jianyuan Zhong, Titouan Parcollet 2021
# ############################################################################
# Feature parameters
# All three values are passed to `compute_features` via !ref; `n_mels` is
# also used as the last dimension of the `encoder` input_shape.
sample_rate: 16000
n_fft: 400
n_mels: 80
####################### Model parameters ###########################
# Transformer
# Sizes shared by several objects declared further down in this file.
# `d_model` feeds Transformer, ctc_lin and seq_lin.
d_model: 768
nhead: 8
num_encoder_layers: 12
num_decoder_layers: 6
d_ffn: 3072
# Passed to Transformer as `dropout`.
transformer_dropout: 0.0
activation: !name:torch.nn.GELU
# Output size of ctc_lin / seq_lin and the Transformer `tgt_vocab`.
output_neurons: 5000
# NOTE(review): not referenced anywhere in the visible part of this file
# (lm_model hard-codes `vocab: 5000`) — confirm whether external code reads it.
vocab_size: 5000
# Outputs
# blank, pad and unk all share index 0; bos and eos have their own indices.
# Only blank_index, bos_index and eos_index are referenced in the visible part
# of this file (by `decoder`); label_smoothing, pad_index and unk_index are not.
blank_index: 0
label_smoothing: 0.1
pad_index: 0
bos_index: 1
eos_index: 2
unk_index: 0
# Decoding parameters
# `decoder` reads min/max_decode_ratio, test_beam_size, lm_weight and
# ctc_weight_decode via !ref.
min_decode_ratio: 0.0
max_decode_ratio: 1.0
# NOTE(review): valid_search_interval and valid_beam_size are not referenced
# in the visible part of this file — presumably leftovers from the training
# recipe; confirm before removing.
valid_search_interval: 10
valid_beam_size: 10
test_beam_size: 66
lm_weight: 0.60
ctc_weight_decode: 0.52
############################## models ################################
# Convolutional front-end. It is registered as `pre_transformer` in `modules`,
# used as the `cnn` stage of `encoder`, and included in `asr_model`.
# The arguments below must be nested under the key: at column 0 they would be
# parsed as unrelated top-level keys instead of constructor arguments.
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    # NOTE(review): the last dimension (80) equals n_mels — keep them in sync.
    input_shape: (8, 10, 80)
    num_blocks: 3
    num_layers_per_block: 1
    # Each tuple below has three entries, presumably one per block
    # (num_blocks: 3) — confirm against ConvolutionFrontEnd.
    out_channels: (128, 256, 512)
    kernel_sizes: (3, 3, 1)
    strides: (2, 2, 1)
    residuals: (False, False, False)
# ASR Transformer. Architecture sizes come from the top-level hyperparameters
# via !ref. The arguments must be nested: left at column 0, keys such as
# `d_model`, `nhead` and `activation` would collide with the hyperparameters
# of the same name.
Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
    # NOTE(review): presumably 512 CNN output channels x 20 frequency bins
    # (80 mel bins after two stride-2 blocks) — confirm against the CNN output.
    input_size: 10240
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
    normalize_before: False
# Linear layer from d_model to output_neurons; passed to `decoder` as the
# third entry of its `modules` list and included in `asr_model`.
# Arguments are nested so they do not overwrite top-level keys.
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>
# Linear layer from d_model to output_neurons; passed to `decoder` as the
# second entry of its `modules` list and included in `asr_model`.
# Arguments are nested so they do not overwrite top-level keys.
seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>
# Beam-search decoder over the Transformer, combining the seq_lin and ctc_lin
# outputs with the language model declared as `lm_model` later in this file.
# Every argument must be nested under `decoder`; at column 0 keys such as
# `modules`, `bos_index` and `lm_weight` would clash with top-level keys.
decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
    modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    # Always uses the test beam size; valid_beam_size is not wired in here.
    beam_size: !ref <test_beam_size>
    ctc_weight: !ref <ctc_weight_decode>
    lm_weight: !ref <lm_weight>
    lm_modules: !ref <lm_model>
    temperature: 1.15
    temperature_lm: 1.15
    using_eos_threshold: False
    length_normalization: True
# Log-softmax over the last dimension. `dim` is nested so that it is passed
# to the constructor instead of becoming a stray top-level key.
log_softmax: !new:torch.nn.LogSoftmax
    dim: -1
# Input normalization; used as the `normalize` stage of `encoder` and
# registered in `modules`. Arguments are nested under the key.
normalize: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
    update_until_epoch: 4
# Filterbank feature extractor configured from the feature parameters at the
# top of the file. The arguments must be nested: at column 0 they would
# overwrite the top-level sample_rate / n_fft / n_mels with self-references.
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>
# This is the Transformer LM used by the beam-search decoder (`lm_modules`),
# as described in the HuggingFace repository. Visit the HuggingFace model
# corresponding to pretrained_lm_tokenizer_path for more details about it.
# NB: The settings below have to match the pre-trained TransformerLM!
# Language model referenced by `decoder` (lm_modules), `modules` and the
# pretrainer's `lm` loadable. Values are hard-coded rather than !ref'd, and
# nhead (12) differs from the ASR Transformer's nhead (8).
# Arguments are nested so they do not overwrite the ASR hyperparameters of
# the same name (d_model, nhead, num_encoder_layers, ...).
lm_model: !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM # yamllint disable-line rule:line-length
    vocab: 5000
    d_model: 768
    nhead: 12
    num_encoder_layers: 12
    num_decoder_layers: 0
    d_ffn: 3072
    dropout: 0.0
    activation: !name:torch.nn.GELU
    normalize_before: False
# SentencePiece tokenizer, created here without constructor arguments; it is
# listed as the `tokenizer` loadable of the pretrainer.
# NOTE(review): presumably the pretrainer loads its model file — confirm.
tokenizer: !new:sentencepiece.SentencePieceProcessor
# Wrapper around the ASR Transformer, used as the `transformer_encoder` stage
# of `encoder`. The `transformer` argument is nested so it is passed to the
# wrapper instead of becoming a top-level key.
Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper # yamllint disable-line rule:line-length
    transformer: !ref <Transformer>
# Encoding pipeline. Stages are listed in this order:
# compute_features, normalize, cnn, transformer_encoder.
# The stages must be nested under `encoder`; at column 0 `compute_features`
# and `normalize` would redefine the top-level objects of the same name.
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    # The last input dimension is the number of mel bins.
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalize>
    cnn: !ref <CNN>
    transformer_encoder: !ref <Tencoder>
# Models
# Groups the trainable ASR parts into a single ModuleList; this is the object
# the pretrainer loads as `asr`. The list is a positional argument and must be
# indented under the key — a sequence item at column 0 is not valid here.
asr_model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
# Name -> object mapping of the modules declared in this file.
# Entries must be nested: with the body at column 0, `modules` itself is null
# and each entry overwrites the top-level object it is meant to reference.
modules:
    compute_features: !ref <compute_features>
    pre_transformer: !ref <CNN>
    transformer: !ref <Transformer>
    asr_model: !ref <asr_model>
    normalize: !ref <normalize>
    lm_model: !ref <lm_model>
    encoder: !ref <encoder>
    decoder: !ref <decoder>
# The pretrainer maps pretrained files onto the instances declared in this
# YAML file. `loadables` is an argument of the Pretrainer and its entries are
# the name -> instance pairs, so both levels of nesting are required.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        asr: !ref <asr_model>
        lm: !ref <lm_model>
        tokenizer: !ref <tokenizer>
|