speechbrain
/

asr-transformer-commonvoice-14-fr

@@ -1,5 +1,5 @@
 # ################################
-# Model: Transducer ASR
 # Augmentation: SpecAugment
 # Authors: Pooneh Mousavi 2023
 # ################################
@@ -9,41 +9,32 @@ n_fft: 400
 n_mels: 80
 # Model parameters
-activation: !name:torch.nn.LeakyReLU
-dropout: 0.15
-cnn_blocks: 3
-cnn_channels: (128, 200, 256)
-inter_layer_pooling_size: (2, 2, 2)
-cnn_kernelsize: (3, 3)
-time_pooling_size: 4
-rnn_class: !name:speechbrain.nnet.RNN.LSTM
-rnn_layers: 5
-rnn_neurons: 1024
-rnn_bidirectional: True
-dnn_blocks: 2
-dnn_neurons: 1024
-dec_neurons: 1024
-joint_dim: 1024
 # Outputs
-output_neurons: 1000  # BPE size, index(blank/eos/bos) = 0
-# transducer_beam_search : True
-# Decoding parameters
-# Make sure that the bos and eos indices match the BPE ones
 blank_index: 0
-bos_index: 0
-eos_index: 0
 min_decode_ratio: 0.0
 max_decode_ratio: 1.0
-beam_size: 4
-nbest: 1
-# By default, {state,expand}_beam = 2.3, as mentioned in the paper:
-# https://arxiv.org/abs/1904.02619
-state_beam: 2.3
-expand_beam: 2.3
-transducer_beam_search: True
 normalizer: !new:speechbrain.processing.features.InputNormalization
     norm_type: global
@@ -53,61 +44,53 @@ compute_features: !new:speechbrain.lobes.features.Fbank
     n_fft: !ref <n_fft>
     n_mels: !ref <n_mels>
-enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
-    input_shape: [null, null, !ref <n_mels>]
     activation: !ref <activation>
-    dropout: !ref <dropout>
-    cnn_blocks: !ref <cnn_blocks>
-    cnn_channels: !ref <cnn_channels>
-    cnn_kernelsize: !ref <cnn_kernelsize>
-    inter_layer_pooling_size: !ref <inter_layer_pooling_size>
-    time_pooling: True
-    using_2d_pooling: False
-    time_pooling_size: !ref <time_pooling_size>
-    rnn_class: !ref <rnn_class>
-    rnn_layers: !ref <rnn_layers>
-    rnn_neurons: !ref <rnn_neurons>
-    rnn_bidirectional: !ref <rnn_bidirectional>
-    rnn_re_init: True
-    dnn_blocks: !ref <dnn_blocks>
-    dnn_neurons: !ref <dnn_neurons>
-enc_lin: !new:speechbrain.nnet.linear.Linear
-   input_size: !ref <dnn_neurons>
-   n_neurons: !ref <joint_dim>
-emb: !new:speechbrain.nnet.embedding.Embedding
-    num_embeddings: !ref <output_neurons>
-    consider_as_one_hot: True
-    blank_id: !ref <blank_index>
-dec: !new:speechbrain.nnet.RNN.GRU
-   input_shape: [null, null, !ref <output_neurons> - 1]
-   hidden_size: !ref <dec_neurons>
-   num_layers: 1
-   re_init: True
-# For MTL with LM over the decoder
-dec_lin: !new:speechbrain.nnet.linear.Linear
-   input_size: !ref <dec_neurons>
-   n_neurons: !ref <joint_dim>
-   bias: False
-Tjoint: !new:speechbrain.nnet.transducer.transducer_joint.Transducer_joint
-   joint: sum # joint [sum | concat]
-   nonlinearity: !ref <activation>
-transducer_lin: !new:speechbrain.nnet.linear.Linear
-   input_size: !ref <joint_dim>
-   n_neurons: !ref <output_neurons>
-   bias: False
 log_softmax: !new:speechbrain.nnet.activations.Softmax
     apply_log: True
 asr_model: !new:torch.nn.ModuleList
-   - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <transducer_lin>]
 tokenizer: !new:sentencepiece.SentencePieceProcessor
@@ -116,21 +99,24 @@ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
     input_shape: [null, null, !ref <n_mels>]
     compute_features: !ref <compute_features>
     normalize: !ref <normalizer>
-    model: !ref <enc>
-decoder: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
-   decode_network_lst: [!ref <emb>, !ref <dec>]
-   tjoint: !ref <Tjoint>
-   classifier_network: [!ref <transducer_lin>]
-   blank_id: !ref <blank_index>
-   beam_size: !ref <beam_size>
-   nbest: !ref <nbest>
-   state_beam: !ref <state_beam>
-   expand_beam: !ref <expand_beam>
 modules:
     normalizer: !ref <normalizer>
     encoder: !ref <encoder>
     decoder: !ref <decoder>
 pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer

 # ################################
+# Model: Transformer ASR
 # Augmentation: SpecAugment
 # Authors: Pooneh Mousavi 2023
 # ################################
 n_mels: 80
 # Model parameters
+# Transformer
+d_model: 768
+nhead: 8
+num_encoder_layers: 12
+num_decoder_layers: 6
+d_ffn: 3072
+transformer_dropout: 0.0
+activation: !name:torch.nn.GELU
+output_neurons: 500
 # Outputs
 blank_index: 0
+label_smoothing: 0.1
+pad_index: 0
+bos_index: 1
+eos_index: 2
+# Decoding parameters
 min_decode_ratio: 0.0
 max_decode_ratio: 1.0
+valid_search_interval: 5
+valid_beam_size: 10
+test_beam_size: 80
+ctc_weight_decode: 0.3
+scorer_beam_scale: 0.3
+transformer_beam_search: True
 normalizer: !new:speechbrain.processing.features.InputNormalization
     norm_type: global
     n_fft: !ref <n_fft>
     n_mels: !ref <n_mels>
+CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
+    input_shape: (8, 10, 80)
+    num_blocks: 3
+    num_layers_per_block: 1
+    out_channels: (128, 200, 256)
+    kernel_sizes: (3, 3, 1)
+    strides: (2, 2, 1)
+    residuals: (False, False, False)
+Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
+    input_size: 5120
+    tgt_vocab: !ref <output_neurons>
+    d_model: !ref <d_model>
+    nhead: !ref <nhead>
+    num_encoder_layers: !ref <num_encoder_layers>
+    num_decoder_layers: !ref <num_decoder_layers>
+    d_ffn: !ref <d_ffn>
+    dropout: !ref <transformer_dropout>
     activation: !ref <activation>
+    normalize_before: False
+ctc_lin: !new:speechbrain.nnet.linear.Linear
+    input_size: !ref <d_model>
+    n_neurons: !ref <output_neurons>
+seq_lin: !new:speechbrain.nnet.linear.Linear
+    input_size: !ref <d_model>
+    n_neurons: !ref <output_neurons>
 log_softmax: !new:speechbrain.nnet.activations.Softmax
     apply_log: True
+# Scorer
+ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
+    eos_index: !ref <eos_index>
+    blank_index: !ref <blank_index>
+    ctc_fc: !ref <ctc_lin>
+scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
+    full_scorers: [!ref <ctc_scorer>]
+    weights:
+        ctc: !ref <ctc_weight_decode>
+    scorer_beam_scale: !ref <scorer_beam_scale>
 asr_model: !new:torch.nn.ModuleList
+    - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
 tokenizer: !new:sentencepiece.SentencePieceProcessor
     input_shape: [null, null, !ref <n_mels>]
     compute_features: !ref <compute_features>
     normalize: !ref <normalizer>
+    CNN: !ref <CNN>
+decoder: !new:speechbrain.decoders.S2STransformerBeamSearcher
+    modules: [!ref <Transformer>, !ref <seq_lin>]
+    bos_index: !ref <bos_index>
+    eos_index: !ref <eos_index>
+    min_decode_ratio: !ref <min_decode_ratio>
+    max_decode_ratio: !ref <max_decode_ratio>
+    beam_size: !ref <test_beam_size>
+    temperature: 1.15
+    using_eos_threshold: True
+    scorer: !ref <scorer>
 modules:
     normalizer: !ref <normalizer>
     encoder: !ref <encoder>
+    transformer: !ref <Transformer>
     decoder: !ref <decoder>
 pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer