# ############################################################################
# Model: WavLM large (loaded through the HuggingFace wav2vec2 wrapper)
#        for Emotion Recognition
# ############################################################################
#
# HyperPyYAML inference configuration. `!new:` instantiates the named class
# with the nested keys as constructor arguments; `!ref <key>` points at
# another top-level entry of this file.
#
# NOTE(review): this file was recovered from a copy collapsed onto a single
# line in which every `!ref` had lost its `<key>` target. The targets below
# were rebuilt from the neighbouring key names; confirm against the
# published checkpoint before relying on them.

# Hparams NEEDED
HPARAMS_NEEDED: ["out_n_neurons", "label_encoder"]
# Modules Needed
MODULES_NEEDED: ["transf", "avg_pool", "enc", "classifier"]

# Feature parameters
wavlm_hub: "microsoft/wavlm-large"

# Pretrain folder (HuggingFace)
pretrained_path: "mtauro/wavlm_vrs_ck_iva_k492"

# parameters
#encoder_dim: 768
out_n_neurons: 2

# Pretrained front-end, fetched from `wavlm_hub` and cached under `save_path`.
transf: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
    source: !ref <wavlm_hub>
    save_path: wavlm_checkpoint

# Statistics pooling with the standard-deviation output disabled.
avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
    return_std: False

enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN
    input_shape: [null, null, 1024]
    dnn_blocks: 1
    dnn_neurons: 1024  # this will be output size of 3rd dimension

classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
    input_size: 1024  # 192 for ecapa, double number of channels
    out_neurons: !ref <out_n_neurons>

softmax: !new:speechbrain.nnet.activations.Softmax

# NOTE(review): the original list held three `!ref` entries whose targets
# were lost. transf / enc / classifier is assumed here (avg_pool looks
# parameter-free) -- TODO confirm.
model: !new:torch.nn.ModuleList
    - [!ref <transf>, !ref <enc>, !ref <classifier>]

modules:
    transf: !ref <transf>
    avg_pool: !ref <avg_pool>
    enc: !ref <enc>
    classifier: !ref <classifier>

label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder

# Each loadable is restored from the file of the same name under
# `pretrained_path`.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        transf: !ref <transf>
        avg_pool: !ref <avg_pool>
        enc: !ref <enc>
        classifier: !ref <classifier>
        label_encoder: !ref <label_encoder>
    paths:
        transf: !ref <pretrained_path>/transf.ckpt
        avg_pool: !ref <pretrained_path>/avg_pool.ckpt
        enc: !ref <pretrained_path>/enc.ckpt
        classifier: !ref <pretrained_path>/classifier.ckpt
        label_encoder: !ref <pretrained_path>/label_encoder.txt