# ################################
# Model: Fastspeech2 for TTS
# Authors: Sathvik Udupa, Yingzhi Wang, Pradnya Kandarkar
# ################################

# Notes on the syntax used in this file (HyperPyYAML):
#   !new:<dotted.path>  instantiates that class, passing the nested mapping
#                       as keyword arguments.
#   !ref <key>          substitutes the value of the top-level entry `key`.

# Input parameters
# Symbol inventory, one entry per line.
lexicon:
  - AA
  - AE
  - AH
  - AO
  - AW
  - AY
  - B
  - CH
  - D
  - DH
  - EH
  - ER
  - EY
  - F
  - G
  - HH
  - IH
  - IY
  - JH
  - K
  - L
  - M
  - N
  - NG
  - OW
  - OY
  - P
  - R
  - S
  - SH
  - T
  - TH
  - UH
  - UW
  - V
  - W
  - Y
  - Z
  - ZH
  - spn

# Fixed: depends on the symbols in the lexicon, +1 for a dummy symbol used
# for padding.
# NOTE(review): the lexicon above lists 40 entries; confirm which additional
# symbols the consumer adds to arrive at 42.
n_symbols: 42
padding_idx: 0
n_mel_channels: 80
sample_rate: 22050

# Encoder parameters
enc_num_layers: 4
enc_num_head: 2
enc_d_model: 384
enc_ffn_dim: 1024
enc_k_dim: 384
enc_v_dim: 384
enc_dropout: 0.2

# Decoder parameters
dec_num_layers: 4
dec_num_head: 2
dec_d_model: 384
dec_ffn_dim: 1024
dec_k_dim: 384
dec_v_dim: 384
dec_dropout: 0.2

# Postnet parameters
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
postnet_dropout: 0.5

# Common (shared by the spn_predictor and model entries below)
normalize_before: true
ffn_type: 1dcnn  # 1dcnn or ffn
ffn_cnn_kernel_size_list: [9, 1]

# Variance predictor
dur_pred_kernel_size: 3
pitch_pred_kernel_size: 3
energy_pred_kernel_size: 3
variance_predictor_dropout: 0.5

# Silent phoneme token predictor
# Built from the encoder and common settings above; n_char is fed from
# n_symbols.
spn_predictor: !new:speechbrain.lobes.models.FastSpeech2.SPNPredictor
  enc_num_layers: !ref <enc_num_layers>
  enc_num_head: !ref <enc_num_head>
  enc_d_model: !ref <enc_d_model>
  enc_ffn_dim: !ref <enc_ffn_dim>
  enc_k_dim: !ref <enc_k_dim>
  enc_v_dim: !ref <enc_v_dim>
  enc_dropout: !ref <enc_dropout>
  normalize_before: !ref <normalize_before>
  ffn_type: !ref <ffn_type>
  ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
  n_char: !ref <n_symbols>
  padding_idx: !ref <padding_idx>

# Model
# Built from the encoder, decoder, postnet, common and variance-predictor
# settings above; n_char is fed from n_symbols and n_mels from
# n_mel_channels.
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2
  enc_num_layers: !ref <enc_num_layers>
  enc_num_head: !ref <enc_num_head>
  enc_d_model: !ref <enc_d_model>
  enc_ffn_dim: !ref <enc_ffn_dim>
  enc_k_dim: !ref <enc_k_dim>
  enc_v_dim: !ref <enc_v_dim>
  enc_dropout: !ref <enc_dropout>
  dec_num_layers: !ref <dec_num_layers>
  dec_num_head: !ref <dec_num_head>
  dec_d_model: !ref <dec_d_model>
  dec_ffn_dim: !ref <dec_ffn_dim>
  dec_k_dim: !ref <dec_k_dim>
  dec_v_dim: !ref <dec_v_dim>
  dec_dropout: !ref <dec_dropout>
  normalize_before: !ref <normalize_before>
  ffn_type: !ref <ffn_type>
  ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
  n_char: !ref <n_symbols>
  n_mels: !ref <n_mel_channels>
  postnet_embedding_dim: !ref <postnet_embedding_dim>
  postnet_kernel_size: !ref <postnet_kernel_size>
  postnet_n_convolutions: !ref <postnet_n_convolutions>
  postnet_dropout: !ref <postnet_dropout>
  padding_idx: !ref <padding_idx>
  dur_pred_kernel_size: !ref <dur_pred_kernel_size>
  pitch_pred_kernel_size: !ref <pitch_pred_kernel_size>
  energy_pred_kernel_size: !ref <energy_pred_kernel_size>
  variance_predictor_dropout: !ref <variance_predictor_dropout>

# Text encoder, instantiated with no arguments.
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder

# Objects exposed under `modules`; both point at the instances built above.
modules:
  spn_predictor: !ref <spn_predictor>
  model: !ref <model>

# Pretrainer with the same two objects registered as its loadables.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    spn_predictor: !ref <spn_predictor>
    model: !ref <model>