# ################################
# Model: Fastspeech2 for TTS
# Authors: Sathvik Udupa, Yingzhi Wang, Pradnya Kandarkar
# ################################
#
# NOTE: the `!new:` and `!ref` tags used below are custom tags that must be
# resolved by the consuming loader (presumably HyperPyYAML, given the
# `speechbrain.*` class paths); a plain safe YAML loader will reject them.

# Input parameters
# Phoneme inventory. The order of the entries is kept exactly as provided.
# 'N' and 'Y' are quoted on purpose: they are boolean spellings in the
# YAML 1.1 type repository and must always be read as strings.
lexicon:
  - AA
  - AE
  - AH
  - AO
  - AW
  - AY
  - B
  - CH
  - D
  - DH
  - EH
  - ER
  - EY
  - F
  - G
  - HH
  - IH
  - IY
  - JH
  - K
  - L
  - M
  - 'N'
  - NG
  - OW
  - OY
  - P
  - R
  - S
  - SH
  - T
  - TH
  - UH
  - UW
  - V
  - W
  - 'Y'
  - Z
  - ZH
  - spn

# Fixed value: depends on the number of symbols in the lexicon (40),
# +1 for a dummy symbol used for padding.
n_symbols: 41
padding_idx: 0
n_mel_channels: 80

# Encoder parameters
enc_num_layers: 4
enc_num_head: 2
enc_d_model: 256
enc_ffn_dim: 1024
enc_k_dim: 256
enc_v_dim: 256
enc_dropout: 0.2

# Decoder parameters
dec_num_layers: 4
dec_num_head: 2
dec_d_model: 256
dec_ffn_dim: 1024
dec_k_dim: 256
dec_v_dim: 256
dec_dropout: 0.2

# Postnet parameters
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
postnet_dropout: 0.5

# Common
normalize_before: true
ffn_type: 1dcnn  # 1dcnn or ffn
ffn_cnn_kernel_size_list: [9, 1]

# Variance predictor
dur_pred_kernel_size: 3
pitch_pred_kernel_size: 3
energy_pred_kernel_size: 3
variance_predictor_dropout: 0.5

# Model
# Every constructor argument is a reference to one of the top-level values
# defined above, so those values are the single place to change a setting.
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2
  enc_num_layers: !ref <enc_num_layers>
  enc_num_head: !ref <enc_num_head>
  enc_d_model: !ref <enc_d_model>
  enc_ffn_dim: !ref <enc_ffn_dim>
  enc_k_dim: !ref <enc_k_dim>
  enc_v_dim: !ref <enc_v_dim>
  enc_dropout: !ref <enc_dropout>
  dec_num_layers: !ref <dec_num_layers>
  dec_num_head: !ref <dec_num_head>
  dec_d_model: !ref <dec_d_model>
  dec_ffn_dim: !ref <dec_ffn_dim>
  dec_k_dim: !ref <dec_k_dim>
  dec_v_dim: !ref <dec_v_dim>
  dec_dropout: !ref <dec_dropout>
  normalize_before: !ref <normalize_before>
  ffn_type: !ref <ffn_type>
  ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
  n_char: !ref <n_symbols>
  n_mels: !ref <n_mel_channels>
  postnet_embedding_dim: !ref <postnet_embedding_dim>
  postnet_kernel_size: !ref <postnet_kernel_size>
  postnet_n_convolutions: !ref <postnet_n_convolutions>
  postnet_dropout: !ref <postnet_dropout>
  padding_idx: !ref <padding_idx>
  dur_pred_kernel_size: !ref <dur_pred_kernel_size>
  pitch_pred_kernel_size: !ref <pitch_pred_kernel_size>
  energy_pred_kernel_size: !ref <energy_pred_kernel_size>
  variance_predictor_dropout: !ref <variance_predictor_dropout>

# Text encoder instance, constructed with no arguments.
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder

# Both entries below refer to the same `model` object defined above.
modules:
  model: !ref <model>

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    model: !ref <model>