# File size: 2,747 Bytes
# Revision: d8714d0
# ################################
# Model: Fastspeech2 for TTS
# Authors: Sathvik Udupa, Yingzhi Wang, Pradnya Kandarkar
# ################################
# Input parameters
# Phoneme symbols accepted as input. Every entry is quoted so that it always
# loads as a string: bare `N` and `Y` are boolean literals under the YAML 1.1
# bool type and would be mis-typed by parsers that implement it fully.
lexicon:
    - 'AA'
    - 'AE'
    - 'AH'
    - 'AO'
    - 'AW'
    - 'AY'
    - 'B'
    - 'CH'
    - 'D'
    - 'DH'
    - 'EH'
    - 'ER'
    - 'EY'
    - 'F'
    - 'G'
    - 'HH'
    - 'IH'
    - 'IY'
    - 'JH'
    - 'K'
    - 'L'
    - 'M'
    - 'N'
    - 'NG'
    - 'OW'
    - 'OY'
    - 'P'
    - 'R'
    - 'S'
    - 'SH'
    - 'T'
    - 'TH'
    - 'UH'
    - 'UW'
    - 'V'
    - 'W'
    - 'Y'
    - 'Z'
    - 'ZH'
    - 'spn'
# Fixed: the 40 symbols in the lexicon above + 1 dummy symbol used for padding.
# Must be updated by hand if the lexicon changes.
n_symbols: 41
# Passed to the model as `padding_idx`.
padding_idx: 0
# Passed to the model as `n_mels`.
n_mel_channels: 80
# Encoder parameters
# Each value is referenced by the `model` entry of this file through
# `!ref <name>` and handed to the FastSpeech2 constructor under the same
# keyword name.
enc_num_layers: 4
enc_num_head: 2
enc_d_model: 256
enc_ffn_dim: 1024
enc_k_dim: 256
enc_v_dim: 256
enc_dropout: 0.2
# Decoder parameters
# Same set of settings as the encoder, with identical values; referenced by
# the `model` entry of this file through `!ref <name>` under the same keyword
# names.
dec_num_layers: 4
dec_num_head: 2
dec_d_model: 256
dec_ffn_dim: 1024
dec_k_dim: 256
dec_v_dim: 256
dec_dropout: 0.2
# Postnet parameters
# Referenced by the `model` entry of this file through `!ref <name>` under the
# same keyword names.
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
postnet_dropout: 0.5
# Common
# Settings not tied to a single encoder/decoder stanza; each is referenced by
# the `model` entry of this file under the same keyword name.
normalize_before: true
# Accepted values: 1dcnn or ffn.
ffn_type: 1dcnn
# NOTE(review): presumably only used when ffn_type is 1dcnn - confirm against
# the FastSpeech2 implementation.
ffn_cnn_kernel_size_list: [9, 1]
# Variance predictor
# Kernel sizes for the duration, pitch and energy predictors plus one dropout
# value; all are referenced by the `model` entry of this file through
# `!ref <name>` under the same keyword names.
dur_pred_kernel_size: 3
pitch_pred_kernel_size: 3
energy_pred_kernel_size: 3
variance_predictor_dropout: 0.5
# Model
# Instantiates speechbrain.lobes.models.FastSpeech2.FastSpeech2. The entries
# nested under the `!new:` tag are the constructor keyword arguments; each
# `!ref <name>` resolves to the top-level key of that name in this file.
# They must stay indented under `model:`: at top level they would duplicate
# the hyperparameter keys and the constructor would receive no arguments.
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2
    # Encoder
    enc_num_layers: !ref <enc_num_layers>
    enc_num_head: !ref <enc_num_head>
    enc_d_model: !ref <enc_d_model>
    enc_ffn_dim: !ref <enc_ffn_dim>
    enc_k_dim: !ref <enc_k_dim>
    enc_v_dim: !ref <enc_v_dim>
    enc_dropout: !ref <enc_dropout>
    # Decoder
    dec_num_layers: !ref <dec_num_layers>
    dec_num_head: !ref <dec_num_head>
    dec_d_model: !ref <dec_d_model>
    dec_ffn_dim: !ref <dec_ffn_dim>
    dec_k_dim: !ref <dec_k_dim>
    dec_v_dim: !ref <dec_v_dim>
    dec_dropout: !ref <dec_dropout>
    # Common
    normalize_before: !ref <normalize_before>
    ffn_type: !ref <ffn_type>
    ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
    # Input/output sizes: the keyword names differ from the hyperparameter
    # names here (n_char <- n_symbols, n_mels <- n_mel_channels).
    n_char: !ref <n_symbols>
    n_mels: !ref <n_mel_channels>
    # Postnet
    postnet_embedding_dim: !ref <postnet_embedding_dim>
    postnet_kernel_size: !ref <postnet_kernel_size>
    postnet_n_convolutions: !ref <postnet_n_convolutions>
    postnet_dropout: !ref <postnet_dropout>
    padding_idx: !ref <padding_idx>
    # Variance predictor
    dur_pred_kernel_size: !ref <dur_pred_kernel_size>
    pitch_pred_kernel_size: !ref <pitch_pred_kernel_size>
    energy_pred_kernel_size: !ref <energy_pred_kernel_size>
    variance_predictor_dropout: !ref <variance_predictor_dropout>
# Text encoder object; no constructor arguments are given here, so it is
# created with its defaults.
# NOTE(review): presumably maps the lexicon symbols to integer indices at
# inference time - confirm against the code that consumes this file.
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder
# Mapping of module names to objects; exposes the FastSpeech2 instance defined
# by the top-level `model` key under the name `model`. The entry must be
# nested: a flush-left `model:` would redefine the top-level `model` key.
modules:
    model: !ref <model>
# Pretrainer whose `loadables` mapping registers the FastSpeech2 instance
# under the name `model`.
# NOTE(review): presumably the checkpoint file for `model` is located by the
# code that consumes this file - no path is configured here; confirm.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>