# Locations of auxiliary model weights and the ASR config file.
F0_path: "weights/jdc.bin"
ASR_config: "Utils/ASR/config.yml"
ASR_path: "weights/asr.bin"

# Model hyperparameters for the multi-speaker variant (HiFi-GAN decoder).
model_params_multi:
  multispeaker: true

  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80

  n_token: 181  # number of phoneme tokens
  max_dur: 50  # maximum duration of a single phoneme
  style_dim: 128  # style vector size

  dropout: 0.2

  # Decoder (vocoder) configuration.
  decoder:
    type: 'hifigan'  # either hifigan or istftnet
    resblock_kernel_sizes: [3, 7, 11]
    # Product of the rates is 10 * 5 * 3 * 2 = 300.
    upsample_rates: [10, 5, 3, 2]
    upsample_initial_channel: 512
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    upsample_kernel_sizes: [20, 10, 6, 4]

  # Speech language model (SLM) configuration.
  slm:
    # NOTE(review): the model id is empty here -- presumably it has to be
    # filled in before the SLM can be loaded; confirm against the consumer.
    model: ''
    sr: 16000  # sampling rate of SLM
    hidden: 768  # hidden size of SLM
    nlayers: 13  # number of layers of SLM
    initial_channel: 64  # initial channels of SLM discriminator head

  # Style diffusion model configuration.
  diffusion:
    embedding_mask_proba: 0.1

    # Transformer configuration.
    transformer:
      num_layers: 3
      num_heads: 8
      head_features: 64
      multiplier: 2

    # Diffusion distribution configuration.
    dist:
      # Placeholder value, used when estimate_sigma_data is set to false.
      sigma_data: 0.19988229232390187
      # Estimate sigma_data from the current batch if set to true.
      estimate_sigma_data: true
      mean: -3.0
      std: 1.0

# Model hyperparameters for the single-speaker variant (iSTFTNet decoder).
model_params_single:
  multispeaker: false

  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80

  n_token: 181  # number of phoneme tokens
  max_dur: 50  # maximum duration of a single phoneme
  style_dim: 128  # style vector size

  dropout: 0.2

  # Decoder (vocoder) configuration.
  decoder:
    type: 'istftnet'  # either hifigan or istftnet
    resblock_kernel_sizes: [3, 7, 11]
    # 10 * 6 = 60; times gen_istft_hop_size (5) this is 300, the same total
    # as the hifigan rates in model_params_multi.
    upsample_rates: [10, 6]
    upsample_initial_channel: 512
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    upsample_kernel_sizes: [20, 12]
    gen_istft_n_fft: 20
    gen_istft_hop_size: 5

  # Speech language model (SLM) configuration.
  slm:
    model: 'openai/whisper-medium'
    sr: 16000  # sampling rate of SLM
    # NOTE(review): hidden / nlayers presumably have to match the model named
    # above; confirm that 768 / 13 is right for 'openai/whisper-medium'.
    hidden: 768  # hidden size of SLM
    nlayers: 13  # number of layers of SLM
    initial_channel: 64  # initial channels of SLM discriminator head

  # Style diffusion model configuration.
  diffusion:
    embedding_mask_proba: 0.1

    # Transformer configuration.
    transformer:
      num_layers: 3
      num_heads: 8
      head_features: 64
      multiplier: 2

    # Diffusion distribution configuration.
    dist:
      # Placeholder value, used when estimate_sigma_data is set to false.
      sigma_data: 0.18
      # Estimate sigma_data from the current batch if set to true.
      estimate_sigma_data: true
      mean: -3.0
      std: 1.0