F0_path: "weights/jdc.bin"
ASR_config: "Utils/ASR/config.yml"
ASR_path: "weights/asr.bin"


model_params_multi:
  multispeaker: true

  dim_in: 64 
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80

  n_token: 181 # number of phoneme tokens
  max_dur: 50 # maximum duration of a single phoneme
  style_dim: 128 # style vector size
  
  dropout: 0.2

  # config for decoder
  decoder: 
      type: 'hifigan' # either hifigan or istftnet
      resblock_kernel_sizes: [3,7,11]
      upsample_rates :  [10,5,3,2]
      upsample_initial_channel: 512
      resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
      upsample_kernel_sizes: [20,10,6,4]
      
  # speech language model config
  slm:
      model: ''
      sr: 16000 # sampling rate of SLM
      hidden: 768 # hidden size of SLM
      nlayers: 13 # number of layers of SLM
      initial_channel: 64 # initial channels of SLM discriminator head
  
  # style diffusion model config
  diffusion:
    embedding_mask_proba: 0.1
    # transformer config
    transformer:
      num_layers: 3
      num_heads: 8
      head_features: 64
      multiplier: 2

    # diffusion distribution config
    dist:
      sigma_data: 0.19988229232390187 # placeholder for estimate_sigma_data set to false
      estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
      mean: -3.0
      std: 1.0
  
model_params_single:
  multispeaker: false

  dim_in: 64 
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80

  n_token: 181 # number of phoneme tokens
  max_dur: 50 # maximum duration of a single phoneme
  style_dim: 128 # style vector size
  
  dropout: 0.2

  # config for decoder
  decoder: 
      type: 'istftnet' # either hifigan or istftnet
      resblock_kernel_sizes: [3,7,11]
      upsample_rates :  [10, 6]
      upsample_initial_channel: 512
      resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
      upsample_kernel_sizes: [20, 12]
      gen_istft_n_fft: 20
      gen_istft_hop_size: 5
      
  # speech language model config
  slm:
      model: 'openai/whisper-medium'
      sr: 16000 # sampling rate of SLM
      hidden: 768 # hidden size of SLM
      nlayers: 13 # number of layers of SLM
      initial_channel: 64 # initial channels of SLM discriminator head
  
  # style diffusion model config
  diffusion:
    embedding_mask_proba: 0.1
    # transformer config
    transformer:
      num_layers: 3
      num_heads: 8
      head_features: 64
      multiplier: 2

    # diffusion distribution config
    dist:
      sigma_data: 0.18 # placeholder for estimate_sigma_data set to false
      estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
      mean: -3.0
      std: 1.0