File size: 2,778 Bytes
e657e39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127

########### model config ###########
# Generator model section: `name` identifies the model and `config` holds its
# settings (presumably forwarded to the model constructor — confirm in the
# training code).
generator:
  name: SoundStream
  config:
    n_filters: 32
    D: 256
    # Commented-out alternative bandwidth sets; the active set is the next line.
    #target_bandwidths: [6,] # [1, 1.5, 2, 4, 6] # [0.5, 1, 1.5, 2, 4, 6]
    target_bandwidths: [0.5, 1, 1.5, 2, 4]
    ratios: [8, 5, 4, 2] # total downsampling factor: 8 * 5 * 4 * 2 = 320
    sample_rate: 16000
    bins: 1024

# Discriminator list. Entries correspond by name to the top-level
# discriminator sections of this file (mfd, mpd, msd); presumably only the
# listed ones are built — confirm in the training code.
# Commented-out alternative enabling all three:
#d_list: ['mpd', 'msd', 'mfd']
d_list:
  - mfd

# Multi-frequency discriminator section; this is the only entry in d_list.
mfd:
  name: MultiFrequencyDiscriminator
  config:
    # hop_lengths and hidden_channels both have six entries — presumably one
    # sub-discriminator per (hop length, channel width) pair; TODO confirm.
    hop_lengths: [32, 64, 128, 256, 512, 1024]
    hidden_channels: [64, 128, 256, 512, 512, 512]
    # NOTE(review): the meaning of "double" is defined by the discriminator
    # implementation and is not visible from this file.
    domain: double
    mel_scale: true
    sample_rate: 16000

# Multi-period discriminator section. 'mpd' is not in the active d_list
# (['mfd']), so this section is presumably unused unless it is added there.
mpd:
  name: MultiPeriodDiscriminator
  config:
    period_sizes: [2, 3, 5, 7, 11]
    period_kernel_size: 5

# Multi-scale discriminator section. 'msd' is not in the active d_list
# (['mfd']), so this section is presumably unused unless it is added there.
msd:
  name: MultiScaleDiscriminator
  config:
    num_scales: 3
    pool_kernel_size: 4
    pool_stride: 2

########### optimizer config ###########
# Optimizer settings, one entry per network: `g` and `d` (presumably the
# generator and the discriminator(s), matching g_criterion / d_criterion).
# Both entries use identical AdamW hyper-parameters.
#
# Learning rates are written with an explicit decimal point (2.0e-4, not
# 2e-4): YAML 1.1 loaders such as PyYAML only resolve exponent notation to a
# float when the mantissa contains a ".", and otherwise load the value as the
# string "2e-4". This also matches the style already used for `eps`.
optimizer:
  g:
    name: AdamW
    config:
      lr: 2.0e-4
      betas: [0.8, 0.99]
      eps: 1.0e-6

  d:
    name: AdamW
    config:
      lr: 2.0e-4
      betas: [0.8, 0.99]
      eps: 1.0e-6

# Learning-rate schedulers, keyed `g` / `d` like the optimizer section.
# Both use ExponentialLR with the same decay factor.
lr_scheduler:
  g:
    name: ExponentialLR
    config:
      # NOTE(review): whether the decay is applied per step or per epoch
      # depends on where the trainer calls the scheduler — confirm.
      gamma: 0.999
  d:
    name: ExponentialLR
    config:
      gamma: 0.999

########### criterion config ###########
# Loss configuration: generator criterion, discriminator criterion and the
# weight of the commitment loss.
criterion:
  g_criterion:
    name: losses.generator_loss.GeneratorSTFTLoss
    config:
      # Mel loss is switched off here; mel_loss_weight and mel_scale_loss
      # presumably only take effect when it is enabled — verify in the loss.
      use_mel_loss: false
      # Commented-out alternative adversarial criterion:
      #adv_criterion: LeastDLoss
      adv_criterion: MSEGLoss
      mel_loss_weight: 45
      use_feature_match: true
      feat_match_loss_weight: 20
      # Magnitude
      use_full_stft_loss: true
      # PQMF loss
      use_sub_stft_loss: true
      full_stft_loss_weight: 1
      sub_stft_loss_weight: 1
      mel_scale_loss:
        sampling_rate: 16000
        n_fft: 1024
        num_mels: 80
        hop_size: 160
        win_size: 800
        fmin: 0
      # Full-band multi-scale STFT loss.
      full_multi_scale_stft_loss:
        fft_sizes: [512, 1024, 2048]
        win_sizes: [480, 960, 1200]
        hop_sizes: [120, 240, 300]
      # Sub-band multi-scale STFT loss.
      sub_multi_scale_stft_loss:
        num_bands: 6
        fft_sizes: [128, 256, 256]
        win_sizes: [80, 120, 200]
        hop_sizes: [20, 40, 50]

  d_criterion:
    name: losses.discriminator_loss.MSEDiscriminatorLoss
    config: null

  # Commented-out alternative value: 1000
  commit_loss_weight: 1.0

########### training and data config ###########

# Run-level switches.
seed: 2333
cudnn_deterministic: false
tensorboard: true # whether to use tensorboard
# Much smaller intervals, commented out in favour of the values below:
#checkpoint_interval: 5
#summary_interval: 10
#validation_interval: 10

# Checkpoint / summary / validation cadence (presumably counted in training
# steps — confirm in the trainer).
checkpoint_interval: 5000
summary_interval: 100
validation_interval: 5000

# NOTE: the key is spelled `num_epoches`; keep the spelling, since the
# consuming code presumably looks it up under this exact name.
num_epoches: 5000
print_freq: 10
discriminator_iter_start: 0  # start step after which we update discriminators
num_ckpt_keep: 10

# Data settings. segment_size is presumably a length in samples (24000 would
# be 1.5 s at the 16000 sample rate used above) — TODO confirm.
segment_size: 24000
audio_norm_scale: 1.0
batch_size: 6
num_workers: 8
num_plots: 8