Rongjiehuang committed on
Commit
64e7f2f
·
0 Parent(s):
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +32 -0
  2. .gitignore +151 -0
  3. LICENSE +21 -0
  4. README.md +10 -0
  5. checkpoints/FastDiff/config.yaml +149 -0
  6. checkpoints/FastDiff/model_ckpt_steps_500000.ckpt +3 -0
  7. checkpoints/ProDiff/config.yaml +205 -0
  8. checkpoints/ProDiff/model_ckpt_steps_200000.ckpt +3 -0
  9. checkpoints/ProDiff_Teacher/config.yaml +205 -0
  10. checkpoints/ProDiff_Teacher/model_ckpt_steps_188000.ckpt +3 -0
  11. data/binary/LJSpeech/phone_set.json +1 -0
  12. data/binary/LJSpeech/spk_map.json +1 -0
  13. data/binary/LJSpeech/train_f0s_mean_std.npy +3 -0
  14. data_gen/tts/base_binarizer.py +224 -0
  15. data_gen/tts/base_preprocess.py +245 -0
  16. data_gen/tts/bin/binarize.py +20 -0
  17. data_gen/tts/data_gen_utils.py +352 -0
  18. data_gen/tts/txt_processors/__init__.py +1 -0
  19. data_gen/tts/txt_processors/base_text_processor.py +47 -0
  20. data_gen/tts/txt_processors/en.py +77 -0
  21. data_gen/tts/wav_processors/__init__.py +2 -0
  22. data_gen/tts/wav_processors/base_processor.py +25 -0
  23. data_gen/tts/wav_processors/common_processors.py +86 -0
  24. egs/datasets/audio/libritts/base_text2mel.yaml +14 -0
  25. egs/datasets/audio/libritts/fs2.yaml +3 -0
  26. egs/datasets/audio/libritts/pre_align.py +18 -0
  27. egs/datasets/audio/libritts/pwg.yaml +8 -0
  28. egs/datasets/audio/lj/base_mel2wav.yaml +5 -0
  29. egs/datasets/audio/lj/pre_align.py +13 -0
  30. egs/datasets/audio/lj/pwg.yaml +3 -0
  31. egs/datasets/audio/vctk/base_mel2wav.yaml +3 -0
  32. egs/datasets/audio/vctk/fs2.yaml +12 -0
  33. egs/datasets/audio/vctk/pre_align.py +22 -0
  34. egs/datasets/audio/vctk/pwg.yaml +6 -0
  35. egs/egs_bases/config_base.yaml +46 -0
  36. egs/egs_bases/tts/base.yaml +112 -0
  37. egs/egs_bases/tts/fs2.yaml +102 -0
  38. egs/egs_bases/tts/vocoder/base.yaml +34 -0
  39. egs/egs_bases/tts/vocoder/pwg.yaml +82 -0
  40. inference/ProDiff.py +49 -0
  41. inference/ProDiff_Teacher.py +41 -0
  42. inference/base_tts_infer.py +167 -0
  43. inference/gradio/gradio_settings.yaml +41 -0
  44. inference/gradio/infer.py +69 -0
  45. modules/FastDiff/config/FastDiff.yaml +7 -0
  46. modules/FastDiff/config/FastDiff_libritts.yaml +7 -0
  47. modules/FastDiff/config/FastDiff_sc09.yaml +25 -0
  48. modules/FastDiff/config/FastDiff_tacotron.yaml +58 -0
  49. modules/FastDiff/config/FastDiff_vctk.yaml +7 -0
  50. modules/FastDiff/config/base.yaml +157 -0
.gitattributes ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.npy filter=lfs diff=lfs merge=lfs -text
13
+ *.npz filter=lfs diff=lfs merge=lfs -text
14
+ *.onnx filter=lfs diff=lfs merge=lfs -text
15
+ *.ot filter=lfs diff=lfs merge=lfs -text
16
+ *.parquet filter=lfs diff=lfs merge=lfs -text
17
+ *.pickle filter=lfs diff=lfs merge=lfs -text
18
+ *.pkl filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pt filter=lfs diff=lfs merge=lfs -text
21
+ *.pth filter=lfs diff=lfs merge=lfs -text
22
+ *.rar filter=lfs diff=lfs merge=lfs -text
23
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
24
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
25
+ *.tflite filter=lfs diff=lfs merge=lfs -text
26
+ *.tgz filter=lfs diff=lfs merge=lfs -text
27
+ *.wasm filter=lfs diff=lfs merge=lfs -text
28
+ *.xz filter=lfs diff=lfs merge=lfs -text
29
+ *.zip filter=lfs diff=lfs merge=lfs -text
30
+ *.zst filter=lfs diff=lfs merge=lfs -text
31
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
32
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Project ignore
2
+
3
+ /ParallelWaveGAN
4
+ /wavegan_pretrained*
5
+ /pretrained_models
6
+ rsync
7
+ .idea
8
+ .DS_Store
9
+ bak
10
+ tmp
11
+ *.tar.gz
12
+ # mfa and kaldi
13
+ kaldi_align/exp
14
+ mfa
15
+ montreal-forced-aligner
16
+ mos
17
+ nbs
18
+ /configs_usr/*
19
+ !/configs_usr/.gitkeep
20
+ /fast_transformers
21
+ /rnnoise
22
+ /usr/*
23
+ !/usr/.gitkeep
24
+
25
+ # Created by .ignore support plugin (hsz.mobi)
26
+ ### Python template
27
+ # Byte-compiled / optimized / DLL files
28
+ __pycache__/
29
+ *.py[cod]
30
+ *$py.class
31
+
32
+ # C extensions
33
+ *.so
34
+
35
+ # Distribution / packaging
36
+ .Python
37
+ build/
38
+ develop-eggs/
39
+ dist/
40
+ downloads/
41
+ eggs/
42
+ .eggs/
43
+ lib/
44
+ lib64/
45
+ parts/
46
+ sdist/
47
+ var/
48
+ wheels/
49
+ pip-wheel-metadata/
50
+ share/python-wheels/
51
+ *.egg-info/
52
+ .installed.cfg
53
+ *.egg
54
+ MANIFEST
55
+
56
+ # PyInstaller
57
+ # Usually these files are written by a python script from a template
58
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
59
+ *.manifest
60
+ *.spec
61
+
62
+ # Installer logs
63
+ pip-log.txt
64
+ pip-delete-this-directory.txt
65
+
66
+ # Unit test / coverage reports
67
+ htmlcov/
68
+ .tox/
69
+ .nox/
70
+ .coverage
71
+ .coverage.*
72
+ .cache
73
+ nosetests.xml
74
+ coverage.xml
75
+ *.cover
76
+ .hypothesis/
77
+ .pytest_cache/
78
+
79
+ # Translations
80
+ *.mo
81
+ *.pot
82
+
83
+ # Django stuff:
84
+ *.log
85
+ local_settings.py
86
+ db.sqlite3
87
+ db.sqlite3-journal
88
+
89
+ # Flask stuff:
90
+ instance/
91
+ .webassets-cache
92
+
93
+ # Scrapy stuff:
94
+ .scrapy
95
+
96
+ # Sphinx documentation
97
+ docs/_build/
98
+
99
+ # PyBuilder
100
+ target/
101
+
102
+ # Jupyter Notebook
103
+ .ipynb_checkpoints
104
+
105
+ # IPython
106
+ profile_default/
107
+ ipython_config.py
108
+
109
+ # pyenv
110
+ .python-version
111
+
112
+ # pipenv
113
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
114
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
115
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
116
+ # install all needed dependencies.
117
+ #Pipfile.lock
118
+
119
+ # celery beat schedule file
120
+ celerybeat-schedule
121
+
122
+ # SageMath parsed files
123
+ *.sage.py
124
+
125
+ # Environments
126
+ .env
127
+ .venv
128
+ env/
129
+ venv/
130
+ ENV/
131
+ env.bak/
132
+ venv.bak/
133
+
134
+ # Spyder project settings
135
+ .spyderproject
136
+ .spyproject
137
+
138
+ # Rope project settings
139
+ .ropeproject
140
+
141
+ # mkdocs documentation
142
+ /site
143
+
144
+ # mypy
145
+ .mypy_cache/
146
+ .dmypy.json
147
+ dmypy.json
148
+
149
+ # Pyre type checker
150
+ .pyre/
151
+ 将删除 datasets/remi/test/
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2021 Jinglin Liu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ProDiff
3
+ emoji: 🤗
4
+ colorFrom: yellow
5
+ colorTo: orange
6
+ sdk: gradio
7
+ app_file: "inference/gradio/infer.py"
8
+ pinned: false
9
+ ---
10
+
checkpoints/FastDiff/config.yaml ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ N: ''
2
+ T: 1000
3
+ accumulate_grad_batches: 1
4
+ amp: false
5
+ audio_channels: 1
6
+ audio_num_mel_bins: 80
7
+ audio_sample_rate: 22050
8
+ aux_context_window: 0
9
+ beta_0: 1.0e-06
10
+ beta_T: 0.01
11
+ binarization_args:
12
+ reset_phone_dict: true
13
+ reset_word_dict: true
14
+ shuffle: false
15
+ trim_eos_bos: false
16
+ with_align: false
17
+ with_f0: false
18
+ with_f0cwt: false
19
+ with_linear: false
20
+ with_spk_embed: false
21
+ with_spk_id: true
22
+ with_txt: false
23
+ with_wav: true
24
+ with_word: false
25
+ binarizer_cls: data_gen.tts.vocoder_binarizer.VocoderBinarizer
26
+ binary_data_dir: data/binary/LJSpeech
27
+ check_val_every_n_epoch: 10
28
+ clip_grad_norm: 1
29
+ clip_grad_value: 0
30
+ cond_channels: 80
31
+ debug: false
32
+ dec_ffn_kernel_size: 9
33
+ dec_layers: 4
34
+ dict_dir: ''
35
+ diffusion_step_embed_dim_in: 128
36
+ diffusion_step_embed_dim_mid: 512
37
+ diffusion_step_embed_dim_out: 512
38
+ disc_start_steps: 40000
39
+ discriminator_grad_norm: 1
40
+ dropout: 0.0
41
+ ds_workers: 1
42
+ enc_ffn_kernel_size: 9
43
+ enc_layers: 4
44
+ endless_ds: true
45
+ eval_max_batches: -1
46
+ ffn_act: gelu
47
+ ffn_padding: SAME
48
+ fft_size: 1024
49
+ fmax: 7600
50
+ fmin: 80
51
+ frames_multiple: 1
52
+ gen_dir_name: ''
53
+ generator_grad_norm: 10
54
+ griffin_lim_iters: 60
55
+ hidden_size: 256
56
+ hop_size: 256
57
+ infer: false
58
+ inner_channels: 32
59
+ kpnet_conv_size: 3
60
+ kpnet_hidden_channels: 64
61
+ load_ckpt: ''
62
+ loud_norm: false
63
+ lr: 2e-4
64
+ lvc_kernel_size: 3
65
+ lvc_layers_each_block: 4
66
+ max_epochs: 1000
67
+ max_frames: 1548
68
+ max_input_tokens: 1550
69
+ max_samples: 25600
70
+ max_sentences: 20
71
+ max_tokens: 30000
72
+ max_updates: 1000000
73
+ max_valid_sentences: 1
74
+ max_valid_tokens: 60000
75
+ mel_loss: l1
76
+ mel_vmax: 1.5
77
+ mel_vmin: -6
78
+ mfa_version: 2
79
+ min_frames: 0
80
+ min_level_db: -100
81
+ noise_schedule: ''
82
+ num_ckpt_keep: 3
83
+ num_heads: 2
84
+ num_mels: 80
85
+ num_sanity_val_steps: -1
86
+ num_spk: 400
87
+ num_test_samples: 0
88
+ num_valid_plots: 10
89
+ optimizer_adam_beta1: 0.9
90
+ optimizer_adam_beta2: 0.98
91
+ out_wav_norm: false
92
+ pitch_extractor: parselmouth
93
+ pre_align_args:
94
+ allow_no_txt: false
95
+ denoise: false
96
+ nsample_per_mfa_group: 1000
97
+ sox_resample: false
98
+ sox_to_wav: false
99
+ trim_sil: false
100
+ txt_processor: en
101
+ use_tone: true
102
+ pre_align_cls: egs.datasets.audio.pre_align.PreAlign
103
+ print_nan_grads: false
104
+ processed_data_dir: data/processed/LJSpeech
105
+ profile_infer: false
106
+ raw_data_dir: data/raw/LJSpeech-1.1
107
+ ref_level_db: 20
108
+ rename_tmux: true
109
+ resume_from_checkpoint: 0
110
+ save_best: true
111
+ save_codes: []
112
+ save_f0: false
113
+ save_gt: true
114
+ scheduler: rsqrt
115
+ seed: 1234
116
+ sort_by_len: true
117
+ task_cls: modules.FastDiff.task.FastDiff.FastDiffTask
118
+ tb_log_interval: 100
119
+ test_ids: []
120
+ test_input_dir: ''
121
+ test_mel_dir: ''
122
+ test_num: 100
123
+ test_set_name: test
124
+ train_set_name: train
125
+ train_sets: ''
126
+ upsample_ratios:
127
+ - 8
128
+ - 8
129
+ - 4
130
+ use_pitch_embed: false
131
+ use_spk_embed: false
132
+ use_spk_id: false
133
+ use_split_spk_id: false
134
+ use_wav: true
135
+ use_weight_norm: true
136
+ use_word_input: false
137
+ val_check_interval: 2000
138
+ valid_infer_interval: 10000
139
+ valid_monitor_key: val_loss
140
+ valid_monitor_mode: min
141
+ valid_set_name: valid
142
+ vocoder_denoise_c: 0.0
143
+ warmup_updates: 8000
144
+ weight_decay: 0
145
+ win_length: null
146
+ win_size: 1024
147
+ window: hann
148
+ word_size: 30000
149
+ work_dir: checkpoints/FastDiff
checkpoints/FastDiff/model_ckpt_steps_500000.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee7b6022e525c71a6025b41eeeafff9d6186b52cba76b580d6986bc8674902f3
3
+ size 183951271
checkpoints/ProDiff/config.yaml ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accumulate_grad_batches: 1
2
+ amp: false
3
+ audio_num_mel_bins: 80
4
+ audio_sample_rate: 22050
5
+ base_config:
6
+ - ./base.yaml
7
+ binarization_args:
8
+ reset_phone_dict: true
9
+ reset_word_dict: true
10
+ shuffle: false
11
+ trim_eos_bos: false
12
+ trim_sil: false
13
+ with_align: true
14
+ with_f0: true
15
+ with_f0cwt: false
16
+ with_linear: false
17
+ with_spk_embed: false
18
+ with_spk_id: true
19
+ with_txt: true
20
+ with_wav: false
21
+ with_word: true
22
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
23
+ binary_data_dir: data/binary/LJSpeech
24
+ check_val_every_n_epoch: 10
25
+ clip_grad_norm: 1
26
+ clip_grad_value: 0
27
+ conv_use_pos: false
28
+ cwt_add_f0_loss: false
29
+ cwt_hidden_size: 128
30
+ cwt_layers: 2
31
+ cwt_loss: l1
32
+ cwt_std_scale: 0.8
33
+ debug: false
34
+ dec_dilations:
35
+ - 1
36
+ - 1
37
+ - 1
38
+ - 1
39
+ dec_ffn_kernel_size: 9
40
+ dec_inp_add_noise: false
41
+ dec_kernel_size: 5
42
+ dec_layers: 4
43
+ dec_num_heads: 2
44
+ decoder_rnn_dim: 0
45
+ decoder_type: fft
46
+ dict_dir: ''
47
+ diff_decoder_type: wavenet
48
+ diff_loss_type: l1
49
+ dilation_cycle_length: 1
50
+ dropout: 0.1
51
+ ds_workers: 2
52
+ dur_enc_hidden_stride_kernel:
53
+ - 0,2,3
54
+ - 0,2,3
55
+ - 0,1,3
56
+ dur_loss: mse
57
+ dur_predictor_kernel: 3
58
+ dur_predictor_layers: 2
59
+ enc_dec_norm: ln
60
+ enc_dilations:
61
+ - 1
62
+ - 1
63
+ - 1
64
+ - 1
65
+ enc_ffn_kernel_size: 9
66
+ enc_kernel_size: 5
67
+ enc_layers: 4
68
+ encoder_K: 8
69
+ encoder_type: fft
70
+ endless_ds: true
71
+ ffn_act: gelu
72
+ ffn_hidden_size: 1024
73
+ ffn_padding: SAME
74
+ fft_size: 1024
75
+ fmax: 7600
76
+ fmin: 80
77
+ frames_multiple: 1
78
+ gen_dir_name: ''
79
+ gen_tgt_spk_id: -1
80
+ griffin_lim_iters: 60
81
+ hidden_size: 256
82
+ hop_size: 256
83
+ infer: false
84
+ keep_bins: 80
85
+ lambda_commit: 0.25
86
+ lambda_energy: 0.1
87
+ lambda_f0: 1.0
88
+ lambda_ph_dur: 0.1
89
+ lambda_sent_dur: 1.0
90
+ lambda_uv: 1.0
91
+ lambda_word_dur: 1.0
92
+ layers_in_block: 2
93
+ load_ckpt: ''
94
+ loud_norm: false
95
+ lr: 1.0
96
+ max_beta: 0.06
97
+ max_epochs: 1000
98
+ max_frames: 1548
99
+ max_input_tokens: 1550
100
+ max_sentences: 48
101
+ max_tokens: 32000
102
+ max_updates: 200000
103
+ max_valid_sentences: 1
104
+ max_valid_tokens: 60000
105
+ mel_loss: ssim:0.5|l1:0.5
106
+ mel_vmax: 1.5
107
+ mel_vmin: -6
108
+ min_frames: 0
109
+ min_level_db: -100
110
+ num_ckpt_keep: 3
111
+ num_heads: 2
112
+ num_sanity_val_steps: -1
113
+ num_spk: 1
114
+ num_test_samples: 0
115
+ num_valid_plots: 10
116
+ optimizer_adam_beta1: 0.9
117
+ optimizer_adam_beta2: 0.98
118
+ out_wav_norm: false
119
+ pitch_ar: false
120
+ pitch_embed_type: 0
121
+ pitch_enc_hidden_stride_kernel:
122
+ - 0,2,5
123
+ - 0,2,5
124
+ - 0,2,5
125
+ pitch_extractor: parselmouth
126
+ pitch_loss: l1
127
+ pitch_norm: standard
128
+ pitch_ssim_win: 11
129
+ pitch_type: frame
130
+ pre_align_args:
131
+ allow_no_txt: false
132
+ denoise: false
133
+ sox_resample: false
134
+ sox_to_wav: false
135
+ trim_sil: false
136
+ txt_processor: en
137
+ use_tone: true
138
+ pre_align_cls: ''
139
+ predictor_dropout: 0.5
140
+ predictor_grad: 0.1
141
+ predictor_hidden: -1
142
+ predictor_kernel: 5
143
+ predictor_layers: 2
144
+ pretrain_fs_ckpt: ''
145
+ print_nan_grads: false
146
+ processed_data_dir: data/processed/LJSpeech
147
+ profile_infer: false
148
+ raw_data_dir: data/raw/LJSpeech
149
+ ref_hidden_stride_kernel:
150
+ - 0,3,5
151
+ - 0,3,5
152
+ - 0,2,5
153
+ - 0,2,5
154
+ - 0,2,5
155
+ ref_level_db: 20
156
+ ref_norm_layer: bn
157
+ rename_tmux: true
158
+ residual_channels: 256
159
+ residual_layers: 20
160
+ resume_from_checkpoint: 0
161
+ save_best: true
162
+ save_codes: []
163
+ save_f0: false
164
+ save_gt: true
165
+ schedule_type: vpsde
166
+ scheduler: rsqrt
167
+ seed: 1234
168
+ sil_add_noise: false
169
+ sort_by_len: true
170
+ spec_max: []
171
+ spec_min: []
172
+ task_cls: modules.ProDiff.task.ProDiff_task.ProDiff_Task
173
+ tb_log_interval: 100
174
+ teacher_ckpt: checkpoints/ProDiff_Teacher/model_ckpt_steps_188000.ckpt
175
+ test_ids: []
176
+ test_input_dir: ''
177
+ test_num: 100
178
+ test_set_name: test
179
+ timesteps: 4
180
+ train_set_name: train
181
+ train_sets: ''
182
+ use_cond_disc: true
183
+ use_energy_embed: true
184
+ use_gt_dur: true
185
+ use_gt_f0: true
186
+ use_pitch_embed: true
187
+ use_pos_embed: true
188
+ use_ref_enc: false
189
+ use_spk_embed: false
190
+ use_spk_id: false
191
+ use_split_spk_id: false
192
+ use_uv: true
193
+ use_var_enc: false
194
+ val_check_interval: 2000
195
+ valid_infer_interval: 10000
196
+ valid_monitor_key: val_loss
197
+ valid_monitor_mode: min
198
+ valid_set_name: valid
199
+ var_enc_vq_codes: 64
200
+ vocoder_denoise_c: 0.0
201
+ warmup_updates: 2000
202
+ weight_decay: 0
203
+ win_size: 1024
204
+ word_size: 30000
205
+ work_dir: checkpoints/ProDiff
checkpoints/ProDiff/model_ckpt_steps_200000.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cc8aad355c297b010e2c362341f736b3477744af76e02f6c9965409a7e9113a
3
+ size 349055740
checkpoints/ProDiff_Teacher/config.yaml ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accumulate_grad_batches: 1
2
+ amp: false
3
+ audio_num_mel_bins: 80
4
+ audio_sample_rate: 22050
5
+ base_config:
6
+ - ./base.yaml
7
+ binarization_args:
8
+ reset_phone_dict: true
9
+ reset_word_dict: true
10
+ shuffle: false
11
+ trim_eos_bos: false
12
+ trim_sil: false
13
+ with_align: true
14
+ with_f0: true
15
+ with_f0cwt: false
16
+ with_linear: false
17
+ with_spk_embed: false
18
+ with_spk_id: true
19
+ with_txt: true
20
+ with_wav: false
21
+ with_word: true
22
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
23
+ binary_data_dir: data/binary/LJSpeech
24
+ check_val_every_n_epoch: 10
25
+ clip_grad_norm: 1
26
+ clip_grad_value: 0
27
+ conv_use_pos: false
28
+ cwt_add_f0_loss: false
29
+ cwt_hidden_size: 128
30
+ cwt_layers: 2
31
+ cwt_loss: l1
32
+ cwt_std_scale: 0.8
33
+ debug: false
34
+ dec_dilations:
35
+ - 1
36
+ - 1
37
+ - 1
38
+ - 1
39
+ dec_ffn_kernel_size: 9
40
+ dec_inp_add_noise: false
41
+ dec_kernel_size: 5
42
+ dec_layers: 4
43
+ dec_num_heads: 2
44
+ decoder_rnn_dim: 0
45
+ decoder_type: fft
46
+ dict_dir: ''
47
+ diff_decoder_type: wavenet
48
+ diff_loss_type: l1
49
+ dilation_cycle_length: 1
50
+ dropout: 0.1
51
+ ds_workers: 2
52
+ dur_enc_hidden_stride_kernel:
53
+ - 0,2,3
54
+ - 0,2,3
55
+ - 0,1,3
56
+ dur_loss: mse
57
+ dur_predictor_kernel: 3
58
+ dur_predictor_layers: 2
59
+ enc_dec_norm: ln
60
+ enc_dilations:
61
+ - 1
62
+ - 1
63
+ - 1
64
+ - 1
65
+ enc_ffn_kernel_size: 9
66
+ enc_kernel_size: 5
67
+ enc_layers: 4
68
+ encoder_K: 8
69
+ encoder_type: fft
70
+ endless_ds: true
71
+ ffn_act: gelu
72
+ ffn_hidden_size: 1024
73
+ ffn_padding: SAME
74
+ fft_size: 1024
75
+ fmax: 7600
76
+ fmin: 80
77
+ frames_multiple: 1
78
+ gen_dir_name: ''
79
+ gen_tgt_spk_id: -1
80
+ griffin_lim_iters: 60
81
+ hidden_size: 256
82
+ hop_size: 256
83
+ infer: false
84
+ keep_bins: 80
85
+ lambda_commit: 0.25
86
+ lambda_energy: 0.1
87
+ lambda_f0: 1.0
88
+ lambda_ph_dur: 0.1
89
+ lambda_sent_dur: 1.0
90
+ lambda_uv: 1.0
91
+ lambda_word_dur: 1.0
92
+ layers_in_block: 2
93
+ load_ckpt: ''
94
+ loud_norm: false
95
+ lr: 1.0
96
+ max_beta: 0.06
97
+ max_epochs: 1000
98
+ max_frames: 1548
99
+ max_input_tokens: 1550
100
+ max_sentences: 48
101
+ max_tokens: 32000
102
+ max_updates: 200000
103
+ max_valid_sentences: 1
104
+ max_valid_tokens: 60000
105
+ mel_loss: ssim:0.5|l1:0.5
106
+ mel_vmax: 1.5
107
+ mel_vmin: -6
108
+ min_frames: 0
109
+ min_level_db: -100
110
+ num_ckpt_keep: 3
111
+ num_heads: 2
112
+ num_sanity_val_steps: -1
113
+ num_spk: 1
114
+ num_test_samples: 20
115
+ num_valid_plots: 10
116
+ optimizer_adam_beta1: 0.9
117
+ optimizer_adam_beta2: 0.98
118
+ out_wav_norm: false
119
+ pitch_ar: false
120
+ pitch_embed_type: 0
121
+ pitch_enc_hidden_stride_kernel:
122
+ - 0,2,5
123
+ - 0,2,5
124
+ - 0,2,5
125
+ pitch_extractor: parselmouth
126
+ pitch_loss: l1
127
+ pitch_norm: standard
128
+ pitch_ssim_win: 11
129
+ pitch_type: frame
130
+ pre_align_args:
131
+ allow_no_txt: false
132
+ denoise: false
133
+ sox_resample: false
134
+ sox_to_wav: false
135
+ trim_sil: false
136
+ txt_processor: en
137
+ use_tone: true
138
+ pre_align_cls: egs.datasets.audio.lj.pre_align.LJPreAlign
139
+ predictor_dropout: 0.5
140
+ predictor_grad: 0.1
141
+ predictor_hidden: -1
142
+ predictor_kernel: 5
143
+ predictor_layers: 2
144
+ pretrain_fs_ckpt: ''
145
+ print_nan_grads: false
146
+ processed_data_dir: data/processed/LJSpeech
147
+ profile_infer: false
148
+ raw_data_dir: data/raw/LJSpeech
149
+ ref_hidden_stride_kernel:
150
+ - 0,3,5
151
+ - 0,3,5
152
+ - 0,2,5
153
+ - 0,2,5
154
+ - 0,2,5
155
+ ref_level_db: 20
156
+ ref_norm_layer: bn
157
+ rename_tmux: true
158
+ residual_channels: 256
159
+ residual_layers: 20
160
+ resume_from_checkpoint: 0
161
+ save_best: true
162
+ save_codes: []
163
+ save_f0: false
164
+ save_gt: true
165
+ schedule_type: vpsde
166
+ scheduler: rsqrt
167
+ seed: 1234
168
+ sil_add_noise: false
169
+ sort_by_len: true
170
+ spec_max: []
171
+ spec_min: []
172
+ task_cls: modules.ProDiff.task.ProDiff_teacher_task.ProDiff_teacher_Task
173
+ tb_log_interval: 100
174
+ test_ids: []
175
+ test_input_dir: ''
176
+ test_num: 100
177
+ test_set_name: test
178
+ timescale: 1
179
+ timesteps: 4
180
+ train_set_name: train
181
+ train_sets: ''
182
+ use_cond_disc: true
183
+ use_energy_embed: true
184
+ use_gt_dur: true
185
+ use_gt_f0: true
186
+ use_pitch_embed: true
187
+ use_pos_embed: true
188
+ use_ref_enc: false
189
+ use_spk_embed: false
190
+ use_spk_id: false
191
+ use_split_spk_id: false
192
+ use_uv: true
193
+ use_var_enc: false
194
+ val_check_interval: 2000
195
+ valid_infer_interval: 10000
196
+ valid_monitor_key: val_loss
197
+ valid_monitor_mode: min
198
+ valid_set_name: valid
199
+ var_enc_vq_codes: 64
200
+ vocoder_denoise_c: 0.0
201
+ warmup_updates: 2000
202
+ weight_decay: 0
203
+ win_size: 1024
204
+ word_size: 30000
205
+ work_dir: checkpoints/ProDiff_Teacher1
checkpoints/ProDiff_Teacher/model_ckpt_steps_188000.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d3d02a215431c69dd54c1413b9a02cdc32795e2039ad9be857b12e85c470eea
3
+ size 342252871
data/binary/LJSpeech/phone_set.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
data/binary/LJSpeech/spk_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"SPK1": 0}
data/binary/LJSpeech/train_f0s_mean_std.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8790d5a84d77143690ae71a1f1e7fc81359e69ead263dc440366f2164c739efd
3
+ size 144
data_gen/tts/base_binarizer.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["OMP_NUM_THREADS"] = "1"
3
+
4
+ from utils.multiprocess_utils import chunked_multiprocess_run
5
+ import random
6
+ import traceback
7
+ import json
8
+ from resemblyzer import VoiceEncoder
9
+ from tqdm import tqdm
10
+ from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder
11
+ from utils.hparams import set_hparams, hparams
12
+ import numpy as np
13
+ from utils.indexed_datasets import IndexedDatasetBuilder
14
+ from vocoders.base_vocoder import VOCODERS
15
+ import pandas as pd
16
+
17
+
18
class BinarizationError(Exception):
    """Signal that one item could not be binarized.

    ``BaseBinarizer.process_item`` catches this error, prints a "Skip item"
    message and returns ``None`` for the offending item.
    """
20
+
21
+
22
class BaseBinarizer:
    """Turn pre-processed TTS metadata into indexed binary datasets.

    On construction the binarizer reads ``metadata_phone.csv`` from every
    processed-data directory and builds per-item lookup tables (text, phonemes,
    wav path, speaker, TextGrid path).  ``process()`` then writes the speaker
    map, the phone set and one indexed dataset per split ('valid', 'test',
    'train') into ``hparams['binary_data_dir']``.
    """

    def __init__(self, processed_data_dir=None):
        """Load the metadata of every processed-data directory.

        :param processed_data_dir: comma-separated list of directories; when
            ``None``, ``hparams['processed_data_dir']`` is used.
        """
        if processed_data_dir is None:
            processed_data_dir = hparams['processed_data_dir']
        self.processed_data_dirs = processed_data_dir.split(",")
        self.binarization_args = hparams['binarization_args']
        self.pre_align_args = hparams['pre_align_args']
        # A missing 'forced_align' key is treated as "no forced alignment"
        # instead of raising KeyError.
        self.forced_align = self.pre_align_args.get('forced_align')
        tg_dir = None
        if self.forced_align == 'mfa':
            tg_dir = 'mfa_outputs'
        if self.forced_align == 'kaldi':
            tg_dir = 'kaldi_outputs'
        self.item2txt = {}
        self.item2ph = {}
        self.item2wavfn = {}
        self.item2tgfn = {}
        self.item2spk = {}
        multi_dataset = len(self.processed_data_dirs) > 1
        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
            self.meta_df = pd.read_csv(f"{processed_data_dir}/metadata_phone.csv", dtype=str)
            for _, r in self.meta_df.iterrows():
                item_name = raw_item_name = r['item_name']
                if multi_dataset:
                    # Prefix with the dataset index so names stay unique.
                    item_name = f'ds{ds_id}_{item_name}'
                self.item2txt[item_name] = r['txt']
                self.item2ph[item_name] = r['ph']
                # NOTE(review): only the second '_'-separated token of the wav
                # base name is kept; a base name without '_' raises IndexError.
                # Confirm this matches the metadata layout.
                self.item2wavfn[item_name] = os.path.join(
                    hparams['raw_data_dir'], 'wavs', os.path.basename(r['wav_fn']).split('_')[1])
                self.item2spk[item_name] = r.get('spk', 'SPK1')
                if multi_dataset:
                    self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
                if tg_dir is not None:
                    self.item2tgfn[item_name] = f"{processed_data_dir}/{tg_dir}/{raw_item_name}.TextGrid"
        self.item_names = sorted(self.item2txt)
        if self.binarization_args['shuffle']:
            # Fixed seed keeps the split assignment reproducible.
            random.seed(1234)
            random.shuffle(self.item_names)

    @property
    def train_item_names(self):
        """Items after the first ``test_num + valid_num`` entries."""
        return self.item_names[hparams['test_num'] + hparams['valid_num']:]

    @property
    def valid_item_names(self):
        """The first ``test_num + valid_num`` entries (this includes the test items)."""
        return self.item_names[0: hparams['test_num'] + hparams['valid_num']]

    @property
    def test_item_names(self):
        """The first ``test_num`` entries."""
        return self.item_names[0: hparams['test_num']]  # Audios for MOS testing are in 'test_ids'

    def build_spk_map(self):
        """Return ``{speaker_name: index}`` with indices assigned in sorted name order."""
        spk_names = {self.item2spk[item_name] for item_name in self.item_names}
        spk_map = {x: i for i, x in enumerate(sorted(spk_names))}
        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
        return spk_map

    def item_name2spk_id(self, item_name):
        """Look up the speaker index of ``item_name``; requires ``self.spk_map`` (set by ``process``)."""
        return self.spk_map[self.item2spk[item_name]]

    def _phone_encoder(self):
        """Write or load ``phone_set.json`` and build the phone encoder.

        The phone set is rebuilt from each ``dict.txt`` when the reset flag is
        set or when ``phone_set.json`` does not exist yet; otherwise the
        existing file is loaded.

        :return: the result of ``build_phone_encoder(hparams['binary_data_dir'])``.
        """
        ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
        # Prefer the top-level flag; fall back to the one nested under
        # ``binarization_args`` when the top-level key is absent.
        try:
            reset_phone_dict = hparams['reset_phone_dict']
        except KeyError:
            reset_phone_dict = self.binarization_args.get('reset_phone_dict', False)
        ph_set = []
        if reset_phone_dict or not os.path.exists(ph_set_fn):
            for processed_data_dir in self.processed_data_dirs:
                with open(f'{processed_data_dir}/dict.txt') as dict_file:
                    ph_set += [x.split(' ')[0] for x in dict_file.readlines()]
            ph_set = sorted(set(ph_set))
            # Close the file before build_phone_encoder() is called below so
            # the JSON is fully flushed to disk.
            with open(ph_set_fn, 'w') as ph_set_file:
                json.dump(ph_set, ph_set_file)
        else:
            with open(ph_set_fn, 'r') as ph_set_file:
                ph_set = json.load(ph_set_file)
        print("| phone set: ", ph_set)
        return build_phone_encoder(hparams['binary_data_dir'])

    def meta_data(self, prefix):
        """Yield ``(item_name, ph, txt, tg_fn, wav_fn, spk_id)`` for one split.

        :param prefix: 'valid' or 'test'; any other value selects the train split.
        """
        if prefix == 'valid':
            item_names = self.valid_item_names
        elif prefix == 'test':
            item_names = self.test_item_names
        else:
            item_names = self.train_item_names
        for item_name in item_names:
            ph = self.item2ph[item_name]
            txt = self.item2txt[item_name]
            tg_fn = self.item2tgfn.get(item_name)  # None when no TextGrid was registered
            wav_fn = self.item2wavfn[item_name]
            spk_id = self.item_name2spk_id(item_name)
            yield item_name, ph, txt, tg_fn, wav_fn, spk_id

    def process(self):
        """Write ``spk_map.json``, build the phone encoder and binarize all three splits."""
        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
        self.spk_map = self.build_spk_map()
        print("| spk_map: ", self.spk_map)
        spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
        with open(spk_map_fn, 'w') as spk_map_file:
            json.dump(self.spk_map, spk_map_file)

        self.phone_encoder = self._phone_encoder()
        self.process_data('valid')
        self.process_data('test')
        self.process_data('train')

    @staticmethod
    def _resolve_num_workers():
        """Return the worker count: ``$N_PROC`` if set, else a third of the CPUs (at least 1)."""
        n_proc = os.getenv('N_PROC')
        if n_proc is not None:
            return int(n_proc)
        # os.cpu_count() may return None, and cpu_count // 3 is 0 on machines
        # with fewer than three CPUs.
        return max(1, (os.cpu_count() or 1) // 3)

    def process_data(self, prefix):
        """Binarize one split and write its dataset and statistics.

        Writes ``{prefix}`` via ``IndexedDatasetBuilder``, ``{prefix}_lengths.npy``
        and, when any item carries an ``f0``, ``{prefix}_f0s_mean_std.npy``
        (mean and std over the non-zero f0 values).

        :param prefix: split name passed to ``meta_data``.
        """
        data_dir = hparams['binary_data_dir']
        builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
        lengths = []
        f0s = []
        total_sec = 0
        with_spk_embed = self.binarization_args['with_spk_embed']
        voice_encoder = VoiceEncoder().cuda() if with_spk_embed else None

        meta_data = list(self.meta_data(prefix))
        args = [list(m) + [self.phone_encoder, self.binarization_args] for m in meta_data]
        num_workers = self._resolve_num_workers()
        # tqdm(meta_data) is zipped in only to drive the progress bar.
        for _, item in zip(tqdm(meta_data),
                           chunked_multiprocess_run(self.process_item, args, num_workers=num_workers)):
            if item is None:
                # process_item() returns None for items it skipped.
                continue
            item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) \
                if with_spk_embed else None
            if not self.binarization_args['with_wav'] and 'wav' in item:
                print("del wav")
                del item['wav']
            builder.add_item(item)
            lengths.append(item['len'])
            total_sec += item['sec']
            if item.get('f0') is not None:
                f0s.append(item['f0'])
        builder.finalize()
        np.save(f'{data_dir}/{prefix}_lengths.npy', lengths)
        if len(f0s) > 0:
            f0s = np.concatenate(f0s, 0)
            f0s = f0s[f0s != 0]  # drop zero-valued entries before computing statistics
            np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
        print(f"| {prefix} total duration: {total_sec:.3f}s")

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
        """Build the feature dict for one item.

        :return: a dict with at least ``item_name``, ``txt``, ``ph``, ``mel``,
            ``wav``, ``wav_fn``, ``sec``, ``len`` and ``spk_id``, or ``None``
            when a ``BinarizationError`` was raised for this item.
        """
        # Look up the vocoder by its full name first, then by the last dotted component.
        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
        else:
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
        res = {
            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
        }
        try:
            if binarization_args['with_f0']:
                cls.get_pitch(wav, mel, res)
                if binarization_args['with_f0cwt']:
                    cls.get_f0cwt(res['f0'], res)
            phone_encoded = None
            # The alignment check needs the encoded phones too, so encode them
            # whenever either flag is set; store them only for 'with_txt'.
            if binarization_args['with_txt'] or binarization_args['with_align']:
                try:
                    phone_encoded = encoder.encode(ph)
                except Exception as exc:
                    traceback.print_exc()
                    raise BinarizationError("Empty phoneme") from exc
                if binarization_args['with_txt']:
                    res['phone'] = phone_encoded
            if binarization_args['with_align']:
                cls.get_align(tg_fn, ph, mel, phone_encoded, res)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res

    @staticmethod
    def get_align(tg_fn, ph, mel, phone_encoded, res):
        """Store ``mel2ph`` and ``dur`` from the TextGrid in ``res``.

        :raises BinarizationError: when the TextGrid file is missing, or when
            ``mel2ph.max() - 1`` is not below ``len(phone_encoded)``.
        """
        if tg_fn is not None and os.path.exists(tg_fn):
            mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
        else:
            raise BinarizationError("Align not found")
        if mel2ph.max() - 1 >= len(phone_encoded):
            raise BinarizationError(
                f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
        res['mel2ph'] = mel2ph
        res['dur'] = dur

    @staticmethod
    def get_pitch(wav, mel, res):
        """Store ``f0`` and ``pitch`` in ``res``.

        :raises BinarizationError: when the extracted f0 sums to zero.
        """
        # Inside this method the bare name resolves to the module-level
        # get_pitch imported from data_gen_utils, not to this staticmethod.
        f0, pitch_coarse = get_pitch(wav, mel, hparams)
        if sum(f0) == 0:
            raise BinarizationError("Empty f0")
        res['f0'] = f0
        res['pitch'] = pitch_coarse

    @staticmethod
    def get_f0cwt(f0, res):
        """Store the CWT of the normalised continuous log-f0 and its statistics in ``res``.

        :raises BinarizationError: when the wavelet output contains NaN.
        """
        from utils.cwt import get_cont_lf0, get_lf0_cwt
        uv, cont_lf0_lpf = get_cont_lf0(f0)
        logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
        cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
        Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
        if np.any(np.isnan(Wavelet_lf0)):
            raise BinarizationError("NaN CWT")
        res['cwt_spec'] = Wavelet_lf0
        res['cwt_scales'] = scales
        res['f0_mean'] = logf0s_mean_org
        res['f0_std'] = logf0s_std_org
220
+
221
+
222
if __name__ == "__main__":
    # Script entry point: load the hyper-parameters, then binarize every split.
    set_hparams()
    binarizer = BaseBinarizer()
    binarizer.process()
data_gen/tts/base_preprocess.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import random
4
+ import re
5
+ import traceback
6
+ from collections import Counter
7
+ from functools import partial
8
+
9
+ import librosa
10
+ from tqdm import tqdm
11
+ from data_gen.tts.txt_processors.base_text_processor import get_txt_processor_cls
12
+ from data_gen.tts.wav_processors.base_processor import get_wav_processor_cls
13
+ from utils.hparams import hparams
14
+ from utils.multiprocess_utils import multiprocess_run_tqdm
15
+ from utils.os_utils import link_file, move_file, remove_file
16
+ from data_gen.tts.data_gen_utils import is_sil_phoneme, build_token_encoder
17
+
18
+
19
class BasePreprocessor:
    """Dataset-agnostic preprocessing pipeline for a TTS corpus.

    Subclasses implement :meth:`meta_data`. :meth:`process` then converts text
    to phonemes, runs the configured wav processors, builds the phone / word /
    speaker vocabularies, optionally prepares MFA inputs and finally writes a
    metadata JSON file into ``hparams['processed_data_dir']``.
    """

    def __init__(self):
        self.preprocess_args = hparams['preprocess_args']
        txt_processor = self.preprocess_args['txt_processor']
        # Registry lookup: this is the processor *class* (or None if unknown).
        self.txt_processor = get_txt_processor_cls(txt_processor)
        self.raw_data_dir = hparams['raw_data_dir']
        self.processed_dir = hparams['processed_data_dir']
        self.spk_map_fn = f"{self.processed_dir}/spk_map.json"

    def meta_data(self):
        """
        :return: {'item_name': Str, 'wav_fn': Str, 'txt': Str, 'spk_name': Str, 'txt_loader': None or Func}
        """
        raise NotImplementedError

    def process(self):
        """Run the whole preprocessing pipeline and write its outputs to disk."""
        processed_dir = self.processed_dir
        # Both working directories are recreated from scratch on every run.
        wav_processed_tmp_dir = f'{processed_dir}/processed_tmp'
        remove_file(wav_processed_tmp_dir)
        os.makedirs(wav_processed_tmp_dir, exist_ok=True)
        wav_processed_dir = f'{processed_dir}/{self.wav_processed_dirname}'
        remove_file(wav_processed_dir)
        os.makedirs(wav_processed_dir, exist_ok=True)

        meta_data = list(tqdm(self.meta_data(), desc='Load meta data'))
        item_names = [d['item_name'] for d in meta_data]
        assert len(item_names) == len(set(item_names)), 'Key `item_name` should be Unique.'

        # preprocess data
        phone_list = []
        word_list = []
        spk_names = set()
        process_item = partial(self.preprocess_first_pass,
                               txt_processor=self.txt_processor,
                               wav_processed_dir=wav_processed_dir,
                               wav_processed_tmp=wav_processed_tmp_dir,
                               preprocess_args=self.preprocess_args)
        items = []
        args = [{
            'item_name': item_raw['item_name'],
            'txt_raw': item_raw['txt'],
            'wav_fn': item_raw['wav_fn'],
            'txt_loader': item_raw.get('txt_loader'),
            'others': item_raw.get('others', None)
        } for item_raw in meta_data]
        for item_, (item_id, item) in zip(meta_data, multiprocess_run_tqdm(process_item, args, desc='Preprocess')):
            # A None result means the first pass failed for this item; skip it.
            if item is not None:
                item_.update(item)
                item = item_
                if 'txt_loader' in item:
                    # Callables cannot be written to the metadata JSON.
                    del item['txt_loader']
                item['id'] = item_id
                item['spk_name'] = item.get('spk_name', '<SINGLE_SPK>')
                item['others'] = item.get('others', None)
                phone_list += item['ph'].split(" ")
                word_list += item['word'].split(" ")
                spk_names.add(item['spk_name'])
                items.append(item)

        # add encoded tokens
        ph_encoder, word_encoder = self._phone_encoder(phone_list), self._word_encoder(word_list)
        spk_map = self.build_spk_map(spk_names)
        args = [{
            'ph': item['ph'], 'word': item['word'], 'spk_name': item['spk_name'],
            'word_encoder': word_encoder, 'ph_encoder': ph_encoder, 'spk_map': spk_map
        } for item in items]
        for idx, item_new_kv in multiprocess_run_tqdm(self.preprocess_second_pass, args, desc='Add encoded tokens'):
            items[idx].update(item_new_kv)

        # build mfa data
        if self.preprocess_args['use_mfa']:
            mfa_dict = set()
            mfa_input_dir = f'{processed_dir}/mfa_inputs'
            remove_file(mfa_input_dir)
            # group MFA inputs for better parallelism
            mfa_groups = [i // self.preprocess_args['nsample_per_mfa_group'] for i in range(len(items))]
            if self.preprocess_args['mfa_group_shuffle']:
                random.seed(hparams['seed'])
                random.shuffle(mfa_groups)
            args = [{
                'item': item, 'mfa_input_dir': mfa_input_dir,
                'mfa_group': mfa_group, 'wav_processed_tmp': wav_processed_tmp_dir,
                'preprocess_args': self.preprocess_args
            } for item, mfa_group in zip(items, mfa_groups)]
            for i, (ph_gb_word_nosil, new_wav_align_fn) in multiprocess_run_tqdm(
                    self.build_mfa_inputs, args, desc='Build MFA data'):
                items[i]['wav_align_fn'] = new_wav_align_fn
                # One dictionary line per word: "<ph1_ph2_...> <ph1 ph2 ...>".
                for w in ph_gb_word_nosil.split(" "):
                    mfa_dict.add(f"{w} {w.replace('_', ' ')}")
            mfa_dict = sorted(mfa_dict)
            with open(f'{processed_dir}/mfa_dict.txt', 'w') as f:
                f.writelines([f'{l}\n' for l in mfa_dict])
        with open(f"{processed_dir}/{self.meta_csv_filename}.json", 'w') as f:
            # The regex pulls numbers and closing brackets back onto the
            # previous line so that integer lists do not take one line per item.
            f.write(re.sub(r'\n\s+([\d+\]])', r'\1', json.dumps(items, ensure_ascii=False, sort_keys=False, indent=1)))
        remove_file(wav_processed_tmp_dir)

    @classmethod
    def preprocess_first_pass(cls, item_name, txt_raw, txt_processor,
                              wav_fn, wav_processed_dir, wav_processed_tmp,
                              preprocess_args, txt_loader=None, others=None):
        """Convert one item's text to phonemes and process its wav file.

        :return: dict with the keys ``txt``, ``txt_raw``, ``ph``, ``word``,
            ``ph2word``, ``ph_gb_word``, ``wav_fn``, ``wav_align_fn`` and
            ``others``; or None when anything failed (the traceback is printed)
        """
        try:
            if txt_loader is not None:
                txt_raw = txt_loader(txt_raw)
            # return_word_info=True is required here: the default two-value
            # return cannot be unpacked into the five names below.
            ph, txt, word, ph2word, ph_gb_word = cls.txt_to_ph(
                txt_processor, txt_raw, preprocess_args, return_word_info=True)
            wav_fn, wav_align_fn = cls.process_wav(
                item_name, wav_fn,
                hparams['processed_data_dir'],
                wav_processed_tmp, preprocess_args)

            # wav for binarization
            ext = os.path.splitext(wav_fn)[1]
            os.makedirs(wav_processed_dir, exist_ok=True)
            new_wav_fn = f"{wav_processed_dir}/{item_name}{ext}"
            # Files produced in the tmp dir are moved; original inputs are only linked.
            move_link_func = move_file if os.path.dirname(wav_fn) == wav_processed_tmp else link_file
            move_link_func(wav_fn, new_wav_fn)
            return {
                'txt': txt, 'txt_raw': txt_raw, 'ph': ph,
                'word': word, 'ph2word': ph2word, 'ph_gb_word': ph_gb_word,
                'wav_fn': new_wav_fn, 'wav_align_fn': wav_align_fn,
                'others': others
            }
        except Exception:
            # Best effort: a broken item is reported and skipped, it must not
            # abort the whole run. KeyboardInterrupt/SystemExit still propagate.
            traceback.print_exc()
            print(f"| Error is caught. item_name: {item_name}.")
            return None

    @staticmethod
    def txt_to_ph(txt_processor, txt_raw, preprocess_args, return_word_info=False):
        """Convert raw text to a phoneme string.

        :param txt_processor: object whose ``process(txt_raw, preprocess_args)``
            returns ``(txt_struct, txt)`` with ``txt_struct`` being a list of
            ``[word, [phoneme, ...]]`` entries
        :param txt_raw: raw input text
        :param preprocess_args: options forwarded to the text processor
        :param return_word_info: when False (default) return ``(ph, txt)``;
            when True return ``(ph, txt, word, ph2word, ph_gb_word)``
        :return: ``ph`` is the space-joined phonemes, ``word`` the space-joined
            words, ``ph2word`` the 1-based word index of every phoneme and
            ``ph_gb_word`` the phonemes grouped by word (``_`` inside a word,
            space between words)
        """
        txt_struct, txt = txt_processor.process(txt_raw, preprocess_args)
        ph = " ".join(p for w in txt_struct for p in w[1])
        if not return_word_info:
            return ph, txt
        word = " ".join(w[0] for w in txt_struct)
        # Word ids start at 1 (presumably leaving 0 free for padding -- TODO confirm).
        ph2word = [w_id + 1 for w_id, w in enumerate(txt_struct) for _ in w[1]]
        ph_gb_word = " ".join("_".join(w[1]) for w in txt_struct)
        return ph, txt, word, ph2word, ph_gb_word

    @staticmethod
    def process_wav(item_name, wav_fn, processed_dir, wav_processed_tmp, preprocess_args):
        """Run the configured wav processors on ``wav_fn`` in order.

        :return: ``(wav_fn, wav_align_fn)``. Without any processor both are the
            input path; otherwise the first is the last processor's output and
            the second is the alignment file reported by a processor that
            returned three values (None if no processor did).
        """
        processors = [get_wav_processor_cls(v) for v in preprocess_args['wav_processors']]
        # Unknown processor names resolve to None and are silently ignored.
        processors = [k() for k in processors if k is not None]
        if not processors:
            return wav_fn, wav_fn
        sr_file = librosa.core.get_samplerate(wav_fn)
        output_fn_for_align = None
        ext = os.path.splitext(wav_fn)[1]
        input_fn = f"{wav_processed_tmp}/{item_name}{ext}"
        link_file(wav_fn, input_fn)
        for p in processors:
            outputs = p.process(input_fn, sr_file, wav_processed_tmp, processed_dir, item_name, preprocess_args)
            if len(outputs) == 3:
                input_fn, sr, output_fn_for_align = outputs
            else:
                input_fn, sr = outputs
        return input_fn, output_fn_for_align

    def _phone_encoder(self, ph_set):
        """Build (or load) ``phone_set.json`` and return its token encoder."""
        ph_set_fn = f"{self.processed_dir}/phone_set.json"
        if self.preprocess_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
            ph_set = sorted(set(ph_set))
            # Close the file before build_token_encoder re-reads it below.
            with open(ph_set_fn, 'w') as f:
                json.dump(ph_set, f, ensure_ascii=False)
            print("| Build phone set: ", ph_set)
        else:
            with open(ph_set_fn, 'r') as f:
                ph_set = json.load(f)
            print("| Load phone set: ", ph_set)
        return build_token_encoder(ph_set_fn)

    def _word_encoder(self, word_set):
        """Build (or load) ``word_set.json`` and return its token encoder."""
        word_set_fn = f"{self.processed_dir}/word_set.json"
        if self.preprocess_args['reset_word_dict']:
            word_set = Counter(word_set)
            total_words = sum(word_set.values())
            # Keep only the most frequent words; the rest count as unknown.
            word_set = word_set.most_common(hparams['word_dict_size'])
            num_unk_words = total_words - sum([x[1] for x in word_set])
            word_set = ['<BOS>', '<EOS>'] + [x[0] for x in word_set]
            word_set = sorted(set(word_set))
            # Close the file before build_token_encoder re-reads it below.
            with open(word_set_fn, 'w') as f:
                json.dump(word_set, f, ensure_ascii=False)
            print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
                  f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
        else:
            with open(word_set_fn, 'r') as f:
                word_set = json.load(f)
            print("| Load word set. Size: ", len(word_set), word_set[:10])
        return build_token_encoder(word_set_fn)

    @classmethod
    def preprocess_second_pass(cls, word, ph, spk_name, word_encoder, ph_encoder, spk_map):
        """Encode words / phonemes and look up the speaker id of one item."""
        word_token = word_encoder.encode(word)
        ph_token = ph_encoder.encode(ph)
        spk_id = spk_map[spk_name]
        return {'word_token': word_token, 'ph_token': ph_token, 'spk_id': spk_id}

    def build_spk_map(self, spk_names):
        """Assign ids to the sorted speaker names and save the map as JSON."""
        spk_map = {x: i for i, x in enumerate(sorted(list(spk_names)))}
        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
        print(f"| Number of spks: {len(spk_map)}, spk_map: {spk_map}")
        with open(self.spk_map_fn, 'w') as f:
            json.dump(spk_map, f, ensure_ascii=False)
        return spk_map

    @classmethod
    def build_mfa_inputs(cls, item, mfa_input_dir, mfa_group, wav_processed_tmp, preprocess_args):
        """Place one item's wav and ``.lab`` transcript into its MFA group dir.

        :return: ``(ph_gb_word_nosil, new_wav_align_fn)``: the transcript with
            silence tokens removed and the path of the wav inside the group dir
        """
        item_name = item['item_name']
        wav_align_fn = item['wav_align_fn']
        ph_gb_word = item['ph_gb_word']
        ext = os.path.splitext(wav_align_fn)[1]
        mfa_input_group_dir = f'{mfa_input_dir}/{mfa_group}'
        os.makedirs(mfa_input_group_dir, exist_ok=True)
        new_wav_align_fn = f"{mfa_input_group_dir}/{item_name}{ext}"
        move_link_func = move_file if os.path.dirname(wav_align_fn) == wav_processed_tmp else link_file
        move_link_func(wav_align_fn, new_wav_align_fn)
        # Drop silence words entirely and silence phonemes inside the remaining words.
        ph_gb_word_nosil = " ".join(["_".join([p for p in w.split("_") if not is_sil_phoneme(p)])
                                     for w in ph_gb_word.split(" ") if not is_sil_phoneme(w)])
        with open(f'{mfa_input_group_dir}/{item_name}.lab', 'w') as f_txt:
            f_txt.write(ph_gb_word_nosil)
        return ph_gb_word_nosil, new_wav_align_fn

    def load_spk_map(self, base_dir):
        """Load ``spk_map.json`` from ``base_dir``."""
        spk_map_fn = f"{base_dir}/spk_map.json"
        with open(spk_map_fn, 'r') as f:
            spk_map = json.load(f)
        return spk_map

    def load_dict(self, base_dir):
        """Return the phoneme encoder built from ``base_dir``/phone_set.json."""
        ph_encoder = build_token_encoder(f'{base_dir}/phone_set.json')
        return ph_encoder

    @property
    def meta_csv_filename(self):
        """Base name (without extension) of the metadata file."""
        return 'metadata'

    @property
    def wav_processed_dirname(self):
        """Name of the sub-directory that receives the processed wav files."""
        return 'wav_processed'
data_gen/tts/bin/binarize.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ os.environ["OMP_NUM_THREADS"] = "1"
4
+
5
+ import importlib
6
+ from utils.hparams import set_hparams, hparams
7
+
8
+
9
def binarize():
    """Instantiate the configured binarizer class and run its ``process()``.

    The dotted class path is read from ``hparams['binarizer_cls']`` and falls
    back to ``data_gen.tts.base_binarizer.BaseBinarizer``.
    """
    cls_path = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
    # Everything before the last dot is the module, the remainder the class name.
    pkg, _, cls_name = cls_path.rpartition(".")
    module = importlib.import_module(pkg)
    binarizer_cls = getattr(module, cls_name)
    print("| Binarizer: ", binarizer_cls)
    binarizer_cls().process()
16
+
17
+
18
+ if __name__ == '__main__':
19
+ set_hparams()
20
+ binarize()
data_gen/tts/data_gen_utils.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+
3
+ warnings.filterwarnings("ignore")
4
+
5
+ # import parselmouth
6
+ import os
7
+ import torch
8
+ from skimage.transform import resize
9
+ from utils.text_encoder import TokenTextEncoder
10
+ from utils.pitch_utils import f0_to_coarse
11
+ import struct
12
+ import webrtcvad
13
+ from scipy.ndimage.morphology import binary_dilation
14
+ import librosa
15
+ import numpy as np
16
+ from utils import audio
17
+ import pyloudnorm as pyln
18
+ import re
19
+ import json
20
+ from collections import OrderedDict
21
+
22
+ PUNCS = '!,.?;:'
23
+
24
+ int16_max = (2 ** 15) - 1
25
+
26
+
27
def trim_long_silences(path, sr=None, return_raw_wav=False, norm=True, vad_max_silence_length=12):
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters below.

    :param path: audio file to load
    :param sr: sample rate to load the audio at (None keeps the file's own rate)
    :param return_raw_wav: if True, return the untrimmed waveform together with the mask
    :param norm: if True, loudness-normalize to -20 and rescale if the peak exceeds 1.0
    :param vad_max_silence_length: Maximum number of consecutive silent frames a segment can have.
    :return: (wav, audio_mask, sr); wav is the waveform with silences trimmed away
        (or the raw waveform when return_raw_wav is set), audio_mask is a boolean
        mask over the raw waveform's samples
    """

    ## Voice Activation Detection
    # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
    # This sets the granularity of the VAD. Should not need to be changed.
    sampling_rate = 16000
    wav_raw, sr = librosa.core.load(path, sr=sr)

    if norm:
        meter = pyln.Meter(sr)  # create BS.1770 meter
        loudness = meter.integrated_loudness(wav_raw)
        wav_raw = pyln.normalize.loudness(wav_raw, loudness, -20.0)
        if np.abs(wav_raw).max() > 1.0:
            wav_raw = wav_raw / np.abs(wav_raw).max()

    # Keyword arguments: recent librosa releases reject positional sample rates.
    wav = librosa.resample(wav_raw, orig_sr=sr, target_sr=sampling_rate, res_type='kaiser_best')

    vad_window_length = 30  # In milliseconds
    # Number of frames to average together when performing the moving average smoothing.
    # The larger this value, the larger the VAD variations must be to not get smoothed out.
    vad_moving_average_width = 8

    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        # Two bytes per 16-bit sample, hence the factor 2 on the byte offsets.
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    # Builtin bool: the np.bool alias no longer exists in current NumPy.
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)
    # The mask was computed at 16 kHz; stretch it to the raw waveform's length.
    audio_mask = resize(audio_mask, (len(wav_raw),)) > 0
    if return_raw_wav:
        return wav_raw, audio_mask, sr
    return wav_raw[audio_mask], audio_mask, sr
91
+
92
+
93
def process_utterance(wav_path,
                      fft_size=1024,
                      hop_size=256,
                      win_length=1024,
                      window="hann",
                      num_mels=80,
                      fmin=80,
                      fmax=7600,
                      eps=1e-6,
                      sample_rate=22050,
                      loud_norm=False,
                      min_level_db=-100,
                      return_linear=False,
                      trim_long_sil=False, vocoder='pwg'):
    """Load an utterance and compute its log-mel spectrogram.

    :param wav_path: path of an audio file, or an already loaded waveform
        (anything that is not a ``str`` is used as the waveform directly)
    :param fmin: lowest mel frequency; -1 means 0
    :param fmax: highest mel frequency; -1 means ``sample_rate / 2``
    :param eps: floor applied to the mel energies before ``log10``
    :param loud_norm: if True, loudness-normalize to -22 and rescale on clipping
    :param return_linear: if True, also return the normalized linear spectrogram in dB
    :param trim_long_sil: if True (and ``wav_path`` is a path), trim long silences
    :param vocoder: only ``'pwg'`` is supported
    :return: ``(wav, mel)`` or ``(wav, mel, spc)``; ``mel`` is (n_mel_bins, T),
        ``wav`` is padded and cut to ``T * hop_size`` samples
    """
    if isinstance(wav_path, str):
        if trim_long_sil:
            wav, _, _ = trim_long_silences(wav_path, sample_rate)
        else:
            wav, _ = librosa.core.load(wav_path, sr=sample_rate)
    else:
        wav = wav_path

    if loud_norm:
        meter = pyln.Meter(sample_rate)  # create BS.1770 meter
        loudness = meter.integrated_loudness(wav)
        wav = pyln.normalize.loudness(wav, loudness, -22.0)
        if np.abs(wav).max() > 1:
            wav = wav / np.abs(wav).max()

    # get amplitude spectrogram
    x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
                          win_length=win_length, window=window, pad_mode="constant")
    spc = np.abs(x_stft)  # (n_bins, T)

    # get mel basis
    fmin = 0 if fmin == -1 else fmin
    fmax = sample_rate / 2 if fmax == -1 else fmax
    # Keyword arguments: recent librosa releases reject positional arguments here.
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels, fmin=fmin, fmax=fmax)
    mel = mel_basis @ spc

    if vocoder == 'pwg':
        mel = np.log10(np.maximum(eps, mel))  # (n_mel_bins, T)
    else:
        assert False, f'"{vocoder}" is not in ["pwg"].'

    # Pad the waveform and cut it so that it covers exactly T frames.
    l_pad, r_pad = audio.librosa_pad_lr(wav, fft_size, hop_size, 1)
    wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
    wav = wav[:mel.shape[1] * hop_size]

    if not return_linear:
        return wav, mel
    else:
        spc = audio.amp_to_db(spc)
        spc = audio.normalize(spc, {'min_level_db': min_level_db})
        return wav, mel, spc
148
+
149
+
150
def get_pitch(wav_data, mel, hparams):
    """
    Extract the f0 contour with parselmouth and align it to the mel frames.

    :param wav_data: [T]
    :param mel: [T, 80]
    :param hparams: needs 'hop_size' (128 or 256) and 'audio_sample_rate'
    :return: (f0, pitch_coarse), both with len(mel) entries
    """
    # Imported here because the module-level import is commented out: the
    # module stays importable without parselmouth, and this function no longer
    # fails with a NameError when it is installed.
    import parselmouth

    time_step = hparams['hop_size'] / hparams['audio_sample_rate'] * 1000
    f0_min = 80
    f0_max = 750

    if hparams['hop_size'] == 128:
        pad_size = 4
    elif hparams['hop_size'] == 256:
        pad_size = 2
    else:
        assert False

    f0 = parselmouth.Sound(wav_data, hparams['audio_sample_rate']).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
    lpad = pad_size * 2
    rpad = len(mel) - len(f0) - lpad
    f0 = np.pad(f0, [[lpad, rpad]], mode='constant')
    # mel and f0 are extracted by 2 different libraries. we should force them to have the same length.
    # Attention: we find that new versions of some libraries could cause ``rpad'' to be a negative value...
    # Just to be sure, we recommend users to set up the same environments as them in requirements_auto.txt (by Anaconda)
    delta_l = len(mel) - len(f0)
    assert np.abs(delta_l) <= 8
    if delta_l > 0:
        f0 = np.concatenate([f0, [f0[-1]] * delta_l], 0)
    f0 = f0[:len(mel)]
    pitch_coarse = f0_to_coarse(f0)
    return f0, pitch_coarse
185
+
186
+
187
def remove_empty_lines(text):
    """Strip every line and drop all lines that are empty afterwards.

    :param text: non-empty list of strings
    :return: new list with the stripped, non-empty lines in their original order
    """
    assert (len(text) > 0)
    assert (isinstance(text, list))
    # list.remove("") would only drop the first empty entry; filter them all.
    return [line for line in (t.strip() for t in text) if line]
195
+
196
+
197
class TextGrid(object):
    """Line-oriented parser for TextGrid text (IntervalTier tiers only).

    The constructor consumes the lines in order: file type, global time
    interval, number of tiers, then every tier with its intervals. All
    extracted values are kept as strings, except ``size`` which is an int.
    """

    def __init__(self, text):
        self.text = remove_empty_lines(text)
        self.line_count = 0  # index of the next line to parse
        self._get_type()
        self._get_time_intval()
        self._get_size()
        self.tier_list = []
        self._get_item_list()

    def _extract_pattern(self, pattern, inc):
        """
        Parameters
        ----------
        pattern : regex whose first group is the value to extract
        inc : number of lines to advance after a successful match
        Returns
        -------
        group : extracted info (string)
        """
        current = self.text[self.line_count]
        matched = re.match(pattern, current)
        if matched is None:
            raise ValueError("File format error at line %d:%s" % (self.line_count, current))
        value = matched.group(1)
        self.line_count += inc
        return value

    def _get_type(self):
        # Advance by two lines to step over the line that follows the file type.
        self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2)

    def _get_time_intval(self):
        self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
        self.xmax = self._extract_pattern(r"xmax = (.*)", 2)

    def _get_size(self):
        self.size = int(self._extract_pattern(r"size = (.*)", 2))

    def _get_item_list(self):
        """Only supports IntervalTier currently"""
        for _ in range(self.size):
            tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
            tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
            if tier_class != "IntervalTier":
                raise NotImplementedError("Only IntervalTier class is supported currently")
            tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
            tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
            tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
            tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)
            intervals = []
            for _ in range(int(tier_size)):
                # The list literal is evaluated left to right, which keeps the
                # four extractions in file order.
                intervals.append(OrderedDict([
                    ("idx", self._extract_pattern(r"intervals \[(.*)\]", 1)),
                    ("xmin", self._extract_pattern(r"xmin = (.*)", 1)),
                    ("xmax", self._extract_pattern(r"xmax = (.*)", 1)),
                    ("text", self._extract_pattern(r"text = \"(.*)\"", 1)),
                ]))
            self.tier_list.append(OrderedDict([
                ("idx", tier_idx),
                ("class", tier_class),
                ("name", tier_name),
                ("xmin", tier_xmin),
                ("xmax", tier_xmax),
                ("size", tier_size),
                ("items", intervals),
            ]))

    def toJson(self):
        """Serialize the parsed grid to a JSON string."""
        payload = OrderedDict([
            ("file_type", self.file_type),
            ("xmin", self.xmin),
            ("xmax", self.xmax),
            ("size", self.size),
            ("tiers", self.tier_list),
        ])
        return json.dumps(payload, ensure_ascii=False, indent=2)
272
+
273
+
274
def get_mel2ph(tg_fn, ph, mel, hparams):
    """Derive the frame-to-phoneme map and phoneme durations from a TextGrid.

    :param tg_fn: path of the TextGrid file; its last tier is used
    :param ph: space-separated phoneme string of the utterance
    :param mel: mel-spectrogram; ``mel.shape[0]`` is taken as the frame count
    :param hparams: needs 'audio_sample_rate' and 'hop_size'
    :return: ``(mel2ph, dur)``: ``mel2ph[t]`` is the 1-based phoneme index of
        frame ``t`` (0 = unassigned), ``dur[i]`` the frame count of phoneme ``i``
    :raises AssertionError: when the TextGrid and ``ph`` cannot be matched
    """
    ph_list = ph.split(" ")
    with open(tg_fn, "r") as f:
        tg = f.readlines()
    tg = remove_empty_lines(tg)
    tg = TextGrid(tg)
    tg = json.loads(tg.toJson())
    # split[i] = start time (s) of phoneme i; -1 marks "not assigned yet".
    # Builtin float/int: the np.float / np.int aliases no longer exist in current NumPy.
    split = np.ones(len(ph_list) + 1, float) * -1
    tg_idx = 0
    ph_idx = 0
    tg_align = [x for x in tg['tiers'][-1]['items']]
    tg_align_ = []
    for x in tg_align:
        x['xmin'] = float(x['xmin'])
        x['xmax'] = float(x['xmax'])
        if x['text'] in ['sil', 'sp', '', 'SIL', 'PUNC']:
            x['text'] = ''
            # Merge consecutive silence intervals into one.
            if len(tg_align_) > 0 and tg_align_[-1]['text'] == '':
                tg_align_[-1]['xmax'] = x['xmax']
                continue
        tg_align_.append(x)
    tg_align = tg_align_
    tg_len = len([x for x in tg_align if x['text'] != ''])
    ph_len = len([x for x in ph_list if not is_sil_phoneme(x)])
    assert tg_len == ph_len, (tg_len, ph_len, tg_align, ph_list, tg_fn)
    # Walk the TextGrid intervals and the phoneme list in lockstep.
    while tg_idx < len(tg_align) or ph_idx < len(ph_list):
        if tg_idx == len(tg_align) and is_sil_phoneme(ph_list[ph_idx]):
            # Trailing silence phoneme without an interval: pin it to the end.
            split[ph_idx] = 1e8
            ph_idx += 1
            continue
        x = tg_align[tg_idx]
        if x['text'] == '' and ph_idx == len(ph_list):
            # Trailing silence interval without a phoneme: skip it.
            tg_idx += 1
            continue
        assert ph_idx < len(ph_list), (tg_len, ph_len, tg_align, ph_list, tg_fn)
        ph = ph_list[ph_idx]
        if x['text'] == '' and not is_sil_phoneme(ph):
            assert False, (ph_list, tg_align)
        if x['text'] != '' and is_sil_phoneme(ph):
            # Silence phoneme with no matching interval; its start is filled in
            # from the next phoneme below.
            ph_idx += 1
        else:
            assert (x['text'] == '' and is_sil_phoneme(ph)) \
                   or x['text'].lower() == ph.lower() \
                   or x['text'].lower() == 'sil', (x['text'], ph)
            split[ph_idx] = x['xmin']
            if ph_idx > 0 and split[ph_idx - 1] == -1 and is_sil_phoneme(ph_list[ph_idx - 1]):
                split[ph_idx - 1] = split[ph_idx]
            ph_idx += 1
            tg_idx += 1
    assert tg_idx == len(tg_align), (tg_idx, [x['text'] for x in tg_align])
    assert ph_idx >= len(ph_list) - 1, (ph_idx, ph_list, len(ph_list), [x['text'] for x in tg_align], tg_fn)
    mel2ph = np.zeros([mel.shape[0]], int)
    split[0] = 0
    split[-1] = 1e8
    for i in range(len(split) - 1):
        assert split[i] != -1 and split[i] <= split[i + 1], (split[:-1],)
    # Seconds -> frame indices (rounded to the nearest frame).
    split = [int(s * hparams['audio_sample_rate'] / hparams['hop_size'] + 0.5) for s in split]
    for ph_idx in range(len(ph_list)):
        mel2ph[split[ph_idx]:split[ph_idx + 1]] = ph_idx + 1
    mel2ph_torch = torch.from_numpy(mel2ph)
    T_t = len(ph_list)
    # Count the frames per phoneme index; slot 0 collects unassigned frames.
    dur = mel2ph_torch.new_zeros([T_t + 1]).scatter_add(0, mel2ph_torch, torch.ones_like(mel2ph_torch))
    dur = dur[1:].numpy()
    return mel2ph, dur
338
+
339
+
340
def build_phone_encoder(data_dir):
    """Build a phoneme encoder from ``data_dir``/phone_set.json.

    Out-of-vocabulary tokens are replaced by ``','``.
    """
    phone_list_file = os.path.join(data_dir, 'phone_set.json')
    # Context manager so the file handle is closed deterministically.
    with open(phone_list_file) as f:
        phone_list = json.load(f)
    return TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')
344
+
345
+
346
def is_sil_phoneme(p):
    """Return True when ``p`` is a silence / non-phoneme token.

    A token counts as silence when it is empty or its first character is not
    alphabetic (e.g. ``'|'``, ``','``, ``'<BOS>'``).
    """
    # An empty token (e.g. from splitting an empty string) must not raise IndexError.
    return p == '' or not p[0].isalpha()
348
+
349
+
350
def build_token_encoder(token_list_file):
    """Build a token encoder from the JSON vocabulary list in ``token_list_file``.

    Out-of-vocabulary tokens are replaced by ``'<UNK>'``.
    """
    # Context manager so the file handle is closed deterministically.
    with open(token_list_file) as f:
        token_list = json.load(f)
    return TokenTextEncoder(None, vocab_list=token_list, replace_oov='<UNK>')
data_gen/tts/txt_processors/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from . import en
data_gen/tts/txt_processors/base_text_processor.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from data_gen.tts.data_gen_utils import is_sil_phoneme
2
+
3
# Maps a processor name to the text-processor class registered under it.
REGISTERED_TEXT_PROCESSORS = {}


def register_txt_processors(name):
    """Return a class decorator that registers the decorated class as ``name``."""

    def _register(processor_cls):
        REGISTERED_TEXT_PROCESSORS[name] = processor_cls
        return processor_cls

    return _register


def get_txt_processor_cls(name):
    """Return the class registered under ``name``, or None if there is none."""
    return REGISTERED_TEXT_PROCESSORS.get(name)
15
+
16
+
17
class BaseTxtProcessor:
    """Base class of the text processors.

    A ``txt_struct`` is a list of ``[word, [phoneme, ...]]`` entries.
    Subclasses implement :meth:`process`.
    """

    @staticmethod
    def sp_phonemes():
        """Return the list of special separator phonemes."""
        return ['|']

    @classmethod
    def process(cls, txt, preprocess_args):
        """Convert ``txt``; must be implemented by subclasses."""
        raise NotImplementedError

    @classmethod
    def postprocess(cls, txt_struct, preprocess_args):
        """Trim boundary silences and add the optional separator / BOS / EOS entries."""
        # remove sil phoneme in head and tail
        start, end = 0, len(txt_struct)
        while start < end and is_sil_phoneme(txt_struct[start][0]):
            start += 1
        while end > start and is_sil_phoneme(txt_struct[end - 1][0]):
            end -= 1
        if start != 0 or end != len(txt_struct):
            txt_struct = txt_struct[start:end]
        if preprocess_args['with_phsep']:
            txt_struct = cls.add_bdr(txt_struct)
        if preprocess_args['add_eos_bos']:
            bos_entry = ["<BOS>", ["<BOS>"]]
            eos_entry = ["<EOS>", ["<EOS>"]]
            txt_struct = [bos_entry] + txt_struct + [eos_entry]
        return txt_struct

    @classmethod
    def add_bdr(cls, txt_struct):
        """Insert a ``'|'`` entry between every two adjacent non-silence words."""
        separated = []
        last_idx = len(txt_struct) - 1
        for i, entry in enumerate(txt_struct):
            separated.append(entry)
            if i == last_idx:
                continue
            if is_sil_phoneme(entry[0]) or is_sil_phoneme(txt_struct[i + 1][0]):
                continue
            separated.append(['|', ['|']])
        return separated
data_gen/tts/txt_processors/en.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import unicodedata
3
+
4
+ from g2p_en import G2p
5
+ from g2p_en.expand import normalize_numbers
6
+ from nltk import pos_tag
7
+ from nltk.tokenize import TweetTokenizer
8
+
9
+ from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors
10
+ from data_gen.tts.data_gen_utils import is_sil_phoneme, PUNCS
11
+
12
class EnG2p(G2p):
    """``G2p`` variant that tokenizes with ``TweetTokenizer`` and passes tokens
    without any lowercase letter through unchanged."""

    word_tokenize = TweetTokenizer().tokenize

    def _pronounce(self, word, pos):
        """Return the pronunciation (a list of symbols) of one tagged token."""
        if re.search("[a-z]", word) is None:
            # No lowercase letter at all: keep the token as is.
            return [word]
        if word in self.homograph2features:  # Check homograph
            pron1, pron2, pos1 = self.homograph2features[word]
            return pron1 if pos.startswith(pos1) else pron2
        if word in self.cmu:  # lookup CMU dict
            return self.cmu[word][0]
        return self.predict(word)  # predict for oov

    def __call__(self, text):
        """Return the phonemes of ``text`` with a ``" "`` item between words."""
        # preprocessing
        tagged = pos_tag(EnG2p.word_tokenize(text))  # tuples of (word, tag)

        prons = []
        for word, pos in tagged:
            prons.extend(self._pronounce(word, pos))
            prons.append(" ")

        # Drop the separator appended after the last word.
        return prons[:-1]
41
+
42
+
43
@register_txt_processors('en')
class TxtProcessor(BaseTxtProcessor):
    """English text processor, registered under the name ``'en'``."""

    g2p = EnG2p()

    @staticmethod
    def preprocess_text(text):
        """Normalize ``text`` to lowercase a-z, spaces and the PUNCS characters."""
        text = normalize_numbers(text)
        # Strip accents
        decomposed = unicodedata.normalize('NFD', text)
        text = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
        text = text.lower()
        text = re.sub("[\'\"()]+", "", text)
        text = re.sub("[-]+", " ", text)
        text = re.sub(f"[^ a-z{PUNCS}]", "", text)
        text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text)  # drop spaces around punctuation
        text = re.sub(f"([{PUNCS}])+", r"\1", text)  # !! -> !
        text = text.replace("i.e.", "that is")
        text = text.replace("etc.", "etc")
        text = re.sub(f"([{PUNCS}])", r" \1 ", text)
        text = re.sub(rf"\s+", r" ", text)
        return text

    @classmethod
    def process(cls, txt, preprocess_args):
        """Return ``(txt_struct, txt)`` for the raw text ``txt``.

        ``txt`` is the normalized text; ``txt_struct`` holds one
        ``[word, [phoneme, ...]]`` entry per space-separated word, passed
        through ``postprocess``.
        """
        txt = cls.preprocess_text(txt).strip()
        phonemes = cls.g2p(txt)
        txt_struct = [[word, []] for word in txt.split(" ")]
        word_idx = 0
        for p in phonemes:
            if p == ' ':
                # A space item marks the boundary to the next word.
                word_idx += 1
                continue
            txt_struct[word_idx][1].append(p)
        txt_struct = cls.postprocess(txt_struct, preprocess_args)
        return txt_struct, txt
data_gen/tts/wav_processors/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from . import base_processor
2
+ from . import common_processors
data_gen/tts/wav_processors/base_processor.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Maps a processor name to the wav-processor class registered under it.
REGISTERED_WAV_PROCESSORS = {}


def register_wav_processors(name):
    """Return a class decorator that registers the decorated class as ``name``."""

    def _register(processor_cls):
        REGISTERED_WAV_PROCESSORS[name] = processor_cls
        return processor_cls

    return _register


def get_wav_processor_cls(name):
    """Return the class registered under ``name``, or None if there is none."""
    return REGISTERED_WAV_PROCESSORS.get(name)
14
+
15
+
16
class BaseWavProcessor:
    """Base class of the wav processors; subclasses provide ``name`` and ``process``."""

    @property
    def name(self):
        """Short tag that is appended to the output file names."""
        raise NotImplementedError

    def output_fn(self, input_fn):
        """Return ``input_fn`` minus its last four characters plus ``_<name>.wav``."""
        stem = input_fn[:-4]  # cuts a four-character suffix such as ".wav"
        return f'{stem}_{self.name}.wav'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Process ``input_fn``; must be implemented by subclasses."""
        raise NotImplementedError
data_gen/tts/wav_processors/common_processors.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import librosa
4
+ import numpy as np
5
+ from data_gen.tts.wav_processors.base_processor import BaseWavProcessor, register_wav_processors
6
+ from data_gen.tts.data_gen_utils import trim_long_silences
7
+ from utils.audio import save_wav
8
+ from utils.rnnoise import rnnoise
9
+ from utils.hparams import hparams
10
+
11
+
12
@register_wav_processors(name='sox_to_wav')
class ConvertToWavProcessor(BaseWavProcessor):
    """Convert a non-wav input file to wav with ``sox``."""

    @property
    def name(self):
        return 'ToWav'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Return ``(wav_path, sr)``; files already ending in ``.wav`` pass through.

        :raises subprocess.CalledProcessError: when ``sox`` fails
        """
        if input_fn[-4:] == '.wav':
            return input_fn, sr
        output_fn = self.output_fn(input_fn)
        # Argument list instead of a shell string: file names containing quotes
        # or shell metacharacters are passed to sox unmodified.
        subprocess.check_call(['sox', '-v', '0.95', input_fn, '-t', 'wav', output_fn])
        return output_fn, sr
25
+
26
+
27
@register_wav_processors(name='sox_resample')
class ResampleProcessor(BaseWavProcessor):
    """Resample a file to ``sr`` when its own sample rate differs."""

    @property
    def name(self):
        return 'Resample'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Return ``(path, sr)``; the input passes through when it already has rate ``sr``.

        :raises subprocess.CalledProcessError: when ``sox`` fails
        """
        output_fn = self.output_fn(input_fn)
        sr_file = librosa.core.get_samplerate(input_fn)
        if sr == sr_file:
            return input_fn, sr
        # Argument list instead of a shell string: file names containing quotes
        # or shell metacharacters are passed to sox unmodified.
        subprocess.check_call(['sox', '-v', '0.95', input_fn, f'-r{sr}', output_fn])
        # NOTE(review): the sox output is overwritten right away by the
        # librosa-resampled, silence-trimmed version of the *input* file --
        # kept as in the original; confirm this is intended.
        y, _ = librosa.core.load(input_fn, sr=sr)
        y, _ = librosa.effects.trim(y)
        save_wav(y, output_fn, sr)
        return output_fn, sr
44
+
45
+
46
@register_wav_processors(name='trim_sil')
class TrimSILProcessor(BaseWavProcessor):
    """Trim leading and trailing silence with ``librosa.effects.trim``."""

    @property
    def name(self):
        return 'TrimSIL'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Write the trimmed audio next to the input and return ``(output_fn, sr)``."""
        output_fn = self.output_fn(input_fn)
        y, _ = librosa.core.load(input_fn, sr=sr)
        y, _ = librosa.effects.trim(y)
        save_wav(y, output_fn, sr)
        # Same (path, sample rate) pair as every other processor; a bare path
        # string cannot be unpacked by the caller.
        return output_fn, sr
58
+
59
+
60
@register_wav_processors(name='trim_all_sil')
class TrimAllSILProcessor(BaseWavProcessor):
    """Remove long silences anywhere in the file via ``trim_long_silences``."""

    @property
    def name(self):
        return 'TrimSIL'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Write the trimmed audio and return ``(output_fn, sr)``.

        When ``preprocess_args['save_sil_mask']`` is set, the mask returned by
        ``trim_long_silences`` is saved as ``<processed_dir>/sil_mask/<item_name>.npy``.
        """
        output_fn = self.output_fn(input_fn)
        max_silence = preprocess_args.get('vad_max_silence_length', 12)
        y, audio_mask, _ = trim_long_silences(input_fn, vad_max_silence_length=max_silence)
        save_wav(y, output_fn, sr)
        if preprocess_args['save_sil_mask']:
            mask_dir = f'{processed_dir}/sil_mask'
            os.makedirs(mask_dir, exist_ok=True)
            np.save(f'{processed_dir}/sil_mask/{item_name}.npy', audio_mask)
        return output_fn, sr
75
+
76
+
77
@register_wav_processors(name='denoise')
class DenoiseProcessor(BaseWavProcessor):
    """Denoise a file by passing it to the project's ``rnnoise`` helper."""

    @property
    def name(self):
        return 'Denoise'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Write the denoised audio next to the input and return ``(output_fn, sr)``."""
        denoised_fn = self.output_fn(input_fn)
        rnnoise(input_fn, denoised_fn, out_sample_rate=sr)
        return denoised_fn, sr
egs/datasets/audio/libritts/base_text2mel.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ raw_data_dir: 'data/raw/LibriTTS'
2
+ processed_data_dir: 'data/processed/libritts'
3
+ binary_data_dir: 'data/binary/libritts'
4
+ pre_align_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign
5
+ binarization_args:
6
+ shuffle: true
7
+ use_spk_id: true
8
+ test_num: 200
9
+ num_spk: 2320
10
+ pitch_type: frame
11
+ min_frames: 128
12
+ num_test_samples: 30
13
+ mel_loss: "ssim:0.5|l1:0.5"
14
+ vocoder_ckpt: ''
egs/datasets/audio/libritts/fs2.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ base_config:
2
+ - egs/egs_bases/tts/fs2.yaml
3
+ - ./base_text2mel.yaml
egs/datasets/audio/libritts/pre_align.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from data_gen.tts.base_pre_align import BasePreAlign
4
+ import glob
5
+
6
+
7
class LibrittsPreAlign(BasePreAlign):
    """Pre-alignment metadata provider for wavs under ``raw_data_dir/*/*/*``."""

    def meta_data(self):
        """Yield ``(item_name, wav_fn, (self.load_txt, txt_fn), spk)`` per wav.

        The transcript path is the wav path with ``.wav`` replaced by
        ``.normalized.txt``; the speaker is the item-name prefix before the
        first underscore.
        """
        pattern = f'{self.raw_data_dir}/*/*/*/*.wav'
        for wav_path in sorted(glob.glob(pattern)):
            stem = wav_path[:-4]  # path without the ".wav" suffix
            name = os.path.basename(wav_path)[:-4]
            speaker = name.partition("_")[0]
            yield name, wav_path, (self.load_txt, f'{stem}.normalized.txt'), speaker


if __name__ == "__main__":
    LibrittsPreAlign().process()
egs/datasets/audio/libritts/pwg.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ base_config: egs/egs_bases/tts/vocoder/pwg.yaml
2
+ raw_data_dir: 'data/raw/LibriTTS'
3
+ processed_data_dir: 'data/processed/libritts'
4
+ binary_data_dir: 'data/binary/libritts_wav'
5
+ generator_params:
6
+ kernel_size: 5
7
+ num_spk: 400
8
+ max_samples: 20480
egs/datasets/audio/lj/base_mel2wav.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ raw_data_dir: 'data/raw/LJSpeech-1.1'
2
+ processed_data_dir: 'data/processed/ljspeech'
3
+ binary_data_dir: 'data/binary/ljspeech_wav'
4
+ binarization_args:
5
+ with_spk_embed: false
egs/datasets/audio/lj/pre_align.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from data_gen.tts.base_preprocess import BasePreprocessor
2
+
3
+
4
class LJPreAlign(BasePreprocessor):
    """Metadata provider that reads ``raw_data_dir/metadata.csv``."""

    def meta_data(self):
        """Yield ``(item_name, wav_fn, txt, 'SPK1')`` for each metadata line.

        Each line is split on ``|`` into exactly three fields; the first is
        the item name and the third is the text (the middle one is ignored).

        :raises ValueError: if a line does not have exactly three fields.
        """
        # Read inside a context manager so the file handle is closed instead
        # of being left to the garbage collector.
        with open(f'{self.raw_data_dir}/metadata.csv') as metadata_file:
            lines = metadata_file.readlines()
        for line in lines:
            item_name, _, txt = line.strip().split("|")
            wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav"
            yield item_name, wav_fn, txt, 'SPK1'


if __name__ == "__main__":
    LJPreAlign().process()
egs/datasets/audio/lj/pwg.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ base_config:
2
+ - egs/egs_bases/tts/vocoder/pwg.yaml
3
+ - ./base_mel2wav.yaml
egs/datasets/audio/vctk/base_mel2wav.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ raw_data_dir: 'data/raw/VCTK-Corpus'
2
+ processed_data_dir: 'data/processed/vctk'
3
+ binary_data_dir: 'data/binary/vctk_wav'
egs/datasets/audio/vctk/fs2.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ base_config:
2
+ - egs/egs_bases/tts/fs2.yaml
3
+ raw_data_dir: 'data/raw/VCTK-Corpus'
4
+ processed_data_dir: 'data/processed/vctk'
5
+ binary_data_dir: 'data/binary/vctk'
6
+ pre_align_cls: egs.datasets.audio.vctk.pre_align.VCTKPreAlign
7
+ use_spk_id: true
8
+ test_num: 200
9
+ num_spk: 400
10
+ binarization_args:
11
+ shuffle: true
12
+ trim_eos_bos: true
egs/datasets/audio/vctk/pre_align.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from data_gen.tts.base_pre_align import BasePreAlign
4
+ import glob
5
+
6
+
7
class VCTKPreAlign(BasePreAlign):
    """Pre-alignment metadata provider for wavs under ``raw_data_dir/wav48``."""

    def meta_data(self):
        """Yield ``(item_name, wav_fn, (self.load_txt, txt_fn), spk)`` per wav.

        The transcript path is derived from the wav path by replacing the
        third-from-last path component with ``txt`` and the file name with
        ``<item_name>.txt``. Items whose transcript or wav is missing are
        skipped.
        """
        for wav_path in glob.glob(f'{self.raw_data_dir}/wav48/*/*.wav'):
            name = os.path.basename(wav_path)[:-4]
            speaker = name.partition("_")[0]
            parts = wav_path.split("/")
            parts[-3] = 'txt'
            parts[-1] = f'{name}.txt'
            txt_path = "/".join(parts)
            if not (os.path.exists(txt_path) and os.path.exists(wav_path)):
                continue
            yield name, wav_path, (self.load_txt, txt_path), speaker


if __name__ == "__main__":
    VCTKPreAlign().process()
egs/datasets/audio/vctk/pwg.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ base_config:
2
+ - egs/egs_bases/tts/vocoder/pwg.yaml
3
+ - ./base_mel2wav.yaml
4
+
5
+ num_spk: 400
6
+ max_samples: 20480
egs/egs_bases/config_base.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # task
2
+ binary_data_dir: ''
3
+ work_dir: '' # experiment directory.
4
+ infer: false # inference
5
+ amp: false
6
+ seed: 1234
7
+ debug: false
8
+ save_codes: []
9
+ # - configs
10
+ # - modules
11
+ # - tasks
12
+ # - utils
13
+ # - usr
14
+
15
+ #############
16
+ # dataset
17
+ #############
18
+ ds_workers: 1
19
+ test_num: 100
20
+ endless_ds: false
21
+ sort_by_len: true
22
+
23
+ #########
24
+ # train and eval
25
+ #########
26
+ print_nan_grads: false
27
+ load_ckpt: ''
28
+ save_best: true
29
+ num_ckpt_keep: 3
30
+ clip_grad_norm: 0
31
+ accumulate_grad_batches: 1
32
+ tb_log_interval: 100
33
+ num_sanity_val_steps: 5 # steps of validation at the beginning
34
+ check_val_every_n_epoch: 10
35
+ val_check_interval: 2000
36
+ valid_monitor_key: 'val_loss'
37
+ valid_monitor_mode: 'min'
38
+ max_epochs: 1000
39
+ max_updates: 1000000
40
+ max_tokens: 31250
41
+ max_sentences: 100000
42
+ max_valid_tokens: -1
43
+ max_valid_sentences: -1
44
+ test_input_dir: ''
45
+ resume_from_checkpoint: 0
46
+ rename_tmux: true
egs/egs_bases/tts/base.yaml ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # task
2
+ base_config: ../config_base.yaml
3
+ task_cls: ''
4
+ #############
5
+ # dataset
6
+ #############
7
+ raw_data_dir: ''
8
+ processed_data_dir: ''
9
+ binary_data_dir: ''
10
+ dict_dir: ''
11
+ pre_align_cls: ''
12
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
13
+ pre_align_args:
14
+ txt_processor: en
15
+ use_tone: true # for ZH
16
+ sox_resample: false
17
+ sox_to_wav: false
18
+ allow_no_txt: false
19
+ trim_sil: false
20
+ denoise: false
21
+ binarization_args:
22
+ shuffle: false
23
+ with_txt: true
24
+ with_wav: false
25
+ with_align: true
26
+ with_spk_embed: false
27
+ with_spk_id: true
28
+ with_f0: true
29
+ with_f0cwt: false
30
+ with_linear: false
31
+ with_word: true
32
+ trim_sil: false
33
+ trim_eos_bos: false
34
+ reset_phone_dict: true
35
+ reset_word_dict: true
36
+ word_size: 30000
37
+ pitch_extractor: parselmouth
38
+
39
+ loud_norm: false
40
+ endless_ds: true
41
+
42
+ test_num: 100
43
+ min_frames: 0
44
+ max_frames: 1548
45
+ frames_multiple: 1
46
+ max_input_tokens: 1550
47
+ audio_num_mel_bins: 80
48
+ audio_sample_rate: 22050
49
+ hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate)
50
+ win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate)
51
+ fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
52
+ fmax: 7600 # To be increased/reduced depending on data.
53
+ fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter
54
+ min_level_db: -100
55
+ ref_level_db: 20
56
+ griffin_lim_iters: 60
57
+ num_spk: 1
58
+ mel_vmin: -6
59
+ mel_vmax: 1.5
60
+ ds_workers: 1
61
+
62
+ #########
63
+ # model
64
+ #########
65
+ dropout: 0.1
66
+ enc_layers: 4
67
+ dec_layers: 4
68
+ hidden_size: 256
69
+ num_heads: 2
70
+ enc_ffn_kernel_size: 9
71
+ dec_ffn_kernel_size: 9
72
+ ffn_act: gelu
73
+ ffn_padding: 'SAME'
74
+ use_spk_id: false
75
+ use_split_spk_id: false
76
+ use_spk_embed: false
77
+
78
+
79
+ ###########
80
+ # optimization
81
+ ###########
82
+ lr: 2.0
83
+ scheduler: rsqrt # rsqrt|none
84
+ warmup_updates: 8000
85
+ optimizer_adam_beta1: 0.9
86
+ optimizer_adam_beta2: 0.98
87
+ weight_decay: 0
88
+ clip_grad_norm: 1
89
+ clip_grad_value: 0
90
+
91
+
92
+ ###########
93
+ # train and eval
94
+ ###########
95
+ max_tokens: 30000
96
+ max_sentences: 100000
97
+ max_valid_sentences: 1
98
+ max_valid_tokens: 60000
99
+ valid_infer_interval: 10000
100
+ train_set_name: 'train'
101
+ train_sets: ''
102
+ valid_set_name: 'valid'
103
+ test_set_name: 'test'
104
+ num_test_samples: 0
105
+ num_valid_plots: 10
106
+ test_ids: [ ]
107
+ vocoder_denoise_c: 0.0
108
+ profile_infer: false
109
+ out_wav_norm: false
110
+ save_gt: true
111
+ save_f0: false
112
+ gen_dir_name: ''
egs/egs_bases/tts/fs2.yaml ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ base_config: ./base.yaml
2
+ task_cls: tasks.tts.fs2.FastSpeech2Task
3
+
4
+ # model
5
+ hidden_size: 256
6
+ dropout: 0.1
7
+ encoder_type: fft # rel_fft|fft|tacotron|tacotron2|conformer
8
+ decoder_type: fft # fft|rnn|conv|conformer|wn
9
+
10
+ # rnn enc/dec
11
+ encoder_K: 8
12
+ decoder_rnn_dim: 0 # for rnn decoder, 0 -> hidden_size * 2
13
+
14
+ # fft enc/dec
15
+ use_pos_embed: true
16
+ dec_num_heads: 2
17
+ dec_layers: 4
18
+ ffn_hidden_size: 1024
19
+ enc_ffn_kernel_size: 9
20
+ dec_ffn_kernel_size: 9
21
+
22
+ # conv enc/dec
23
+ enc_dec_norm: ln
24
+ conv_use_pos: false
25
+ layers_in_block: 2
26
+ enc_dilations: [ 1, 1, 1, 1 ]
27
+ enc_kernel_size: 5
28
+ dec_dilations: [ 1, 1, 1, 1 ] # for conv decoder
29
+ dec_kernel_size: 5
30
+ dur_loss: mse # huber|mol
31
+
32
+ # duration
33
+ predictor_hidden: -1
34
+ predictor_kernel: 5
35
+ predictor_layers: 2
36
+ dur_predictor_kernel: 3
37
+ dur_predictor_layers: 2
38
+ predictor_dropout: 0.5
39
+
40
+ # pitch and energy
41
+ pitch_norm: standard # standard|log
42
+ use_pitch_embed: true
43
+ pitch_type: frame # frame|ph|cwt
44
+ use_uv: true
45
+ cwt_hidden_size: 128
46
+ cwt_layers: 2
47
+ cwt_loss: l1
48
+ cwt_add_f0_loss: false
49
+ cwt_std_scale: 0.8
50
+
51
+ pitch_ar: false
52
+ pitch_embed_type: 0
53
+ pitch_loss: 'l1' # l1|l2|ssim
54
+ pitch_ssim_win: 11
55
+ use_energy_embed: false
56
+
57
+ # reference encoder and speaker embedding
58
+ use_ref_enc: false
59
+ use_var_enc: false
60
+ lambda_commit: 0.25
61
+ var_enc_vq_codes: 64
62
+ ref_norm_layer: bn
63
+ dec_inp_add_noise: false
64
+ sil_add_noise: false
65
+ ref_hidden_stride_kernel:
66
+ - 0,3,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
67
+ - 0,3,5
68
+ - 0,2,5
69
+ - 0,2,5
70
+ - 0,2,5
71
+ pitch_enc_hidden_stride_kernel:
72
+ - 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
73
+ - 0,2,5
74
+ - 0,2,5
75
+ dur_enc_hidden_stride_kernel:
76
+ - 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
77
+ - 0,2,3
78
+ - 0,1,3
79
+
80
+ # mel
81
+ mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5
82
+
83
+ # loss lambda
84
+ lambda_f0: 1.0
85
+ lambda_uv: 1.0
86
+ lambda_energy: 0.1
87
+ lambda_ph_dur: 0.1
88
+ lambda_sent_dur: 1.0
89
+ lambda_word_dur: 1.0
90
+ predictor_grad: 0.1
91
+
92
+ # train and eval
93
+ pretrain_fs_ckpt: ''
94
+ warmup_updates: 2000
95
+ max_tokens: 32000
96
+ max_sentences: 100000
97
+ max_valid_sentences: 1
98
+ max_updates: 120000
99
+ use_gt_dur: false
100
+ use_gt_f0: false
101
+ ds_workers: 2
102
+ lr: 1.0
egs/egs_bases/tts/vocoder/base.yaml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ base_config: ../base.yaml
2
+ binarization_args:
3
+ with_wav: true
4
+ with_spk_embed: false
5
+ with_align: false
6
+ with_word: false
7
+ with_txt: false
8
+
9
+ ###########
10
+ # train and eval
11
+ ###########
12
+ max_samples: 25600
13
+ max_sentences: 5
14
+ max_valid_sentences: 1
15
+ max_updates: 1000000
16
+ val_check_interval: 2000
17
+
18
+ ###########################################################
19
+ # FEATURE EXTRACTION SETTING #
20
+ ###########################################################
21
+ fft_size: 1024 # FFT size.
22
+ hop_size: 256 # Hop size.
23
+ win_length: null # Window length.
24
+ # If set to null, it will be the same as fft_size.
25
+ window: "hann" # Window function.
26
+ num_mels: 80 # Number of mel basis.
27
+ fmin: 80 # Minimum freq in mel basis calculation.
28
+ fmax: 7600 # Maximum frequency in mel basis calculation.
29
+ aux_context_window: 0 # Context window size for auxiliary feature.
30
+ use_pitch_embed: false
31
+
32
+ generator_grad_norm: 10 # Generator's gradient norm.
33
+ discriminator_grad_norm: 1 # Discriminator's gradient norm.
34
+ disc_start_steps: 40000 # Number of steps to start to train discriminator.
egs/egs_bases/tts/vocoder/pwg.yaml ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ base_config: ./base.yaml
2
+ task_cls: tasks.vocoder.pwg.PwgTask
3
+
4
+ aux_context_window: 2 # Context window size for auxiliary feature.
5
+ use_pitch_embed: false
6
+ ###########################################################
7
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
8
+ ###########################################################
9
+ generator_params:
10
+ in_channels: 1 # Number of input channels.
11
+ out_channels: 1 # Number of output channels.
12
+ kernel_size: 3 # Kernel size of dilated convolution.
13
+ layers: 30 # Number of residual block layers.
14
+ stacks: 3 # Number of stacks i.e., dilation cycles.
15
+ residual_channels: 64 # Number of channels in residual conv.
16
+ gate_channels: 128 # Number of channels in gated conv.
17
+ skip_channels: 64 # Number of channels in skip conv.
18
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
19
+ # Must be the same as num_mels.
20
+ # If set to 2, previous 2 and future 2 frames will be considered.
21
+ dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
22
+ use_weight_norm: true # Whether to use weight norm.
23
+ # If set to true, it will be applied to all of the conv layers.
24
+ upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
25
+ upsample_params: # Upsampling network parameters.
26
+ upsample_scales: [4, 4, 4, 4] # Upsampling scales. Product of these must be the same as hop size.
27
+ use_pitch_embed: false
28
+ use_nsf: false
29
+ ###########################################################
30
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
31
+ ###########################################################
32
+ discriminator_params:
33
+ in_channels: 1 # Number of input channels.
34
+ out_channels: 1 # Number of output channels.
35
+ kernel_size: 3 # Kernel size of conv layers.
36
+ layers: 10 # Number of conv layers.
37
+ conv_channels: 64 # Number of channels in conv layers.
38
+ bias: true # Whether to use bias parameter in conv.
39
+ use_weight_norm: true # Whether to use weight norm.
40
+ # If set to true, it will be applied to all of the conv layers.
41
+ nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
42
+ nonlinear_activation_params: # Nonlinear function parameters
43
+ negative_slope: 0.2 # Alpha in LeakyReLU.
44
+ rerun_gen: true
45
+
46
+ ###########################################################
47
+ # STFT LOSS SETTING #
48
+ ###########################################################
49
+ stft_loss_params:
50
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
51
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
52
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
53
+ window: "hann_window" # Window function for STFT-based loss
54
+ use_mel_loss: false
55
+
56
+ ###########################################################
57
+ # ADVERSARIAL LOSS SETTING #
58
+ ###########################################################
59
+ lambda_adv: 4.0 # Loss balancing coefficient.
60
+
61
+ ###########################################################
62
+ # OPTIMIZER & SCHEDULER SETTING #
63
+ ###########################################################
64
+ generator_optimizer_params:
65
+ lr: 0.0001 # Generator's learning rate.
66
+ eps: 1.0e-6 # Generator's epsilon.
67
+ weight_decay: 0.0 # Generator's weight decay coefficient.
68
+ generator_scheduler_params:
69
+ step_size: 200000 # Generator's scheduler step size.
70
+ gamma: 0.5 # Generator's scheduler gamma.
71
+ # At each step size, lr will be multiplied by this parameter.
72
+ generator_grad_norm: 10 # Generator's gradient norm.
73
+ discriminator_optimizer_params:
74
+ lr: 0.00005 # Discriminator's learning rate.
75
+ eps: 1.0e-6 # Discriminator's epsilon.
76
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
77
+ discriminator_scheduler_params:
78
+ step_size: 200000 # Discriminator's scheduler step size.
79
+ gamma: 0.5 # Discriminator's scheduler gamma.
80
+ # At each step size, lr will be multiplied by this parameter.
81
+ discriminator_grad_norm: 1 # Discriminator's gradient norm.
82
+ disc_start_steps: 40000 # Number of steps to start to train discriminator.
inference/ProDiff.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from inference.base_tts_infer import BaseTTSInfer
3
+ from utils.ckpt_utils import load_ckpt, get_last_checkpoint
4
+ from utils.hparams import hparams
5
+ from modules.ProDiff.model.ProDiff import GaussianDiffusion
6
+ from usr.diff.net import DiffNet
7
+ import os
8
+ import numpy as np
9
+ from functools import partial
10
+
11
class ProDiffInfer(BaseTTSInfer):
    """Inference wrapper for the ProDiff ``GaussianDiffusion`` model."""

    def build_model(self):
        """Build the diffusion model and load its weights from ``work_dir``.

        The ``timesteps`` / ``timescale`` buffers are set from the teacher
        checkpoint at ``hparams['teacher_ckpt']``: half the teacher's
        timesteps and twice its timescale.
        """
        # Optional F0 statistics saved next to the binarized data.
        stats_path = f'{hparams["binary_data_dir"]}/train_f0s_mean_std.npy'
        if os.path.exists(stats_path):
            f0_mean, f0_std = np.load(stats_path)
            hparams['f0_mean'] = float(f0_mean)
            hparams['f0_std'] = float(f0_std)

        denoiser = DiffNet(hparams['audio_num_mel_bins'])
        model = GaussianDiffusion(
            phone_encoder=self.ph_encoder,
            out_dims=80,
            denoise_fn=denoiser,
            timesteps=hparams['timesteps'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'],
            spec_max=hparams['spec_max'],
        )

        # Only the teacher's step counters are read from its checkpoint.
        teacher_state = torch.load(hparams['teacher_ckpt'], map_location='cpu')["state_dict"]['model']
        student_timesteps = int(teacher_state['timesteps'].item()) // 2
        student_timescales = int(teacher_state['timescale'].item()) * 2
        model.register_buffer('timesteps', torch.tensor(student_timesteps, dtype=torch.float32))
        model.register_buffer('timescale', torch.tensor(student_timescales, dtype=torch.float32))

        model.eval()
        load_ckpt(model, hparams['work_dir'], 'model')
        return model

    def forward_model(self, inp):
        """Run text -> mel -> waveform and return the waveform as a numpy array."""
        batch = self.input_to_batch(inp)
        with torch.no_grad():
            # batch['txt_tokens'] is built with a leading batch axis of size 1.
            model_out = self.model(batch['txt_tokens'], infer=True)
        wav = self.run_vocoder(model_out['mel_out'])
        return wav.squeeze().cpu().numpy()


if __name__ == '__main__':
    ProDiffInfer.example_run()
inference/ProDiff_Teacher.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from inference.base_tts_infer import BaseTTSInfer
3
+ from utils.ckpt_utils import load_ckpt, get_last_checkpoint
4
+ from utils.hparams import hparams
5
+ from modules.ProDiff.model.ProDiff_teacher import GaussianDiffusion
6
+ from usr.diff.net import DiffNet
7
+ import os
8
+ import numpy as np
9
+
10
class ProDiffTeacherInfer(BaseTTSInfer):
    """Inference wrapper for the ProDiff teacher ``GaussianDiffusion`` model."""

    def build_model(self):
        """Build the teacher diffusion model and load weights from ``work_dir``."""
        # Optional F0 statistics saved next to the binarized data.
        stats_path = f'{hparams["binary_data_dir"]}/train_f0s_mean_std.npy'
        if os.path.exists(stats_path):
            f0_mean, f0_std = np.load(stats_path)
            hparams['f0_mean'] = float(f0_mean)
            hparams['f0_std'] = float(f0_std)

        denoiser = DiffNet(hparams['audio_num_mel_bins'])
        model = GaussianDiffusion(
            phone_encoder=self.ph_encoder,
            out_dims=80,
            denoise_fn=denoiser,
            timesteps=hparams['timesteps'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'],
            spec_max=hparams['spec_max'],
        )

        model.eval()
        load_ckpt(model, hparams['work_dir'], 'model')
        return model

    def forward_model(self, inp):
        """Run text -> mel -> waveform and return the waveform as a numpy array."""
        batch = self.input_to_batch(inp)
        with torch.no_grad():
            # batch['txt_tokens'] is built with a leading batch axis of size 1.
            model_out = self.model(batch['txt_tokens'], infer=True)
        wav = self.run_vocoder(model_out['mel_out'])
        return wav.squeeze().cpu().numpy()


if __name__ == '__main__':
    ProDiffTeacherInfer.example_run()
inference/base_tts_infer.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+
5
+ from tasks.tts.dataset_utils import FastSpeechWordDataset
6
+ from tasks.tts.tts_utils import load_data_preprocessor
7
+ import numpy as np
8
+ from modules.FastDiff.module.util import compute_hyperparams_given_schedule, sampling_given_noise_schedule
9
+
10
+ import os
11
+
12
+ import torch
13
+
14
+ from modules.FastDiff.module.FastDiff_model import FastDiff
15
+ from utils.ckpt_utils import load_ckpt
16
+ from utils.hparams import set_hparams
17
+
18
+
19
class BaseTTSInfer:
    """Base class for text-to-speech inference with a FastDiff vocoder.

    Subclasses implement ``build_model`` (acoustic model construction) and
    ``forward_model`` (text batch -> waveform). This class loads the text
    preprocessor, phone dictionary and speaker map, builds the vocoder, and
    provides input preparation plus the vocoder sampling call.
    """

    # Below are schedules derived by Noise Predictor, keyed by the number of
    # reverse iterations ``N``.
    _PREDICTED_NOISE_SCHEDULES = {
        8: [6.689325005027058e-07, 1.0033881153503899e-05, 0.00015496854030061513,
            0.002387222135439515, 0.035597629845142365, 0.3681158423423767, 0.4735414385795593,
            0.5],
        6: [1.7838445955931093e-06, 2.7984189728158526e-05, 0.00043231004383414984,
            0.006634317338466644, 0.09357017278671265, 0.6000000238418579],
        4: [3.2176e-04, 2.5743e-03, 2.5376e-02, 7.0414e-01],
        3: [9.0000e-05, 9.0000e-03, 6.0000e-01],
    }

    def __init__(self, hparams, device=None):
        """Load dictionaries, the acoustic model and the vocoder.

        :param hparams: hyper-parameter dict; reads ``binary_data_dir`` here
            and ``vocoder_ckpt``, ``N``, ``hop_size`` in other methods.
        :param device: torch device string; defaults to ``'cuda'`` when
            available, else ``'cpu'``.
        """
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.hparams = hparams
        self.device = device
        self.data_dir = hparams['binary_data_dir']
        self.preprocessor, self.preprocess_args = load_data_preprocessor()
        self.ph_encoder = self.preprocessor.load_dict(self.data_dir)
        self.spk_map = self.preprocessor.load_spk_map(self.data_dir)
        self.ds_cls = FastSpeechWordDataset
        self.model = self.build_model()
        self.model.eval()
        self.model.to(self.device)
        self.vocoder, self.diffusion_hyperparams, self.noise_schedule = self.build_vocoder()
        self.vocoder.eval()
        self.vocoder.to(self.device)

    def build_model(self):
        """Construct and return the acoustic model (implemented by subclasses)."""
        raise NotImplementedError

    def forward_model(self, inp):
        """Run the model on a preprocessed item (implemented by subclasses)."""
        raise NotImplementedError

    def build_vocoder(self):
        """Build the FastDiff vocoder and pick the sampling noise schedule.

        :return: ``(vocoder, diffusion_hyperparams, noise_schedule)``.
            ``diffusion_hyperparams`` is always computed from the linear
            schedule given by the vocoder config (``beta_0``, ``beta_T``,
            ``T``); ``noise_schedule`` comes from the config's
            ``noise_schedule`` entry when it is non-empty, otherwise it is
            selected from ``hparams['N']``.
        :raises NotImplementedError: if ``N`` has no schedule defined.
        """
        base_dir = self.hparams['vocoder_ckpt']
        config_path = f'{base_dir}/config.yaml'
        config = set_hparams(config_path, global_hparams=False)
        vocoder = FastDiff(audio_channels=config['audio_channels'],
                           inner_channels=config['inner_channels'],
                           cond_channels=config['cond_channels'],
                           upsample_ratios=config['upsample_ratios'],
                           lvc_layers_each_block=config['lvc_layers_each_block'],
                           lvc_kernel_size=config['lvc_kernel_size'],
                           kpnet_hidden_channels=config['kpnet_hidden_channels'],
                           kpnet_conv_size=config['kpnet_conv_size'],
                           dropout=config['dropout'],
                           diffusion_step_embed_dim_in=config['diffusion_step_embed_dim_in'],
                           diffusion_step_embed_dim_mid=config['diffusion_step_embed_dim_mid'],
                           diffusion_step_embed_dim_out=config['diffusion_step_embed_dim_out'],
                           use_weight_norm=config['use_weight_norm'])
        load_ckpt(vocoder, base_dir, 'model')

        # Init hyperparameters by linear schedule
        noise_schedule = torch.linspace(float(config["beta_0"]), float(config["beta_T"]), int(config["T"]))
        diffusion_hyperparams = compute_hyperparams_given_schedule(noise_schedule)

        if config['noise_schedule'] != '':
            noise_schedule = config['noise_schedule']
        else:
            noise_schedule = self._select_noise_schedule()

        if isinstance(noise_schedule, list):
            noise_schedule = torch.FloatTensor(noise_schedule)

        return vocoder, diffusion_hyperparams, noise_schedule

    def _select_noise_schedule(self):
        """Return the noise schedule for ``hparams['N']`` reverse iterations."""
        try:
            reverse_step = int(self.hparams.get('N'))
        except (TypeError, ValueError):
            # N missing (None) or not parseable as an integer: fall back to 4.
            print(
                'Please specify $N (the number of revere iterations) in config file. Now denoise with 4 iterations.')
            reverse_step = 4
        if reverse_step == 1000:
            return torch.linspace(0.000001, 0.01, 1000)
        if reverse_step == 200:
            return torch.linspace(0.0001, 0.02, 200)
        if reverse_step in self._PREDICTED_NOISE_SCHEDULES:
            # Copy so callers can never mutate the class-level table.
            return list(self._PREDICTED_NOISE_SCHEDULES[reverse_step])
        raise NotImplementedError(f'No noise schedule defined for N={reverse_step}')

    def run_vocoder(self, c):
        """Sample a waveform from the vocoder conditioned on ``c``.

        ``c`` has its axes 1 and 2 swapped before use; the requested audio
        length is the last axis of the swapped tensor times ``hop_size``.
        """
        c = c.transpose(2, 1)
        audio_length = c.shape[-1] * self.hparams["hop_size"]
        y = sampling_given_noise_schedule(
            self.vocoder, (1, 1, audio_length), self.diffusion_hyperparams, self.noise_schedule, condition=c, ddim=False, return_sequence=False)
        return y

    def preprocess_input(self, inp):
        """Convert raw text input into a model item.

        :param inp: {'text': str, 'item_name': (str, optional), 'spk_name': (str, optional)}
        :return: dict with ``item_name``, ``text``, ``ph``, ``spk_id``,
            ``ph_token`` and ``ph_len``.
        :raises KeyError: if ``spk_name`` is not in the speaker map.
        """
        preprocessor, preprocess_args = self.preprocessor, self.preprocess_args
        text_raw = inp['text']
        item_name = inp.get('item_name', '<ITEM_NAME>')
        spk_name = inp.get('spk_name', 'SPK1')
        ph, txt = preprocessor.txt_to_ph(
            preprocessor.txt_processor, text_raw, preprocess_args)
        ph_token = self.ph_encoder.encode(ph)
        spk_id = self.spk_map[spk_name]
        item = {'item_name': item_name, 'text': txt, 'ph': ph, 'spk_id': spk_id, 'ph_token': ph_token}
        item['ph_len'] = len(item['ph_token'])
        return item

    def input_to_batch(self, item):
        """Wrap a single preprocessed item into a batch of size one.

        :param item: dict as produced by ``preprocess_input``.
        :return: dict with list-valued ``item_name`` / ``text`` / ``ph`` and
            tensors ``txt_tokens`` (1, T), ``txt_lengths`` (1,) and
            ``spk_ids`` (1, K), all on ``self.device``.
        """
        item_names = [item['item_name']]
        text = [item['text']]
        ph = [item['ph']]
        txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device)
        txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
        # torch.LongTensor(<int>) allocates an *uninitialised* tensor of that
        # size rather than holding the value, so an integer speaker id used to
        # yield garbage (or an empty tensor for id 0). as_tensor keeps the
        # value; reshape keeps the 2-D layout for sequence-valued ids.
        spk_ids = torch.as_tensor(item['spk_id'], dtype=torch.long).reshape(1, -1).to(self.device)
        batch = {
            'item_name': item_names,
            'text': text,
            'ph': ph,
            'txt_tokens': txt_tokens,
            'txt_lengths': txt_lengths,
            'spk_ids': spk_ids,
        }
        return batch

    def postprocess_output(self, output):
        """Hook for subclasses; returns ``output`` unchanged."""
        return output

    def infer_once(self, inp):
        """Preprocess ``inp``, run the model and post-process the result."""
        inp = self.preprocess_input(inp)
        output = self.forward_model(inp)
        output = self.postprocess_output(output)
        return output

    @classmethod
    def example_run(cls):
        """Synthesize ``hparams['text']`` and save it under ``infer_out/``."""
        from utils.hparams import set_hparams
        from utils.hparams import hparams as hp
        from utils.audio import save_wav

        set_hparams()
        inp = {
            'text': hp['text']
        }
        infer_ins = cls(hp)
        out = infer_ins.infer_once(inp)
        os.makedirs('infer_out', exist_ok=True)
        save_wav(out, f'infer_out/{hp["text"]}.wav', hp['audio_sample_rate'])
inference/gradio/gradio_settings.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ title: 'Extremely-Fast diffusion text-to-speech synthesis pipeline with ProDiff and FastDiff'
2
+ description: |
3
+ Gradio demo for **2-iter** ProDiff and **4-iter** FastDiff. To use it, simply enter your text, or click one of the examples to load them. **This space is running on CPU, so inference will be slower.**
4
+
5
+ ## Key Features
6
+ - **Extremely-Fast** diffusion text-to-speech synthesis pipeline for potential **industrial deployment**.
7
+ - **Tutorial and code base** for speech diffusion models.
8
+ - More **supported diffusion mechanism** (e.g., guided diffusion) will be available.
9
+
10
+
11
+ article: |
12
+ ## Reference
13
+ Link to <a href='https://github.com/Rongjiehuang/ProDiff' style='color:blue;' target='_blank\'>ProDiff Github REPO</a>
14
+
15
+ If you find this code useful in your research, please cite our work:
16
+ ```
17
+ @inproceedings{huang2022prodiff,
18
+ title={ProDiff: Progressive Fast Diffusion Model For High-Quality Text-to-Speech},
19
+ author={Huang, Rongjie and Zhao, Zhou and Liu, Huadai and Liu, Jinglin and Cui, Chenye and Ren, Yi},
20
+ booktitle={Proceedings of the 30th ACM International Conference on Multimedia},
21
+ year={2022}
+ }
22
+
23
+ @inproceedings{huang2022fastdiff,
24
+ title={FastDiff: A Fast Conditional Diffusion Model for High-Quality Speech Synthesis},
25
+ author={Huang, Rongjie and Lam, Max WY and Wang, Jun and Su, Dan and Yu, Dong and Ren, Yi and Zhao, Zhou},
26
+ booktitle = {Proceedings of the Thirty-First International Joint Conference on Artificial Intelligence, {IJCAI-22}},
27
+ year={2022}
28
+ }
29
+ ```
30
+
31
+ ## Disclaimer
32
+ Any organization or individual is prohibited from using any technology mentioned in this paper to generate someone's speech without his/her consent, including but not limited to government leaders, political figures, and celebrities. If you do not comply with this item, you could be in violation of copyright laws.
33
+
34
+ example_inputs:
35
+ - |-
36
+ the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.
37
+ - |-
38
+ Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition.
39
+ inference_cls: inference.ProDiff.ProDiffInfer
40
+ exp_name: ProDiff
41
+ config: modules/ProDiff/config/prodiff.yaml
inference/gradio/infer.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+ import re
3
+
4
+ import gradio as gr
5
+ import yaml
6
+ from gradio.inputs import Textbox
7
+
8
+ from inference.base_tts_infer import BaseTTSInfer
9
+ from utils.hparams import set_hparams
10
+ from utils.hparams import hparams as hp
11
+ import numpy as np
12
+
13
+ from data_gen.tts.data_gen_utils import is_sil_phoneme, PUNCS
14
+
15
class GradioInfer:
    """Gradio front-end that synthesizes speech with a configurable inference class."""

    def __init__(self, exp_name, config, inference_cls, title, description, article, example_inputs):
        """Store the UI settings and resolve the inference class.

        :param exp_name: experiment name passed to ``set_hparams``.
        :param config: config path passed to ``set_hparams``.
        :param inference_cls: dotted path ``package.module.ClassName``.
        :param title: interface title.
        :param description: interface description.
        :param article: interface article text.
        :param example_inputs: list of example texts; the first one is the
            default content of the text box.
        """
        self.exp_name = exp_name
        self.config = config
        self.title = title
        self.description = description
        self.article = article
        self.example_inputs = example_inputs
        pkg = ".".join(inference_cls.split(".")[:-1])
        cls_name = inference_cls.split(".")[-1]
        self.inference_cls = getattr(importlib.import_module(pkg), cls_name)

    def greet(self, text):
        """Synthesize ``text`` chunk by chunk and return ``(sample_rate, int16 audio)``.

        Newlines are replaced by commas, the text is split on the characters
        in ``PUNCS`` and sentences are accumulated until a chunk reaches 400
        characters (or the text ends). Each chunk is followed by 0.3 s of
        silence. Text with nothing to synthesize returns an empty array.
        """
        sents = re.split(rf'([{PUNCS}])', text.replace('\n', ','))
        if sents[-1] not in list(PUNCS):
            sents = sents + ['.']
        audio_outs = []
        s = ""
        for i in range(0, len(sents), 2):
            if len(sents[i]) > 0:
                s += sents[i] + sents[i + 1]
            if len(s) >= 400 or (i >= len(sents) - 2 and len(s) > 0):
                audio_out = self.infer_ins.infer_once({
                    'text': s
                })
                # Clip before scaling: samples outside [-1, 1] would otherwise
                # wrap around when cast to int16.
                audio_out = np.clip(audio_out, -1.0, 1.0) * 32767
                audio_out = audio_out.astype(np.int16)
                audio_outs.append(audio_out)
                audio_outs.append(np.zeros(int(hp['audio_sample_rate'] * 0.3)).astype(np.int16))
                s = ""
        if not audio_outs:
            # Empty or punctuation-only input: np.concatenate([]) would raise.
            return hp['audio_sample_rate'], np.zeros(0, dtype=np.int16)
        audio_outs = np.concatenate(audio_outs)
        return hp['audio_sample_rate'], audio_outs

    def run(self):
        """Load hparams, build the inference instance and launch the interface."""
        set_hparams(exp_name=self.exp_name, config=self.config)
        infer_cls = self.inference_cls
        self.infer_ins: BaseTTSInfer = infer_cls(hp)
        example_inputs = self.example_inputs
        iface = gr.Interface(fn=self.greet,
                             inputs=Textbox(
                                 lines=10, placeholder=None, default=example_inputs[0], label="input text"),
                             outputs="audio",
                             allow_flagging="never",
                             title=self.title,
                             description=self.description,
                             article=self.article,
                             examples=example_inputs,
                             enable_queue=True)
        iface.launch(share=True,cache_examples=True)
64
+
65
+
66
if __name__ == '__main__':
    # Read the settings inside a context manager so the file handle is closed
    # before the (long-running) Gradio app starts.
    with open('inference/gradio/gradio_settings.yaml') as settings_file:
        gradio_config = yaml.safe_load(settings_file)
    g = GradioInfer(**gradio_config)
    g.run()
modules/FastDiff/config/FastDiff.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ base_config:
2
+ - ./base.yaml
3
+
4
+ audio_sample_rate: 22050
5
+ raw_data_dir: 'data/raw/LJSpeech-1.1'
6
+ processed_data_dir: 'data/processed/LJSpeech'
7
+ binary_data_dir: 'data/binary/LJSpeech'
modules/FastDiff/config/FastDiff_libritts.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ base_config:
2
+ - ./base.yaml
3
+
4
+ audio_sample_rate: 22050
5
+ raw_data_dir: 'data/raw/LibriTTS'
6
+ processed_data_dir: 'data/processed/LibriTTS'
7
+ binary_data_dir: 'data/binary/LibriTTS'
modules/FastDiff/config/FastDiff_sc09.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ base_config:
2
+ - egs/egs_bases/tts/vocoder/base.yaml
3
+ - egs/datasets/audio/lj/base_mel2wav.yaml
4
+ - ./base.yaml
5
+
6
+ #raw_data_dir: '/home1/huangrongjie/dataset/sc09/data/'
7
+ #processed_data_dir: 'data/processed/SC09'
8
+ #binary_data_dir: 'data/binary/SC09'
9
+
10
+ raw_data_dir: '/home1/huangrongjie/Project/AdaGrad/data/raw/SC09/'
11
+ processed_data_dir: 'data/processed/SC09_ten_processed'
12
+ binary_data_dir: 'data/binary/SC09_ten_processed'
13
+
14
+ pre_align_cls: egs.datasets.audio.sc09.pre_align.Sc09PreAlign
15
+ audio_sample_rate: 16000
16
+ max_samples: 12800
17
+
18
+ pre_align_args:
19
+ sox_resample: false
20
+ sox_to_wav: false
21
+ allow_no_txt: true
22
+ trim_sil: true
23
+ denoise: true
24
+
25
+ loud_norm: true
modules/FastDiff/config/FastDiff_tacotron.yaml ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ base_config:
2
+ - egs/egs_bases/tts/vocoder/pwg.yaml
3
+ - egs/egs_bases/tts/base_mel2wav.yaml
4
+ - egs/datasets/audio/lj/pwg.yaml
5
+
6
+ raw_data_dir: 'data/raw/LJSpeech-1.1'
7
+ processed_data_dir: 'data/processed/LJSpeech_FastDiff'
8
+ #binary_data_dir: 'data/binary/LJSpeech_Taco'
9
+ binary_data_dir: /apdcephfs/private_nlphuang/preprocess/AdaGrad/data/binary/LJSpeech_Taco
10
+
11
+ binarizer_cls: data_gen.tts.vocoder_binarizer.VocoderBinarizer
12
+ pre_align_cls: egs.datasets.audio.lj.pre_align.LJPreAlign
13
+ task_cls: modules.FastDiff.task.FastDiff.FastDiffTask
14
+ binarization_args:
15
+ with_wav: true
16
+ with_spk_embed: false
17
+ with_align: false
18
+ with_word: false
19
+ with_txt: false
20
+ with_f0: false
21
+
22
+ # data
23
+ num_spk: 400
24
+ max_samples: 25600
25
+ aux_context_window: 0
26
+ max_sentences: 20
27
+ test_input_dir: '' # 'wavs' # wav->wav inference
28
+ test_mel_dir: '' # 'mels' # mel->wav inference
29
+ use_wav: True # mel->wav inference
30
+
31
+ # training
32
+ num_sanity_val_steps: -1
33
+ max_updates: 1000000
34
+ lr: 2e-4
35
+ weight_decay: 0
36
+
37
+ # FastDiff
38
+ audio_channels: 1
39
+ inner_channels: 32
40
+ cond_channels: 80
41
+ upsample_ratios: [8, 8, 4]
42
+ lvc_layers_each_block: 4
43
+ lvc_kernel_size: 3
44
+ kpnet_hidden_channels: 64
45
+ kpnet_conv_size: 3
46
+ dropout: 0.0
47
+ diffusion_step_embed_dim_in: 128
48
+ diffusion_step_embed_dim_mid: 512
49
+ diffusion_step_embed_dim_out: 512
50
+ use_weight_norm: True
51
+
52
+ # Diffusion
53
+ T: 1000
54
+ beta_0: 0.000001
55
+ beta_T: 0.01
56
+ noise_schedule: ''
57
+ N: ''
58
+
modules/FastDiff/config/FastDiff_vctk.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ base_config:
2
+ - ./base.yaml
3
+
4
+ audio_sample_rate: 22050
5
+ raw_data_dir: 'data/raw/VCTK'
6
+ processed_data_dir: 'data/processed/VCTK'
7
+ binary_data_dir: 'data/binary/VCTK'
modules/FastDiff/config/base.yaml ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #############
2
+ # Custom dataset preprocess
3
+ #############
4
+ audio_num_mel_bins: 80
5
+ audio_sample_rate: 22050
6
+ hop_size: 256 # In samples. At 22050 Hz, 256 ~= 11.6 ms (a 12.5 ms hop, 0.0125 * sample_rate, would be ~275)
7
+ win_size: 1024 # In samples. At 22050 Hz, 1024 ~= 46.4 ms (a 50 ms window, 0.05 * sample_rate, would be ~1100). (If None, win_size: fft_size)
8
+ fmin: 80 # Set this to 55 if your speaker is male; for a female speaker, 95 should help remove noise. (Tune per dataset. Pitch info: male~[65, 260], female~[100, 525])
9
+ fmax: 7600 # To be increased/reduced depending on data.
10
+ fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter
11
+ min_level_db: -100
12
+ ref_level_db: 20
13
+ griffin_lim_iters: 60
14
+ num_spk: 1 # number of speakers
15
+ mel_vmin: -6
16
+ mel_vmax: 1.5
17
+
18
+ #############
19
+ # FastDiff Model
20
+ #############
21
+ audio_channels: 1
22
+ inner_channels: 32
23
+ cond_channels: 80
24
+ upsample_ratios: [8, 8, 4]
25
+ lvc_layers_each_block: 4
26
+ lvc_kernel_size: 3
27
+ kpnet_hidden_channels: 64
28
+ kpnet_conv_size: 3
29
+ dropout: 0.0
30
+ diffusion_step_embed_dim_in: 128
31
+ diffusion_step_embed_dim_mid: 512
32
+ diffusion_step_embed_dim_out: 512
33
+ use_weight_norm: True
34
+
35
+ ###########
36
+ # Diffusion
37
+ ###########
38
+ T: 1000
39
+ beta_0: 0.000001
40
+ beta_T: 0.01
41
+ noise_schedule: ''
42
+ N: ''
43
+
44
+
45
+ ###########
46
+ # train and eval
47
+ ###########
48
+ task_cls: modules.FastDiff.task.FastDiff.FastDiffTask
49
+ max_updates: 1000000 # max training steps
50
+ max_samples: 25600 # audio length in training
51
+ max_sentences: 20 # max batch size in training
52
+ num_sanity_val_steps: -1
53
+ max_valid_sentences: 1
54
+ valid_infer_interval: 10000
55
+ val_check_interval: 2000
56
+ num_test_samples: 0
57
+ num_valid_plots: 10
58
+
59
+
60
+ #############
61
+ # Stage 1 of data processing
62
+ #############
63
+ pre_align_cls: egs.datasets.audio.pre_align.PreAlign
64
+ pre_align_args:
65
+ nsample_per_mfa_group: 1000
66
+ txt_processor: en
67
+ use_tone: true # for ZH
68
+ sox_resample: false
69
+ sox_to_wav: false
70
+ allow_no_txt: true
71
+ trim_sil: false
72
+ denoise: false
73
+
74
+
75
+ #############
76
+ # Stage 2 of data processing
77
+ #############
78
+ binarizer_cls: data_gen.tts.vocoder_binarizer.VocoderBinarizer
79
+ binarization_args:
80
+ with_wav: true
81
+ with_spk_embed: false
82
+ with_align: false
83
+ with_word: false
84
+ with_txt: false
85
+ with_f0: false
86
+ shuffle: false
87
+ with_spk_id: true
88
+ with_f0cwt: false
89
+ with_linear: false
90
+ trim_eos_bos: false
91
+ reset_phone_dict: true
92
+ reset_word_dict: true
93
+
94
+
95
+ ###########
96
+ # optimization
97
+ ###########
98
+ lr: 2e-4 # learning rate
99
+ weight_decay: 0
100
+ scheduler: rsqrt # rsqrt|none
101
+ optimizer_adam_beta1: 0.9
102
+ optimizer_adam_beta2: 0.98
103
+ clip_grad_norm: 1
104
+ clip_grad_value: 0
105
+
106
+ #############
107
+ # Settings for this PyTorch framework
108
+ #############
109
+ max_input_tokens: 1550
110
+ frames_multiple: 1
111
+ use_word_input: false
112
+ vocoder: FastDiff
113
+ vocoder_ckpt: checkpoints/FastDiff
114
+ vocoder_denoise_c: 0.0
115
+ max_tokens: 30000
116
+ max_valid_tokens: 60000
117
+ test_ids: [ ]
118
+ profile_infer: false
119
+ out_wav_norm: false
120
+ save_gt: true
121
+ save_f0: false
122
+ aux_context_window: 0
123
+ test_input_dir: '' # 'wavs' # wav->wav inference
124
+ test_mel_dir: '' # 'mels' # mel->wav inference
125
+ use_wav: True # mel->wav inference
126
+ pitch_extractor: parselmouth
127
+ loud_norm: false
128
+ endless_ds: true
129
+ test_num: 100
130
+ min_frames: 0
131
+ max_frames: 1548
132
+ ds_workers: 1
133
+ gen_dir_name: ''
134
+ accumulate_grad_batches: 1
135
+ tb_log_interval: 100
136
+ print_nan_grads: false
137
+ work_dir: '' # experiment directory.
138
+ infer: false # inference
139
+ amp: false
140
+ debug: false
141
+ save_codes: []
142
+ save_best: true
143
+ num_ckpt_keep: 3
144
+ sort_by_len: true
145
+ load_ckpt: ''
146
+ check_val_every_n_epoch: 10
147
+ max_epochs: 1000
148
+ eval_max_batches: -1
149
+ resume_from_checkpoint: 0
150
+ rename_tmux: true
151
+ valid_monitor_key: 'val_loss'
152
+ valid_monitor_mode: 'min'
153
+ train_set_name: 'train'
154
+ train_sets: ''
155
+ valid_set_name: 'valid'
156
+ test_set_name: 'test'
157
+ seed: 1234