Spaces:
Sleeping
Sleeping
yuancwang
committed on
Commit
·
5548515
1
Parent(s):
4387736
init
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitignore +62 -0
- config/audioldm.json +92 -0
- config/autoencoderkl.json +69 -0
- config/base.json +220 -0
- config/comosvc.json +216 -0
- config/diffusion.json +227 -0
- config/fs2.json +118 -0
- config/ns2.json +88 -0
- config/transformer.json +180 -0
- config/tts.json +23 -0
- config/valle.json +53 -0
- config/vits.json +101 -0
- config/vitssvc.json +192 -0
- config/vocoder.json +84 -0
- egs/datasets/README.md +381 -0
- egs/metrics/README.md +94 -0
- egs/metrics/run.sh +42 -0
- egs/svc/DiffComoSVC/README.md +234 -0
- egs/svc/DiffComoSVC/exp_config.json +143 -0
- egs/svc/DiffComoSVC/run.sh +1 -0
- egs/svc/MultipleContentsSVC/README.md +153 -0
- egs/svc/MultipleContentsSVC/exp_config.json +126 -0
- egs/svc/MultipleContentsSVC/run.sh +1 -0
- egs/svc/README.md +34 -0
- egs/svc/TransformerSVC/README.md +164 -0
- egs/svc/TransformerSVC/exp_config.json +108 -0
- egs/svc/TransformerSVC/run.sh +1 -0
- egs/svc/VitsSVC/README.md +125 -0
- egs/svc/VitsSVC/exp_config.json +162 -0
- egs/svc/VitsSVC/run.sh +1 -0
- egs/svc/_template/run.sh +150 -0
- egs/tta/README.md +19 -0
- egs/tta/RECIPE.md +156 -0
- egs/tta/audioldm/exp_config.json +90 -0
- egs/tta/audioldm/exp_config_base.json +11 -0
- egs/tta/audioldm/exp_config_latent_4_10_78.json +88 -0
- egs/tta/audioldm/run_inference.sh +52 -0
- egs/tta/audioldm/run_inference_latent_4_10_78.sh +52 -0
- egs/tta/audioldm/run_train.sh +26 -0
- egs/tta/audioldm/run_train_latent_4_10_78.sh +26 -0
- egs/tta/autoencoderkl/exp_config.json +49 -0
- egs/tta/autoencoderkl/exp_config_base.json +11 -0
- egs/tta/autoencoderkl/exp_config_latent_4_10_78.json +59 -0
- egs/tta/autoencoderkl/run_train.sh +26 -0
- egs/tta/autoencoderkl/run_train_latent_4_10_78.sh +26 -0
- egs/tts/FastSpeech2/README.md +132 -0
- egs/tts/FastSpeech2/exp_config.json +21 -0
- egs/tts/FastSpeech2/prepare_mfa.sh +14 -0
- egs/tts/FastSpeech2/run.sh +150 -0
- egs/tts/NaturalSpeech2/exp_config.json +39 -0
.gitignore
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Mac OS files
|
2 |
+
.DS_Store
|
3 |
+
|
4 |
+
# IDEs
|
5 |
+
.idea
|
6 |
+
.vs
|
7 |
+
.vscode
|
8 |
+
.cache
|
9 |
+
|
10 |
+
# GitHub files
|
11 |
+
.github
|
12 |
+
|
13 |
+
# Byte-compiled / optimized / DLL / cached files
|
14 |
+
__pycache__/
|
15 |
+
*.py[cod]
|
16 |
+
*$py.class
|
17 |
+
*.pyc
|
18 |
+
.temp
|
19 |
+
*.c
|
20 |
+
*.so
|
21 |
+
*.o
|
22 |
+
|
23 |
+
# Developing mode
|
24 |
+
_*.sh
|
25 |
+
_*.json
|
26 |
+
*.lst
|
27 |
+
yard*
|
28 |
+
*.out
|
29 |
+
evaluation/evalset_selection
|
30 |
+
mfa
|
31 |
+
egs/svc/*wavmark
|
32 |
+
egs/svc/custom
|
33 |
+
egs/svc/*/dev*
|
34 |
+
egs/svc/dev_exp_config.json
|
35 |
+
bins/svc/demo*
|
36 |
+
bins/svc/preprocess_custom.py
|
37 |
+
data
|
38 |
+
ckpts
|
39 |
+
|
40 |
+
# Data and ckpt
|
41 |
+
*.pkl
|
42 |
+
*.pt
|
43 |
+
*.npy
|
44 |
+
*.npz
|
45 |
+
!modules/whisper_extractor/assets/mel_filters.npz
|
46 |
+
*.tar.gz
|
47 |
+
*.ckpt
|
48 |
+
*.wav
|
49 |
+
*.flac
|
50 |
+
pretrained/wenet/*conformer_exp
|
51 |
+
|
52 |
+
# Runtime data dirs
|
53 |
+
processed_data
|
54 |
+
data
|
55 |
+
model_ckpt
|
56 |
+
logs
|
57 |
+
*.ipynb
|
58 |
+
*.lst
|
59 |
+
source_audio
|
60 |
+
result
|
61 |
+
conversion_results
|
62 |
+
get_available_gpu.py
|
config/audioldm.json
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"model_type": "AudioLDM",
|
4 |
+
"task_type": "tta",
|
5 |
+
"dataset": [
|
6 |
+
"AudioCaps"
|
7 |
+
],
|
8 |
+
"preprocess": {
|
9 |
+
// feature used for model training
|
10 |
+
"use_spkid": false,
|
11 |
+
"use_uv": false,
|
12 |
+
"use_frame_pitch": false,
|
13 |
+
"use_phone_pitch": false,
|
14 |
+
"use_frame_energy": false,
|
15 |
+
"use_phone_energy": false,
|
16 |
+
"use_mel": false,
|
17 |
+
"use_audio": false,
|
18 |
+
"use_label": false,
|
19 |
+
"use_one_hot": false,
|
20 |
+
"cond_mask_prob": 0.1
|
21 |
+
},
|
22 |
+
// model
|
23 |
+
"model": {
|
24 |
+
"audioldm": {
|
25 |
+
"image_size": 32,
|
26 |
+
"in_channels": 4,
|
27 |
+
"out_channels": 4,
|
28 |
+
"model_channels": 256,
|
29 |
+
"attention_resolutions": [
|
30 |
+
4,
|
31 |
+
2,
|
32 |
+
1
|
33 |
+
],
|
34 |
+
"num_res_blocks": 2,
|
35 |
+
"channel_mult": [
|
36 |
+
1,
|
37 |
+
2,
|
38 |
+
4
|
39 |
+
],
|
40 |
+
"num_heads": 8,
|
41 |
+
"use_spatial_transformer": true,
|
42 |
+
"transformer_depth": 1,
|
43 |
+
"context_dim": 768,
|
44 |
+
"use_checkpoint": true,
|
45 |
+
"legacy": false
|
46 |
+
},
|
47 |
+
"autoencoderkl": {
|
48 |
+
"ch": 128,
|
49 |
+
"ch_mult": [
|
50 |
+
1,
|
51 |
+
1,
|
52 |
+
2,
|
53 |
+
2,
|
54 |
+
4
|
55 |
+
],
|
56 |
+
"num_res_blocks": 2,
|
57 |
+
"in_channels": 1,
|
58 |
+
"z_channels": 4,
|
59 |
+
"out_ch": 1,
|
60 |
+
"double_z": true
|
61 |
+
},
|
62 |
+
"noise_scheduler": {
|
63 |
+
"num_train_timesteps": 1000,
|
64 |
+
"beta_start": 0.00085,
|
65 |
+
"beta_end": 0.012,
|
66 |
+
"beta_schedule": "scaled_linear",
|
67 |
+
"clip_sample": false,
|
68 |
+
"steps_offset": 1,
|
69 |
+
"set_alpha_to_one": false,
|
70 |
+
"skip_prk_steps": true,
|
71 |
+
"prediction_type": "epsilon"
|
72 |
+
}
|
73 |
+
},
|
74 |
+
// train
|
75 |
+
"train": {
|
76 |
+
"lronPlateau": {
|
77 |
+
"factor": 0.9,
|
78 |
+
"patience": 100,
|
79 |
+
"min_lr": 4.0e-5,
|
80 |
+
"verbose": true
|
81 |
+
},
|
82 |
+
"adam": {
|
83 |
+
"lr": 5.0e-5,
|
84 |
+
"betas": [
|
85 |
+
0.9,
|
86 |
+
0.999
|
87 |
+
],
|
88 |
+
"weight_decay": 1.0e-2,
|
89 |
+
"eps": 1.0e-8
|
90 |
+
}
|
91 |
+
}
|
92 |
+
}
|
config/autoencoderkl.json
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"model_type": "AutoencoderKL",
|
4 |
+
"task_type": "tta",
|
5 |
+
"dataset": [
|
6 |
+
"AudioCaps"
|
7 |
+
],
|
8 |
+
"preprocess": {
|
9 |
+
// feature used for model training
|
10 |
+
"use_spkid": false,
|
11 |
+
"use_uv": false,
|
12 |
+
"use_frame_pitch": false,
|
13 |
+
"use_phone_pitch": false,
|
14 |
+
"use_frame_energy": false,
|
15 |
+
"use_phone_energy": false,
|
16 |
+
"use_mel": false,
|
17 |
+
"use_audio": false,
|
18 |
+
"use_label": false,
|
19 |
+
"use_one_hot": false
|
20 |
+
},
|
21 |
+
// model
|
22 |
+
"model": {
|
23 |
+
"autoencoderkl": {
|
24 |
+
"ch": 128,
|
25 |
+
"ch_mult": [
|
26 |
+
1,
|
27 |
+
1,
|
28 |
+
2,
|
29 |
+
2,
|
30 |
+
4
|
31 |
+
],
|
32 |
+
"num_res_blocks": 2,
|
33 |
+
"in_channels": 1,
|
34 |
+
"z_channels": 4,
|
35 |
+
"out_ch": 1,
|
36 |
+
"double_z": true
|
37 |
+
},
|
38 |
+
"loss": {
|
39 |
+
"kl_weight": 1e-8,
|
40 |
+
"disc_weight": 0.5,
|
41 |
+
"disc_factor": 1.0,
|
42 |
+
"logvar_init": 0.0,
|
43 |
+
"min_adapt_d_weight": 0.0,
|
44 |
+
"max_adapt_d_weight": 10.0,
|
45 |
+
"disc_start": 50001,
|
46 |
+
"disc_in_channels": 1,
|
47 |
+
"disc_num_layers": 3,
|
48 |
+
"use_actnorm": false
|
49 |
+
}
|
50 |
+
},
|
51 |
+
// train
|
52 |
+
"train": {
|
53 |
+
"lronPlateau": {
|
54 |
+
"factor": 0.9,
|
55 |
+
"patience": 100,
|
56 |
+
"min_lr": 4.0e-5,
|
57 |
+
"verbose": true
|
58 |
+
},
|
59 |
+
"adam": {
|
60 |
+
"lr": 4.0e-4,
|
61 |
+
"betas": [
|
62 |
+
0.9,
|
63 |
+
0.999
|
64 |
+
],
|
65 |
+
"weight_decay": 1.0e-2,
|
66 |
+
"eps": 1.0e-8
|
67 |
+
}
|
68 |
+
}
|
69 |
+
}
|
config/base.json
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"supported_model_type": [
|
3 |
+
"GANVocoder",
|
4 |
+
"Fastspeech2",
|
5 |
+
"DiffSVC",
|
6 |
+
"Transformer",
|
7 |
+
"EDM",
|
8 |
+
"CD"
|
9 |
+
],
|
10 |
+
"task_type": "",
|
11 |
+
"dataset": [],
|
12 |
+
"use_custom_dataset": false,
|
13 |
+
"preprocess": {
|
14 |
+
"phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon"
|
15 |
+
// trim audio silence
|
16 |
+
"data_augment": false,
|
17 |
+
"trim_silence": false,
|
18 |
+
"num_silent_frames": 8,
|
19 |
+
"trim_fft_size": 512, // fft size used in trimming
|
20 |
+
"trim_hop_size": 128, // hop size used in trimming
|
21 |
+
"trim_top_db": 30, // top db used in trimming sensitive to each dataset
|
22 |
+
// acoustic features
|
23 |
+
"extract_mel": false,
|
24 |
+
"mel_extract_mode": "",
|
25 |
+
"extract_linear_spec": false,
|
26 |
+
"extract_mcep": false,
|
27 |
+
"extract_pitch": false,
|
28 |
+
"extract_acoustic_token": false,
|
29 |
+
"pitch_remove_outlier": false,
|
30 |
+
"extract_uv": false,
|
31 |
+
"pitch_norm": false,
|
32 |
+
"extract_audio": false,
|
33 |
+
"extract_label": false,
|
34 |
+
"pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform)
|
35 |
+
"extract_energy": false,
|
36 |
+
"energy_remove_outlier": false,
|
37 |
+
"energy_norm": false,
|
38 |
+
"energy_extract_mode": "from_mel",
|
39 |
+
"extract_duration": false,
|
40 |
+
"extract_amplitude_phase": false,
|
41 |
+
"mel_min_max_norm": false,
|
42 |
+
// linguistic features
|
43 |
+
"extract_phone": false,
|
44 |
+
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
|
45 |
+
// content features
|
46 |
+
"extract_whisper_feature": false,
|
47 |
+
"extract_contentvec_feature": false,
|
48 |
+
"extract_mert_feature": false,
|
49 |
+
"extract_wenet_feature": false,
|
50 |
+
// Settings for data preprocessing
|
51 |
+
"n_mel": 80,
|
52 |
+
"win_size": 480,
|
53 |
+
"hop_size": 120,
|
54 |
+
"sample_rate": 24000,
|
55 |
+
"n_fft": 1024,
|
56 |
+
"fmin": 0,
|
57 |
+
"fmax": 12000,
|
58 |
+
"min_level_db": -115,
|
59 |
+
"ref_level_db": 20,
|
60 |
+
"bits": 8,
|
61 |
+
// Directory names of processed data or extracted features
|
62 |
+
"processed_dir": "processed_data",
|
63 |
+
"trimmed_wav_dir": "trimmed_wavs", // directory name of silence trimmed wav
|
64 |
+
"raw_data": "raw_data",
|
65 |
+
"phone_dir": "phones",
|
66 |
+
"wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform)
|
67 |
+
"audio_dir": "audios",
|
68 |
+
"log_amplitude_dir": "log_amplitudes",
|
69 |
+
"phase_dir": "phases",
|
70 |
+
"real_dir": "reals",
|
71 |
+
"imaginary_dir": "imaginarys",
|
72 |
+
"label_dir": "labels",
|
73 |
+
"linear_dir": "linears",
|
74 |
+
"mel_dir": "mels", // directory name of extracted mel features
|
75 |
+
"mcep_dir": "mcep", // directory name of extracted mcep features
|
76 |
+
"dur_dir": "durs",
|
77 |
+
"symbols_dict": "symbols.dict",
|
78 |
+
"lab_dir": "labs", // directory name of extracted label features
|
79 |
+
"wenet_dir": "wenet", // directory name of extracted wenet features
|
80 |
+
"contentvec_dir": "contentvec", // directory name of extracted contentvec features
|
81 |
+
"pitch_dir": "pitches", // directory name of extracted pitch features
|
82 |
+
"energy_dir": "energys", // directory name of extracted energy features
|
83 |
+
"phone_pitch_dir": "phone_pitches", // directory name of extracted pitch features
|
84 |
+
"phone_energy_dir": "phone_energys", // directory name of extracted energy features
|
85 |
+
"uv_dir": "uvs", // directory name of extracted unvoiced features
|
86 |
+
"duration_dir": "duration", // ground-truth duration file
|
87 |
+
"phone_seq_file": "phone_seq_file", // phoneme sequence file
|
88 |
+
"file_lst": "file.lst",
|
89 |
+
"train_file": "train.json", // training set, the json file contains detailed information about the dataset, including dataset name, utterance id, duration of the utterance
|
90 |
+
"valid_file": "valid.json", // validation set
|
91 |
+
"spk2id": "spk2id.json", // used for multi-speaker dataset
|
92 |
+
"utt2spk": "utt2spk", // used for multi-speaker dataset
|
93 |
+
"emo2id": "emo2id.json", // used for multi-emotion dataset
|
94 |
+
"utt2emo": "utt2emo", // used for multi-emotion dataset
|
95 |
+
// Features used for model training
|
96 |
+
"use_text": false,
|
97 |
+
"use_phone": false,
|
98 |
+
"use_phn_seq": false,
|
99 |
+
"use_lab": false,
|
100 |
+
"use_linear": false,
|
101 |
+
"use_mel": false,
|
102 |
+
"use_min_max_norm_mel": false,
|
103 |
+
"use_wav": false,
|
104 |
+
"use_phone_pitch": false,
|
105 |
+
"use_log_scale_pitch": false,
|
106 |
+
"use_phone_energy": false,
|
107 |
+
"use_phone_duration": false,
|
108 |
+
"use_log_scale_energy": false,
|
109 |
+
"use_wenet": false,
|
110 |
+
"use_dur": false,
|
111 |
+
"use_spkid": false, // True: use speaker id for multi-speaker dataset
|
112 |
+
"use_emoid": false, // True: use emotion id for multi-emotion dataset
|
113 |
+
"use_frame_pitch": false,
|
114 |
+
"use_uv": false,
|
115 |
+
"use_frame_energy": false,
|
116 |
+
"use_frame_duration": false,
|
117 |
+
"use_audio": false,
|
118 |
+
"use_label": false,
|
119 |
+
"use_one_hot": false,
|
120 |
+
"use_amplitude_phase": false,
|
121 |
+
"data_augment": false,
|
122 |
+
"align_mel_duration": false
|
123 |
+
},
|
124 |
+
"train": {
|
125 |
+
"ddp": true,
|
126 |
+
"random_seed": 970227,
|
127 |
+
"batch_size": 16,
|
128 |
+
"max_steps": 1000000,
|
129 |
+
// Trackers
|
130 |
+
"tracker": [
|
131 |
+
"tensorboard"
|
132 |
+
// "wandb",
|
133 |
+
// "cometml",
|
134 |
+
// "mlflow",
|
135 |
+
],
|
136 |
+
"max_epoch": -1,
|
137 |
+
// -1 means no limit
|
138 |
+
"save_checkpoint_stride": [
|
139 |
+
5,
|
140 |
+
20
|
141 |
+
],
|
142 |
+
// unit is epoch
|
143 |
+
"keep_last": [
|
144 |
+
3,
|
145 |
+
-1
|
146 |
+
],
|
147 |
+
// -1 means infinite, if one number will broadcast
|
148 |
+
"run_eval": [
|
149 |
+
false,
|
150 |
+
true
|
151 |
+
],
|
152 |
+
// if one number will broadcast
|
153 |
+
// Fix the random seed
|
154 |
+
"random_seed": 10086,
|
155 |
+
// Optimizer
|
156 |
+
"optimizer": "AdamW",
|
157 |
+
"adamw": {
|
158 |
+
"lr": 4.0e-4
|
159 |
+
// nn model lr
|
160 |
+
},
|
161 |
+
// LR Scheduler
|
162 |
+
"scheduler": "ReduceLROnPlateau",
|
163 |
+
"reducelronplateau": {
|
164 |
+
"factor": 0.8,
|
165 |
+
"patience": 10,
|
166 |
+
// unit is epoch
|
167 |
+
"min_lr": 1.0e-4
|
168 |
+
},
|
169 |
+
// Batchsampler
|
170 |
+
"sampler": {
|
171 |
+
"holistic_shuffle": true,
|
172 |
+
"drop_last": true
|
173 |
+
},
|
174 |
+
// Dataloader
|
175 |
+
"dataloader": {
|
176 |
+
"num_worker": 32,
|
177 |
+
"pin_memory": true
|
178 |
+
},
|
179 |
+
"gradient_accumulation_step": 1,
|
180 |
+
"total_training_steps": 50000,
|
181 |
+
"save_summary_steps": 500,
|
182 |
+
"save_checkpoints_steps": 10000,
|
183 |
+
"valid_interval": 10000,
|
184 |
+
"keep_checkpoint_max": 5,
|
185 |
+
"multi_speaker_training": false, // True: train multi-speaker model; False: train single-speaker model;
|
186 |
+
"max_epoch": -1,
|
187 |
+
// -1 means no limit
|
188 |
+
"save_checkpoint_stride": [
|
189 |
+
5,
|
190 |
+
20
|
191 |
+
],
|
192 |
+
// unit is epoch
|
193 |
+
"keep_last": [
|
194 |
+
3,
|
195 |
+
-1
|
196 |
+
],
|
197 |
+
// -1 means infinite, if one number will broadcast
|
198 |
+
"run_eval": [
|
199 |
+
false,
|
200 |
+
true
|
201 |
+
],
|
202 |
+
// Batchsampler
|
203 |
+
"sampler": {
|
204 |
+
"holistic_shuffle": true,
|
205 |
+
"drop_last": true
|
206 |
+
},
|
207 |
+
// Dataloader
|
208 |
+
"dataloader": {
|
209 |
+
"num_worker": 32,
|
210 |
+
"pin_memory": true
|
211 |
+
},
|
212 |
+
// Trackers
|
213 |
+
"tracker": [
|
214 |
+
"tensorboard"
|
215 |
+
// "wandb",
|
216 |
+
// "cometml",
|
217 |
+
// "mlflow",
|
218 |
+
],
|
219 |
+
},
|
220 |
+
}
|
config/comosvc.json
ADDED
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"model_type": "DiffComoSVC",
|
4 |
+
"task_type": "svc",
|
5 |
+
"use_custom_dataset": false,
|
6 |
+
"preprocess": {
|
7 |
+
// data augmentations
|
8 |
+
"use_pitch_shift": false,
|
9 |
+
"use_formant_shift": false,
|
10 |
+
"use_time_stretch": false,
|
11 |
+
"use_equalizer": false,
|
12 |
+
// acoustic features
|
13 |
+
"extract_mel": true,
|
14 |
+
"mel_min_max_norm": true,
|
15 |
+
"extract_pitch": true,
|
16 |
+
"pitch_extractor": "parselmouth",
|
17 |
+
"extract_uv": true,
|
18 |
+
"extract_energy": true,
|
19 |
+
// content features
|
20 |
+
"extract_whisper_feature": false,
|
21 |
+
"whisper_sample_rate": 16000,
|
22 |
+
"extract_contentvec_feature": false,
|
23 |
+
"contentvec_sample_rate": 16000,
|
24 |
+
"extract_wenet_feature": false,
|
25 |
+
"wenet_sample_rate": 16000,
|
26 |
+
"extract_mert_feature": false,
|
27 |
+
"mert_sample_rate": 16000,
|
28 |
+
// Default config for whisper
|
29 |
+
"whisper_frameshift": 0.01,
|
30 |
+
"whisper_downsample_rate": 2,
|
31 |
+
// Default config for content vector
|
32 |
+
"contentvec_frameshift": 0.02,
|
33 |
+
// Default config for mert
|
34 |
+
"mert_model": "m-a-p/MERT-v1-330M",
|
35 |
+
"mert_feature_layer": -1,
|
36 |
+
"mert_hop_size": 320,
|
37 |
+
// 24k
|
38 |
+
"mert_frameshit": 0.01333,
|
39 |
+
// 10ms
|
40 |
+
"wenet_frameshift": 0.01,
|
41 |
+
// wenetspeech is 4, gigaspeech is 6
|
42 |
+
"wenet_downsample_rate": 4,
|
43 |
+
// Default config
|
44 |
+
"n_mel": 100,
|
45 |
+
"win_size": 1024,
|
46 |
+
// todo
|
47 |
+
"hop_size": 256,
|
48 |
+
"sample_rate": 24000,
|
49 |
+
"n_fft": 1024,
|
50 |
+
// todo
|
51 |
+
"fmin": 0,
|
52 |
+
"fmax": 12000,
|
53 |
+
// todo
|
54 |
+
"f0_min": 50,
|
55 |
+
// ~C2
|
56 |
+
"f0_max": 1100,
|
57 |
+
//1100, // ~C6(1100), ~G5(800)
|
58 |
+
"pitch_bin": 256,
|
59 |
+
"pitch_max": 1100.0,
|
60 |
+
"pitch_min": 50.0,
|
61 |
+
"is_label": true,
|
62 |
+
"is_mu_law": true,
|
63 |
+
"bits": 8,
|
64 |
+
"mel_min_max_stats_dir": "mel_min_max_stats",
|
65 |
+
"whisper_dir": "whisper",
|
66 |
+
"contentvec_dir": "contentvec",
|
67 |
+
"wenet_dir": "wenet",
|
68 |
+
"mert_dir": "mert",
|
69 |
+
// Extract content features using dataloader
|
70 |
+
"pin_memory": true,
|
71 |
+
"num_workers": 8,
|
72 |
+
"content_feature_batch_size": 16,
|
73 |
+
// Features used for model training
|
74 |
+
"use_mel": true,
|
75 |
+
"use_min_max_norm_mel": true,
|
76 |
+
"use_frame_pitch": true,
|
77 |
+
"use_uv": true,
|
78 |
+
"use_frame_energy": true,
|
79 |
+
"use_log_scale_pitch": false,
|
80 |
+
"use_log_scale_energy": false,
|
81 |
+
"use_spkid": true,
|
82 |
+
// Meta file
|
83 |
+
"train_file": "train.json",
|
84 |
+
"valid_file": "test.json",
|
85 |
+
"spk2id": "singers.json",
|
86 |
+
"utt2spk": "utt2singer"
|
87 |
+
},
|
88 |
+
"model": {
|
89 |
+
"teacher_model_path": "[Your Teacher Model Path].bin",
|
90 |
+
"condition_encoder": {
|
91 |
+
"merge_mode": "add",
|
92 |
+
"input_melody_dim": 1,
|
93 |
+
"use_log_f0": true,
|
94 |
+
"n_bins_melody": 256,
|
95 |
+
//# Quantization (0 for not quantization)
|
96 |
+
"output_melody_dim": 384,
|
97 |
+
"input_loudness_dim": 1,
|
98 |
+
"use_log_loudness": true,
|
99 |
+
"n_bins_loudness": 256,
|
100 |
+
"output_loudness_dim": 384,
|
101 |
+
"use_whisper": false,
|
102 |
+
"use_contentvec": false,
|
103 |
+
"use_wenet": false,
|
104 |
+
"use_mert": false,
|
105 |
+
"whisper_dim": 1024,
|
106 |
+
"contentvec_dim": 256,
|
107 |
+
"mert_dim": 256,
|
108 |
+
"wenet_dim": 512,
|
109 |
+
"content_encoder_dim": 384,
|
110 |
+
"output_singer_dim": 384,
|
111 |
+
"singer_table_size": 512,
|
112 |
+
"output_content_dim": 384,
|
113 |
+
"use_spkid": true
|
114 |
+
},
|
115 |
+
"comosvc": {
|
116 |
+
"distill": false,
|
117 |
+
// conformer encoder
|
118 |
+
"input_dim": 384,
|
119 |
+
"output_dim": 100,
|
120 |
+
"n_heads": 2,
|
121 |
+
"n_layers": 6,
|
122 |
+
"filter_channels": 512,
|
123 |
+
"dropout": 0.1,
|
124 |
+
// karras diffusion
|
125 |
+
"P_mean": -1.2,
|
126 |
+
"P_std": 1.2,
|
127 |
+
"sigma_data": 0.5,
|
128 |
+
"sigma_min": 0.002,
|
129 |
+
"sigma_max": 80,
|
130 |
+
"rho": 7,
|
131 |
+
"n_timesteps": 40,
|
132 |
+
},
|
133 |
+
"diffusion": {
|
134 |
+
// Diffusion steps encoder
|
135 |
+
"step_encoder": {
|
136 |
+
"dim_raw_embedding": 128,
|
137 |
+
"dim_hidden_layer": 512,
|
138 |
+
"activation": "SiLU",
|
139 |
+
"num_layer": 2,
|
140 |
+
"max_period": 10000
|
141 |
+
},
|
142 |
+
// Diffusion decoder
|
143 |
+
"model_type": "bidilconv",
|
144 |
+
// bidilconv, unet2d, TODO: unet1d
|
145 |
+
"bidilconv": {
|
146 |
+
"base_channel": 384,
|
147 |
+
"n_res_block": 20,
|
148 |
+
"conv_kernel_size": 3,
|
149 |
+
"dilation_cycle_length": 4,
|
150 |
+
// specially, 1 means no dilation
|
151 |
+
"conditioner_size": 100
|
152 |
+
}
|
153 |
+
},
|
154 |
+
},
|
155 |
+
"train": {
|
156 |
+
// Basic settings
|
157 |
+
"fast_steps": 0,
|
158 |
+
"batch_size": 32,
|
159 |
+
"gradient_accumulation_step": 1,
|
160 |
+
"max_epoch": -1,
|
161 |
+
// -1 means no limit
|
162 |
+
"save_checkpoint_stride": [
|
163 |
+
10,
|
164 |
+
100
|
165 |
+
],
|
166 |
+
// unit is epoch
|
167 |
+
"keep_last": [
|
168 |
+
3,
|
169 |
+
-1
|
170 |
+
],
|
171 |
+
// -1 means infinite, if one number will broadcast
|
172 |
+
"run_eval": [
|
173 |
+
false,
|
174 |
+
true
|
175 |
+
],
|
176 |
+
// if one number will broadcast
|
177 |
+
// Fix the random seed
|
178 |
+
"random_seed": 10086,
|
179 |
+
// Batchsampler
|
180 |
+
"sampler": {
|
181 |
+
"holistic_shuffle": true,
|
182 |
+
"drop_last": true
|
183 |
+
},
|
184 |
+
// Dataloader
|
185 |
+
"dataloader": {
|
186 |
+
"num_worker": 32,
|
187 |
+
"pin_memory": true
|
188 |
+
},
|
189 |
+
// Trackers
|
190 |
+
"tracker": [
|
191 |
+
"tensorboard"
|
192 |
+
// "wandb",
|
193 |
+
// "cometml",
|
194 |
+
// "mlflow",
|
195 |
+
],
|
196 |
+
// Optimizer
|
197 |
+
"optimizer": "AdamW",
|
198 |
+
"adamw": {
|
199 |
+
"lr": 4.0e-4
|
200 |
+
// nn model lr
|
201 |
+
},
|
202 |
+
// LR Scheduler
|
203 |
+
"scheduler": "ReduceLROnPlateau",
|
204 |
+
"reducelronplateau": {
|
205 |
+
"factor": 0.8,
|
206 |
+
"patience": 10,
|
207 |
+
// unit is epoch
|
208 |
+
"min_lr": 1.0e-4
|
209 |
+
}
|
210 |
+
},
|
211 |
+
"inference": {
|
212 |
+
"comosvc": {
|
213 |
+
"inference_steps": 40
|
214 |
+
}
|
215 |
+
}
|
216 |
+
}
|
config/diffusion.json
ADDED
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
// FIXME: THESE ARE LEGACY
|
3 |
+
"base_config": "config/base.json",
|
4 |
+
"model_type": "diffusion",
|
5 |
+
"task_type": "svc",
|
6 |
+
"use_custom_dataset": false,
|
7 |
+
"preprocess": {
|
8 |
+
// data augmentations
|
9 |
+
"use_pitch_shift": false,
|
10 |
+
"use_formant_shift": false,
|
11 |
+
"use_time_stretch": false,
|
12 |
+
"use_equalizer": false,
|
13 |
+
// acoustic features
|
14 |
+
"extract_mel": true,
|
15 |
+
"mel_min_max_norm": true,
|
16 |
+
"extract_pitch": true,
|
17 |
+
"pitch_extractor": "parselmouth",
|
18 |
+
"extract_uv": true,
|
19 |
+
"extract_energy": true,
|
20 |
+
// content features
|
21 |
+
"extract_whisper_feature": false,
|
22 |
+
"whisper_sample_rate": 16000,
|
23 |
+
"extract_contentvec_feature": false,
|
24 |
+
"contentvec_sample_rate": 16000,
|
25 |
+
"extract_wenet_feature": false,
|
26 |
+
"wenet_sample_rate": 16000,
|
27 |
+
"extract_mert_feature": false,
|
28 |
+
"mert_sample_rate": 16000,
|
29 |
+
// Default config for whisper
|
30 |
+
"whisper_frameshift": 0.01,
|
31 |
+
"whisper_downsample_rate": 2,
|
32 |
+
// Default config for content vector
|
33 |
+
"contentvec_frameshift": 0.02,
|
34 |
+
// Default config for mert
|
35 |
+
"mert_model": "m-a-p/MERT-v1-330M",
|
36 |
+
"mert_feature_layer": -1,
|
37 |
+
"mert_hop_size": 320,
|
38 |
+
// 24k
|
39 |
+
"mert_frameshit": 0.01333,
|
40 |
+
// 10ms
|
41 |
+
"wenet_frameshift": 0.01,
|
42 |
+
// wenetspeech is 4, gigaspeech is 6
|
43 |
+
"wenet_downsample_rate": 4,
|
44 |
+
// Default config
|
45 |
+
"n_mel": 100,
|
46 |
+
"win_size": 1024,
|
47 |
+
// todo
|
48 |
+
"hop_size": 256,
|
49 |
+
"sample_rate": 24000,
|
50 |
+
"n_fft": 1024,
|
51 |
+
// todo
|
52 |
+
"fmin": 0,
|
53 |
+
"fmax": 12000,
|
54 |
+
// todo
|
55 |
+
"f0_min": 50,
|
56 |
+
// ~C2
|
57 |
+
"f0_max": 1100,
|
58 |
+
//1100, // ~C6(1100), ~G5(800)
|
59 |
+
"pitch_bin": 256,
|
60 |
+
"pitch_max": 1100.0,
|
61 |
+
"pitch_min": 50.0,
|
62 |
+
"is_label": true,
|
63 |
+
"is_mu_law": true,
|
64 |
+
"bits": 8,
|
65 |
+
"mel_min_max_stats_dir": "mel_min_max_stats",
|
66 |
+
"whisper_dir": "whisper",
|
67 |
+
"contentvec_dir": "contentvec",
|
68 |
+
"wenet_dir": "wenet",
|
69 |
+
"mert_dir": "mert",
|
70 |
+
// Extract content features using dataloader
|
71 |
+
"pin_memory": true,
|
72 |
+
"num_workers": 8,
|
73 |
+
"content_feature_batch_size": 16,
|
74 |
+
// Features used for model training
|
75 |
+
"use_mel": true,
|
76 |
+
"use_min_max_norm_mel": true,
|
77 |
+
"use_frame_pitch": true,
|
78 |
+
"use_uv": true,
|
79 |
+
"use_frame_energy": true,
|
80 |
+
"use_log_scale_pitch": false,
|
81 |
+
"use_log_scale_energy": false,
|
82 |
+
"use_spkid": true,
|
83 |
+
// Meta file
|
84 |
+
"train_file": "train.json",
|
85 |
+
"valid_file": "test.json",
|
86 |
+
"spk2id": "singers.json",
|
87 |
+
"utt2spk": "utt2singer"
|
88 |
+
},
|
89 |
+
"model": {
|
90 |
+
"condition_encoder": {
|
91 |
+
"merge_mode": "add",
|
92 |
+
"input_melody_dim": 1,
|
93 |
+
"use_log_f0": true,
|
94 |
+
"n_bins_melody": 256,
|
95 |
+
//# Quantization (0 for not quantization)
|
96 |
+
"output_melody_dim": 384,
|
97 |
+
"input_loudness_dim": 1,
|
98 |
+
"use_log_loudness": true,
|
99 |
+
"n_bins_loudness": 256,
|
100 |
+
"output_loudness_dim": 384,
|
101 |
+
"use_whisper": false,
|
102 |
+
"use_contentvec": false,
|
103 |
+
"use_wenet": false,
|
104 |
+
"use_mert": false,
|
105 |
+
"whisper_dim": 1024,
|
106 |
+
"contentvec_dim": 256,
|
107 |
+
"mert_dim": 256,
|
108 |
+
"wenet_dim": 512,
|
109 |
+
"content_encoder_dim": 384,
|
110 |
+
"output_singer_dim": 384,
|
111 |
+
"singer_table_size": 512,
|
112 |
+
"output_content_dim": 384,
|
113 |
+
"use_spkid": true
|
114 |
+
},
|
115 |
+
// FIXME: FOLLOWING ARE NEW!!
|
116 |
+
"diffusion": {
|
117 |
+
"scheduler": "ddpm",
|
118 |
+
"scheduler_settings": {
|
119 |
+
"num_train_timesteps": 1000,
|
120 |
+
"beta_start": 1.0e-4,
|
121 |
+
"beta_end": 0.02,
|
122 |
+
"beta_schedule": "linear"
|
123 |
+
},
|
124 |
+
// Diffusion steps encoder
|
125 |
+
"step_encoder": {
|
126 |
+
"dim_raw_embedding": 128,
|
127 |
+
"dim_hidden_layer": 512,
|
128 |
+
"activation": "SiLU",
|
129 |
+
"num_layer": 2,
|
130 |
+
"max_period": 10000
|
131 |
+
},
|
132 |
+
// Diffusion decoder
|
133 |
+
"model_type": "bidilconv",
|
134 |
+
// bidilconv, unet2d, TODO: unet1d
|
135 |
+
"bidilconv": {
|
136 |
+
"base_channel": 384,
|
137 |
+
"n_res_block": 20,
|
138 |
+
"conv_kernel_size": 3,
|
139 |
+
"dilation_cycle_length": 4,
|
140 |
+
// specially, 1 means no dilation
|
141 |
+
"conditioner_size": 384
|
142 |
+
},
|
143 |
+
"unet2d": {
|
144 |
+
"in_channels": 1,
|
145 |
+
"out_channels": 1,
|
146 |
+
"down_block_types": [
|
147 |
+
"CrossAttnDownBlock2D",
|
148 |
+
"CrossAttnDownBlock2D",
|
149 |
+
"CrossAttnDownBlock2D",
|
150 |
+
"DownBlock2D"
|
151 |
+
],
|
152 |
+
"mid_block_type": "UNetMidBlock2DCrossAttn",
|
153 |
+
"up_block_types": [
|
154 |
+
"UpBlock2D",
|
155 |
+
"CrossAttnUpBlock2D",
|
156 |
+
"CrossAttnUpBlock2D",
|
157 |
+
"CrossAttnUpBlock2D"
|
158 |
+
],
|
159 |
+
"only_cross_attention": false
|
160 |
+
}
|
161 |
+
}
|
162 |
+
},
|
163 |
+
// FIXME: FOLLOWING ARE NEW!!
|
164 |
+
"train": {
|
165 |
+
// Basic settings
|
166 |
+
"batch_size": 64,
|
167 |
+
"gradient_accumulation_step": 1,
|
168 |
+
"max_epoch": -1,
|
169 |
+
// -1 means no limit
|
170 |
+
"save_checkpoint_stride": [
|
171 |
+
5,
|
172 |
+
20
|
173 |
+
],
|
174 |
+
// unit is epoch
|
175 |
+
"keep_last": [
|
176 |
+
3,
|
177 |
+
-1
|
178 |
+
],
|
179 |
+
// -1 means infinite, if one number will broadcast
|
180 |
+
"run_eval": [
|
181 |
+
false,
|
182 |
+
true
|
183 |
+
],
|
184 |
+
// if one number will broadcast
|
185 |
+
// Fix the random seed
|
186 |
+
"random_seed": 10086,
|
187 |
+
// Batchsampler
|
188 |
+
"sampler": {
|
189 |
+
"holistic_shuffle": true,
|
190 |
+
"drop_last": true
|
191 |
+
},
|
192 |
+
// Dataloader
|
193 |
+
"dataloader": {
|
194 |
+
"num_worker": 32,
|
195 |
+
"pin_memory": true
|
196 |
+
},
|
197 |
+
// Trackers
|
198 |
+
"tracker": [
|
199 |
+
"tensorboard"
|
200 |
+
// "wandb",
|
201 |
+
// "cometml",
|
202 |
+
// "mlflow",
|
203 |
+
],
|
204 |
+
// Optimizer
|
205 |
+
"optimizer": "AdamW",
|
206 |
+
"adamw": {
|
207 |
+
"lr": 4.0e-4
|
208 |
+
// nn model lr
|
209 |
+
},
|
210 |
+
// LR Scheduler
|
211 |
+
"scheduler": "ReduceLROnPlateau",
|
212 |
+
"reducelronplateau": {
|
213 |
+
"factor": 0.8,
|
214 |
+
"patience": 10,
|
215 |
+
// unit is epoch
|
216 |
+
"min_lr": 1.0e-4
|
217 |
+
}
|
218 |
+
},
|
219 |
+
"inference": {
|
220 |
+
"diffusion": {
|
221 |
+
"scheduler": "pndm",
|
222 |
+
"scheduler_settings": {
|
223 |
+
"num_inference_timesteps": 1000
|
224 |
+
}
|
225 |
+
}
|
226 |
+
}
|
227 |
+
}
|
config/fs2.json
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/tts.json",
|
3 |
+
"model_type": "FastSpeech2",
|
4 |
+
"task_type": "tts",
|
5 |
+
"dataset": ["LJSpeech"],
|
6 |
+
"preprocess": {
|
7 |
+
// acoustic features
|
8 |
+
"extract_audio": true,
|
9 |
+
"extract_mel": true,
|
10 |
+
"mel_extract_mode": "taco",
|
11 |
+
"mel_min_max_norm": false,
|
12 |
+
"extract_pitch": true,
|
13 |
+
"extract_uv": false,
|
14 |
+
"pitch_extractor": "dio",
|
15 |
+
"extract_energy": true,
|
16 |
+
"energy_extract_mode": "from_tacotron_stft",
|
17 |
+
"extract_duration": true,
|
18 |
+
"use_phone": true,
|
19 |
+
"pitch_norm": true,
|
20 |
+
"energy_norm": true,
|
21 |
+
"pitch_remove_outlier": true,
|
22 |
+
"energy_remove_outlier": true,
|
23 |
+
|
24 |
+
// Default config
|
25 |
+
"n_mel": 80,
|
26 |
+
"win_size": 1024, // todo
|
27 |
+
"hop_size": 256,
|
28 |
+
"sample_rate": 22050,
|
29 |
+
"n_fft": 1024, // todo
|
30 |
+
"fmin": 0,
|
31 |
+
"fmax": 8000, // todo
|
32 |
+
"raw_data": "raw_data",
|
33 |
+
"text_cleaners": ["english_cleaners"],
|
34 |
+
"f0_min": 71, // ~C2
|
35 |
+
"f0_max": 800, //1100, // ~C6(1100), ~G5(800)
|
36 |
+
"pitch_bin": 256,
|
37 |
+
"pitch_max": 1100.0,
|
38 |
+
"pitch_min": 50.0,
|
39 |
+
"is_label": true,
|
40 |
+
"is_mu_law": true,
|
41 |
+
"bits": 8,
|
42 |
+
|
43 |
+
"mel_min_max_stats_dir": "mel_min_max_stats",
|
44 |
+
"whisper_dir": "whisper",
|
45 |
+
"content_vector_dir": "content_vector",
|
46 |
+
"wenet_dir": "wenet",
|
47 |
+
"mert_dir": "mert",
|
48 |
+
"spk2id":"spk2id.json",
|
49 |
+
"utt2spk":"utt2spk",
|
50 |
+
|
51 |
+
// Features used for model training
|
52 |
+
"use_mel": true,
|
53 |
+
"use_min_max_norm_mel": false,
|
54 |
+
"use_frame_pitch": false,
|
55 |
+
"use_frame_energy": false,
|
56 |
+
"use_phone_pitch": true,
|
57 |
+
"use_phone_energy": true,
|
58 |
+
"use_log_scale_pitch": false,
|
59 |
+
"use_log_scale_energy": false,
|
60 |
+
"use_spkid": false,
|
61 |
+
"align_mel_duration": true,
|
62 |
+
"text_cleaners": ["english_cleaners"],
|
63 |
+
"phone_extractor": "lexicon", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
|
64 |
+
},
|
65 |
+
"model": {
|
66 |
+
// Settings for transformer
|
67 |
+
"transformer": {
|
68 |
+
"encoder_layer": 4,
|
69 |
+
"encoder_head": 2,
|
70 |
+
"encoder_hidden": 256,
|
71 |
+
"decoder_layer": 6,
|
72 |
+
"decoder_head": 2,
|
73 |
+
"decoder_hidden": 256,
|
74 |
+
"conv_filter_size": 1024,
|
75 |
+
"conv_kernel_size": [9, 1],
|
76 |
+
"encoder_dropout": 0.2,
|
77 |
+
"decoder_dropout": 0.2
|
78 |
+
},
|
79 |
+
|
80 |
+
// Settings for variance_predictor
|
81 |
+
"variance_predictor":{
|
82 |
+
"filter_size": 256,
|
83 |
+
"kernel_size": 3,
|
84 |
+
"dropout": 0.5
|
85 |
+
},
|
86 |
+
"variance_embedding":{
|
87 |
+
"pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
|
88 |
+
"energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
|
89 |
+
"n_bins": 256
|
90 |
+
},
|
91 |
+
"max_seq_len": 1000
|
92 |
+
},
|
93 |
+
"train":{
|
94 |
+
"batch_size": 16,
|
95 |
+
"sort_sample": true,
|
96 |
+
"drop_last": true,
|
97 |
+
"group_size": 4,
|
98 |
+
"grad_clip_thresh": 1.0,
|
99 |
+
"dataloader": {
|
100 |
+
"num_worker": 8,
|
101 |
+
"pin_memory": true
|
102 |
+
},
|
103 |
+
"lr_scheduler":{
|
104 |
+
"num_warmup": 4000
|
105 |
+
},
|
106 |
+
// LR Scheduler
|
107 |
+
"scheduler": "NoamLR",
|
108 |
+
// Optimizer
|
109 |
+
"optimizer": "Adam",
|
110 |
+
"adam": {
|
111 |
+
"lr": 0.0625,
|
112 |
+
"betas": [0.9, 0.98],
|
113 |
+
"eps": 0.000000001,
|
114 |
+
"weight_decay": 0.0
|
115 |
+
},
|
116 |
+
}
|
117 |
+
|
118 |
+
}
|
config/ns2.json
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"model_type": "NaturalSpeech2",
|
4 |
+
"dataset": ["LibriTTS"],
|
5 |
+
"preprocess": {
|
6 |
+
"use_mel": false,
|
7 |
+
"use_code": true,
|
8 |
+
"use_spkid": true,
|
9 |
+
"use_pitch": true,
|
10 |
+
"use_duration": true,
|
11 |
+
"use_phone": true,
|
12 |
+
"use_len": true,
|
13 |
+
"use_cross_reference": true,
|
14 |
+
"train_file": "train.json",
|
15 |
+
"melspec_dir": "mel",
|
16 |
+
"code_dir": "code",
|
17 |
+
"pitch_dir": "pitch",
|
18 |
+
"duration_dir": "duration",
|
19 |
+
"clip_mode": "start"
|
20 |
+
},
|
21 |
+
"model": {
|
22 |
+
"latent_dim": 128,
|
23 |
+
"prior_encoder": {
|
24 |
+
"vocab_size": 100,
|
25 |
+
"pitch_min": 50,
|
26 |
+
"pitch_max": 1100,
|
27 |
+
"pitch_bins_num": 512,
|
28 |
+
"encoder": {
|
29 |
+
"encoder_layer": 6,
|
30 |
+
"encoder_hidden": 512,
|
31 |
+
"encoder_head": 8,
|
32 |
+
"conv_filter_size": 2048,
|
33 |
+
"conv_kernel_size": 9,
|
34 |
+
"encoder_dropout": 0.2,
|
35 |
+
"use_cln": true
|
36 |
+
},
|
37 |
+
"duration_predictor": {
|
38 |
+
"input_size": 512,
|
39 |
+
"filter_size": 512,
|
40 |
+
"kernel_size": 3,
|
41 |
+
"conv_layers": 30,
|
42 |
+
"cross_attn_per_layer": 3,
|
43 |
+
"attn_head": 8,
|
44 |
+
"drop_out": 0.5
|
45 |
+
},
|
46 |
+
"pitch_predictor": {
|
47 |
+
"input_size": 512,
|
48 |
+
"filter_size": 512,
|
49 |
+
"kernel_size": 5,
|
50 |
+
"conv_layers": 30,
|
51 |
+
"cross_attn_per_layer": 3,
|
52 |
+
"attn_head": 8,
|
53 |
+
"drop_out": 0.5
|
54 |
+
}
|
55 |
+
},
|
56 |
+
"diffusion": {
|
57 |
+
"wavenet": {
|
58 |
+
"input_size": 128,
|
59 |
+
"hidden_size": 512,
|
60 |
+
"out_size": 128,
|
61 |
+
"num_layers": 40,
|
62 |
+
"cross_attn_per_layer": 3,
|
63 |
+
"dilation_cycle": 2,
|
64 |
+
"attn_head": 8,
|
65 |
+
"drop_out": 0.2
|
66 |
+
},
|
67 |
+
"beta_min": 0.05,
|
68 |
+
"beta_max": 20,
|
69 |
+
"sigma": 1.0,
|
70 |
+
"noise_factor": 1.0,
|
71 |
+
"ode_solver": "euler"
|
72 |
+
},
|
73 |
+
"prompt_encoder": {
|
74 |
+
"encoder_layer": 6,
|
75 |
+
"encoder_hidden": 512,
|
76 |
+
"encoder_head": 8,
|
77 |
+
"conv_filter_size": 2048,
|
78 |
+
"conv_kernel_size": 9,
|
79 |
+
"encoder_dropout": 0.2,
|
80 |
+
"use_cln": false
|
81 |
+
},
|
82 |
+
"query_emb": {
|
83 |
+
"query_token_num": 32,
|
84 |
+
"hidden_size": 512,
|
85 |
+
"head_num": 8
|
86 |
+
}
|
87 |
+
}
|
88 |
+
}
|
config/transformer.json
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"model_type": "Transformer",
|
4 |
+
"task_type": "svc",
|
5 |
+
"use_custom_dataset": false,
|
6 |
+
"preprocess": {
|
7 |
+
// data augmentations
|
8 |
+
"use_pitch_shift": false,
|
9 |
+
"use_formant_shift": false,
|
10 |
+
"use_time_stretch": false,
|
11 |
+
"use_equalizer": false,
|
12 |
+
// acoustic features
|
13 |
+
"extract_mel": true,
|
14 |
+
"mel_min_max_norm": true,
|
15 |
+
"extract_pitch": true,
|
16 |
+
"pitch_extractor": "parselmouth",
|
17 |
+
"extract_uv": true,
|
18 |
+
"extract_energy": true,
|
19 |
+
// content features
|
20 |
+
"extract_whisper_feature": false,
|
21 |
+
"whisper_sample_rate": 16000,
|
22 |
+
"extract_contentvec_feature": false,
|
23 |
+
"contentvec_sample_rate": 16000,
|
24 |
+
"extract_wenet_feature": false,
|
25 |
+
"wenet_sample_rate": 16000,
|
26 |
+
"extract_mert_feature": false,
|
27 |
+
"mert_sample_rate": 16000,
|
28 |
+
// Default config for whisper
|
29 |
+
"whisper_frameshift": 0.01,
|
30 |
+
"whisper_downsample_rate": 2,
|
31 |
+
// Default config for content vector
|
32 |
+
"contentvec_frameshift": 0.02,
|
33 |
+
// Default config for mert
|
34 |
+
"mert_model": "m-a-p/MERT-v1-330M",
|
35 |
+
"mert_feature_layer": -1,
|
36 |
+
"mert_hop_size": 320,
|
37 |
+
// 24k
|
38 |
+
"mert_frameshit": 0.01333,
|
39 |
+
// 10ms
|
40 |
+
"wenet_frameshift": 0.01,
|
41 |
+
// wenetspeech is 4, gigaspeech is 6
|
42 |
+
"wenet_downsample_rate": 4,
|
43 |
+
// Default config
|
44 |
+
"n_mel": 100,
|
45 |
+
"win_size": 1024,
|
46 |
+
// todo
|
47 |
+
"hop_size": 256,
|
48 |
+
"sample_rate": 24000,
|
49 |
+
"n_fft": 1024,
|
50 |
+
// todo
|
51 |
+
"fmin": 0,
|
52 |
+
"fmax": 12000,
|
53 |
+
// todo
|
54 |
+
"f0_min": 50,
|
55 |
+
// ~C2
|
56 |
+
"f0_max": 1100,
|
57 |
+
//1100, // ~C6(1100), ~G5(800)
|
58 |
+
"pitch_bin": 256,
|
59 |
+
"pitch_max": 1100.0,
|
60 |
+
"pitch_min": 50.0,
|
61 |
+
"is_label": true,
|
62 |
+
"is_mu_law": true,
|
63 |
+
"bits": 8,
|
64 |
+
"mel_min_max_stats_dir": "mel_min_max_stats",
|
65 |
+
"whisper_dir": "whisper",
|
66 |
+
"contentvec_dir": "contentvec",
|
67 |
+
"wenet_dir": "wenet",
|
68 |
+
"mert_dir": "mert",
|
69 |
+
// Extract content features using dataloader
|
70 |
+
"pin_memory": true,
|
71 |
+
"num_workers": 8,
|
72 |
+
"content_feature_batch_size": 16,
|
73 |
+
// Features used for model training
|
74 |
+
"use_mel": true,
|
75 |
+
"use_min_max_norm_mel": true,
|
76 |
+
"use_frame_pitch": true,
|
77 |
+
"use_uv": true,
|
78 |
+
"use_frame_energy": true,
|
79 |
+
"use_log_scale_pitch": false,
|
80 |
+
"use_log_scale_energy": false,
|
81 |
+
"use_spkid": true,
|
82 |
+
// Meta file
|
83 |
+
"train_file": "train.json",
|
84 |
+
"valid_file": "test.json",
|
85 |
+
"spk2id": "singers.json",
|
86 |
+
"utt2spk": "utt2singer"
|
87 |
+
},
|
88 |
+
"model": {
|
89 |
+
"condition_encoder": {
|
90 |
+
"merge_mode": "add",
|
91 |
+
"input_melody_dim": 1,
|
92 |
+
"use_log_f0": true,
|
93 |
+
"n_bins_melody": 256,
|
94 |
+
//# Quantization (0 for not quantization)
|
95 |
+
"output_melody_dim": 384,
|
96 |
+
"input_loudness_dim": 1,
|
97 |
+
"use_log_loudness": true,
|
98 |
+
"n_bins_loudness": 256,
|
99 |
+
"output_loudness_dim": 384,
|
100 |
+
"use_whisper": false,
|
101 |
+
"use_contentvec": true,
|
102 |
+
"use_wenet": false,
|
103 |
+
"use_mert": false,
|
104 |
+
"whisper_dim": 1024,
|
105 |
+
"contentvec_dim": 256,
|
106 |
+
"mert_dim": 256,
|
107 |
+
"wenet_dim": 512,
|
108 |
+
"content_encoder_dim": 384,
|
109 |
+
"output_singer_dim": 384,
|
110 |
+
"singer_table_size": 512,
|
111 |
+
"output_content_dim": 384,
|
112 |
+
"use_spkid": true
|
113 |
+
},
|
114 |
+
"transformer": {
|
115 |
+
"type": "conformer",
|
116 |
+
// 'conformer' or 'transformer'
|
117 |
+
"input_dim": 384,
|
118 |
+
"output_dim": 100,
|
119 |
+
"n_heads": 2,
|
120 |
+
"n_layers": 6,
|
121 |
+
"filter_channels": 512,
|
122 |
+
"dropout": 0.1,
|
123 |
+
}
|
124 |
+
},
|
125 |
+
"train": {
|
126 |
+
// Basic settings
|
127 |
+
"batch_size": 64,
|
128 |
+
"gradient_accumulation_step": 1,
|
129 |
+
"max_epoch": -1,
|
130 |
+
// -1 means no limit
|
131 |
+
"save_checkpoint_stride": [
|
132 |
+
10,
|
133 |
+
100
|
134 |
+
],
|
135 |
+
// unit is epoch
|
136 |
+
"keep_last": [
|
137 |
+
3,
|
138 |
+
-1
|
139 |
+
],
|
140 |
+
// -1 means infinite; if only one number is given, it will be broadcast
|
141 |
+
"run_eval": [
|
142 |
+
false,
|
143 |
+
true
|
144 |
+
],
|
145 |
+
// if only one value is given, it will be broadcast
|
146 |
+
// Fix the random seed
|
147 |
+
"random_seed": 10086,
|
148 |
+
// Batchsampler
|
149 |
+
"sampler": {
|
150 |
+
"holistic_shuffle": true,
|
151 |
+
"drop_last": true
|
152 |
+
},
|
153 |
+
// Dataloader
|
154 |
+
"dataloader": {
|
155 |
+
"num_worker": 32,
|
156 |
+
"pin_memory": true
|
157 |
+
},
|
158 |
+
// Trackers
|
159 |
+
"tracker": [
|
160 |
+
"tensorboard"
|
161 |
+
// "wandb",
|
162 |
+
// "cometml",
|
163 |
+
// "mlflow",
|
164 |
+
],
|
165 |
+
// Optimizer
|
166 |
+
"optimizer": "AdamW",
|
167 |
+
"adamw": {
|
168 |
+
"lr": 4.0e-4
|
169 |
+
// nn model lr
|
170 |
+
},
|
171 |
+
// LR Scheduler
|
172 |
+
"scheduler": "ReduceLROnPlateau",
|
173 |
+
"reducelronplateau": {
|
174 |
+
"factor": 0.8,
|
175 |
+
"patience": 10,
|
176 |
+
// unit is epoch
|
177 |
+
"min_lr": 1.0e-4
|
178 |
+
}
|
179 |
+
}
|
180 |
+
}
|
config/tts.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"supported_model_type": [
|
4 |
+
"Fastspeech2",
|
5 |
+
"VITS",
|
6 |
+
"VALLE",
|
7 |
+
],
|
8 |
+
"task_type": "tts",
|
9 |
+
"preprocess": {
|
10 |
+
"language": "en-us",
|
11 |
+
// linguistic features
|
12 |
+
"extract_phone": true,
|
13 |
+
"phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
|
14 |
+
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
|
15 |
+
// Directory names of processed data or extracted features
|
16 |
+
"phone_dir": "phones",
|
17 |
+
"use_phone": true,
|
18 |
+
},
|
19 |
+
"model": {
|
20 |
+
"text_token_num": 512,
|
21 |
+
}
|
22 |
+
|
23 |
+
}
|
config/valle.json
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/tts.json",
|
3 |
+
"model_type": "VALLE",
|
4 |
+
"task_type": "tts",
|
5 |
+
"dataset": [
|
6 |
+
"libritts"
|
7 |
+
],
|
8 |
+
"preprocess": {
|
9 |
+
"extract_phone": true,
|
10 |
+
"phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals or lexicon
|
11 |
+
"extract_acoustic_token": true,
|
12 |
+
"acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac(todo)
|
13 |
+
"acoustic_token_dir": "acoutic_tokens",
|
14 |
+
"use_text": false,
|
15 |
+
"use_phone": true,
|
16 |
+
"use_acoustic_token": true,
|
17 |
+
"symbols_dict": "symbols.dict",
|
18 |
+
"min_duration": 0.5, // the duration lower bound, used to filter the audio with duration < min_duration
|
19 |
+
"max_duration": 14, // the duration upper bound, used to filter the audio with duration > max_duration
|
20 |
+
"sample_rate": 24000,
|
21 |
+
"codec_hop_size": 320
|
22 |
+
},
|
23 |
+
"model": {
|
24 |
+
"text_token_num": 512,
|
25 |
+
"audio_token_num": 1024,
|
26 |
+
"decoder_dim": 1024, // embedding dimension of the decoder model
|
27 |
+
"nhead": 16, // number of attention heads in the decoder layers
|
28 |
+
"num_decoder_layers": 12, // number of decoder layers
|
29 |
+
"norm_first": true, // pre or post Normalization.
|
30 |
+
"add_prenet": false, // whether add PreNet after Inputs
|
31 |
+
"prefix_mode": 0, // mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
|
32 |
+
"share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
|
33 |
+
"nar_scale_factor": 1, // model scale factor which will be assigned different meanings in different models
|
34 |
+
"prepend_bos": false, // whether prepend <BOS> to the acoustic tokens -> AR Decoder inputs
|
35 |
+
"num_quantizers": 8, // number of the audio quantization layers
|
36 |
+
// "scaling_xformers": false, // Apply Reworked Conformer scaling on Transformers
|
37 |
+
},
|
38 |
+
"train": {
|
39 |
+
"ddp": false,
|
40 |
+
"train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s)
|
41 |
+
"max_epoch": 20,
|
42 |
+
"optimizer": "AdamW",
|
43 |
+
"scheduler": "cosine",
|
44 |
+
"warmup_steps": 16000, // number of steps that affects how rapidly the learning rate decreases
|
45 |
+
"base_lr": 1e-4, // base learning rate
|
46 |
+
"valid_interval": 1000,
|
47 |
+
"log_epoch_step": 1000,
|
48 |
+
"save_checkpoint_stride": [
|
49 |
+
1,
|
50 |
+
1
|
51 |
+
]
|
52 |
+
}
|
53 |
+
}
|
config/vits.json
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/tts.json",
|
3 |
+
"model_type": "VITS",
|
4 |
+
"task_type": "tts",
|
5 |
+
"preprocess": {
|
6 |
+
"extract_phone": true,
|
7 |
+
"extract_mel": true,
|
8 |
+
"n_mel": 80,
|
9 |
+
"fmin": 0,
|
10 |
+
"fmax": null,
|
11 |
+
"extract_linear_spec": true,
|
12 |
+
"extract_audio": true,
|
13 |
+
"use_linear": true,
|
14 |
+
"use_mel": true,
|
15 |
+
"use_audio": true,
|
16 |
+
"use_text": false,
|
17 |
+
"use_phone": true,
|
18 |
+
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
|
19 |
+
"n_fft": 1024,
|
20 |
+
"win_size": 1024,
|
21 |
+
"hop_size": 256,
|
22 |
+
"segment_size": 8192,
|
23 |
+
"text_cleaners": [
|
24 |
+
"english_cleaners"
|
25 |
+
]
|
26 |
+
},
|
27 |
+
"model": {
|
28 |
+
"text_token_num": 512,
|
29 |
+
"inter_channels": 192,
|
30 |
+
"hidden_channels": 192,
|
31 |
+
"filter_channels": 768,
|
32 |
+
"n_heads": 2,
|
33 |
+
"n_layers": 6,
|
34 |
+
"kernel_size": 3,
|
35 |
+
"p_dropout": 0.1,
|
36 |
+
"resblock": "1",
|
37 |
+
"resblock_kernel_sizes": [
|
38 |
+
3,
|
39 |
+
7,
|
40 |
+
11
|
41 |
+
],
|
42 |
+
"resblock_dilation_sizes": [
|
43 |
+
[
|
44 |
+
1,
|
45 |
+
3,
|
46 |
+
5
|
47 |
+
],
|
48 |
+
[
|
49 |
+
1,
|
50 |
+
3,
|
51 |
+
5
|
52 |
+
],
|
53 |
+
[
|
54 |
+
1,
|
55 |
+
3,
|
56 |
+
5
|
57 |
+
]
|
58 |
+
],
|
59 |
+
"upsample_rates": [
|
60 |
+
8,
|
61 |
+
8,
|
62 |
+
2,
|
63 |
+
2
|
64 |
+
],
|
65 |
+
"upsample_initial_channel": 512,
|
66 |
+
"upsample_kernel_sizes": [
|
67 |
+
16,
|
68 |
+
16,
|
69 |
+
4,
|
70 |
+
4
|
71 |
+
],
|
72 |
+
"n_layers_q": 3,
|
73 |
+
"use_spectral_norm": false,
|
74 |
+
"n_speakers": 0, // number of speakers, will be automatically set if n_speakers is 0 and multi_speaker_training is true
|
75 |
+
"gin_channels": 256,
|
76 |
+
"use_sdp": true
|
77 |
+
},
|
78 |
+
"train": {
|
79 |
+
"fp16_run": true,
|
80 |
+
"learning_rate": 2e-4,
|
81 |
+
"betas": [
|
82 |
+
0.8,
|
83 |
+
0.99
|
84 |
+
],
|
85 |
+
"eps": 1e-9,
|
86 |
+
"batch_size": 16,
|
87 |
+
"lr_decay": 0.999875,
|
88 |
+
// "segment_size": 8192,
|
89 |
+
"init_lr_ratio": 1,
|
90 |
+
"warmup_epochs": 0,
|
91 |
+
"c_mel": 45,
|
92 |
+
"c_kl": 1.0,
|
93 |
+
"AdamW": {
|
94 |
+
"betas": [
|
95 |
+
0.8,
|
96 |
+
0.99
|
97 |
+
],
|
98 |
+
"eps": 1e-9,
|
99 |
+
}
|
100 |
+
}
|
101 |
+
}
|
config/vitssvc.json
ADDED
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"model_type": "VITS",
|
4 |
+
"task_type": "svc",
|
5 |
+
"preprocess": {
|
6 |
+
"extract_phone": false,
|
7 |
+
"extract_mel": true,
|
8 |
+
"extract_linear_spec": true,
|
9 |
+
"extract_audio": true,
|
10 |
+
"use_linear": true,
|
11 |
+
"use_mel": true,
|
12 |
+
"use_audio": true,
|
13 |
+
"use_text": false,
|
14 |
+
"use_phone": true,
|
15 |
+
|
16 |
+
"fmin": 0,
|
17 |
+
"fmax": null,
|
18 |
+
"f0_min": 50,
|
19 |
+
"f0_max": 1100,
|
20 |
+
// f0_bin in sovits
|
21 |
+
"pitch_bin": 256,
|
22 |
+
// filter_length in sovits
|
23 |
+
"n_fft": 2048,
|
24 |
+
// hop_length in sovits
|
25 |
+
"hop_size": 512,
|
26 |
+
// win_length in sovits
|
27 |
+
"win_size": 2048,
|
28 |
+
"segment_size": 8192,
|
29 |
+
"n_mel": 100,
|
30 |
+
"sample_rate": 44100,
|
31 |
+
|
32 |
+
"mel_min_max_stats_dir": "mel_min_max_stats",
|
33 |
+
"whisper_dir": "whisper",
|
34 |
+
"contentvec_dir": "contentvec",
|
35 |
+
"wenet_dir": "wenet",
|
36 |
+
"mert_dir": "mert",
|
37 |
+
},
|
38 |
+
"model": {
|
39 |
+
"condition_encoder": {
|
40 |
+
"merge_mode": "add",
|
41 |
+
"input_melody_dim": 1,
|
42 |
+
"use_log_f0": true,
|
43 |
+
"n_bins_melody": 256,
|
44 |
+
//# Quantization (0 for not quantization)
|
45 |
+
"output_melody_dim": 196,
|
46 |
+
"input_loudness_dim": 1,
|
47 |
+
"use_log_loudness": false,
|
48 |
+
"n_bins_loudness": 256,
|
49 |
+
"output_loudness_dim": 196,
|
50 |
+
"use_whisper": false,
|
51 |
+
"use_contentvec": false,
|
52 |
+
"use_wenet": false,
|
53 |
+
"use_mert": false,
|
54 |
+
"whisper_dim": 1024,
|
55 |
+
"contentvec_dim": 256,
|
56 |
+
"mert_dim": 256,
|
57 |
+
"wenet_dim": 512,
|
58 |
+
"content_encoder_dim": 196,
|
59 |
+
"output_singer_dim": 196,
|
60 |
+
"singer_table_size": 512,
|
61 |
+
"output_content_dim": 196,
|
62 |
+
"use_spkid": true
|
63 |
+
},
|
64 |
+
"vits": {
|
65 |
+
"filter_channels": 256,
|
66 |
+
"gin_channels": 256,
|
67 |
+
"hidden_channels": 192,
|
68 |
+
"inter_channels": 192,
|
69 |
+
"kernel_size": 3,
|
70 |
+
"n_flow_layer": 4,
|
71 |
+
"n_heads": 2,
|
72 |
+
"n_layers": 6,
|
73 |
+
"n_layers_q": 3,
|
74 |
+
"n_speakers": 512,
|
75 |
+
"p_dropout": 0.1,
|
76 |
+
"ssl_dim": 256,
|
77 |
+
"use_spectral_norm": false,
|
78 |
+
},
|
79 |
+
"generator": "hifigan",
|
80 |
+
"generator_config": {
|
81 |
+
"hifigan": {
|
82 |
+
"resblock": "1",
|
83 |
+
"resblock_kernel_sizes": [
|
84 |
+
3,
|
85 |
+
7,
|
86 |
+
11
|
87 |
+
],
|
88 |
+
"upsample_rates": [
|
89 |
+
8,8,2,2,2
|
90 |
+
],
|
91 |
+
"upsample_kernel_sizes": [
|
92 |
+
16,16,4,4,4
|
93 |
+
],
|
94 |
+
"upsample_initial_channel": 512,
|
95 |
+
"resblock_dilation_sizes": [
|
96 |
+
[1,3,5],
|
97 |
+
[1,3,5],
|
98 |
+
[1,3,5]
|
99 |
+
]
|
100 |
+
},
|
101 |
+
"melgan": {
|
102 |
+
"ratios": [8, 8, 2, 2, 2],
|
103 |
+
"ngf": 32,
|
104 |
+
"n_residual_layers": 3,
|
105 |
+
"num_D": 3,
|
106 |
+
"ndf": 16,
|
107 |
+
"n_layers": 4,
|
108 |
+
"downsampling_factor": 4
|
109 |
+
},
|
110 |
+
"bigvgan": {
|
111 |
+
"resblock": "1",
|
112 |
+
"activation": "snakebeta",
|
113 |
+
"snake_logscale": true,
|
114 |
+
"upsample_rates": [
|
115 |
+
8,8,2,2,2,
|
116 |
+
],
|
117 |
+
"upsample_kernel_sizes": [
|
118 |
+
16,16,4,4,4,
|
119 |
+
],
|
120 |
+
"upsample_initial_channel": 512,
|
121 |
+
"resblock_kernel_sizes": [
|
122 |
+
3,
|
123 |
+
7,
|
124 |
+
11
|
125 |
+
],
|
126 |
+
"resblock_dilation_sizes": [
|
127 |
+
[1,3,5],
|
128 |
+
[1,3,5],
|
129 |
+
[1,3,5]
|
130 |
+
]
|
131 |
+
},
|
132 |
+
"nsfhifigan": {
|
133 |
+
"resblock": "1",
|
134 |
+
"harmonic_num": 8,
|
135 |
+
"upsample_rates": [
|
136 |
+
8,8,2,2,2,
|
137 |
+
],
|
138 |
+
"upsample_kernel_sizes": [
|
139 |
+
16,16,4,4,4,
|
140 |
+
],
|
141 |
+
"upsample_initial_channel": 768,
|
142 |
+
"resblock_kernel_sizes": [
|
143 |
+
3,
|
144 |
+
7,
|
145 |
+
11
|
146 |
+
],
|
147 |
+
"resblock_dilation_sizes": [
|
148 |
+
[1,3,5],
|
149 |
+
[1,3,5],
|
150 |
+
[1,3,5]
|
151 |
+
]
|
152 |
+
},
|
153 |
+
"apnet": {
|
154 |
+
"ASP_channel": 512,
|
155 |
+
"ASP_resblock_kernel_sizes": [3,7,11],
|
156 |
+
"ASP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
157 |
+
"ASP_input_conv_kernel_size": 7,
|
158 |
+
"ASP_output_conv_kernel_size": 7,
|
159 |
+
|
160 |
+
"PSP_channel": 512,
|
161 |
+
"PSP_resblock_kernel_sizes": [3,7,11],
|
162 |
+
"PSP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
163 |
+
"PSP_input_conv_kernel_size": 7,
|
164 |
+
"PSP_output_R_conv_kernel_size": 7,
|
165 |
+
"PSP_output_I_conv_kernel_size": 7,
|
166 |
+
}
|
167 |
+
},
|
168 |
+
},
|
169 |
+
"train": {
|
170 |
+
"fp16_run": true,
|
171 |
+
"learning_rate": 2e-4,
|
172 |
+
"betas": [
|
173 |
+
0.8,
|
174 |
+
0.99
|
175 |
+
],
|
176 |
+
"eps": 1e-9,
|
177 |
+
"batch_size": 16,
|
178 |
+
"lr_decay": 0.999875,
|
179 |
+
// "segment_size": 8192,
|
180 |
+
"init_lr_ratio": 1,
|
181 |
+
"warmup_epochs": 0,
|
182 |
+
"c_mel": 45,
|
183 |
+
"c_kl": 1.0,
|
184 |
+
"AdamW": {
|
185 |
+
"betas": [
|
186 |
+
0.8,
|
187 |
+
0.99
|
188 |
+
],
|
189 |
+
"eps": 1e-9,
|
190 |
+
}
|
191 |
+
}
|
192 |
+
}
|
config/vocoder.json
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"dataset": [
|
4 |
+
"LJSpeech",
|
5 |
+
"LibriTTS",
|
6 |
+
"opencpop",
|
7 |
+
"m4singer",
|
8 |
+
"svcc",
|
9 |
+
"svcceval",
|
10 |
+
"pjs",
|
11 |
+
"opensinger",
|
12 |
+
"popbutfy",
|
13 |
+
"nus48e",
|
14 |
+
"popcs",
|
15 |
+
"kising",
|
16 |
+
"csd",
|
17 |
+
"opera",
|
18 |
+
"vctk",
|
19 |
+
"lijian",
|
20 |
+
"cdmusiceval"
|
21 |
+
],
|
22 |
+
"task_type": "vocoder",
|
23 |
+
"preprocess": {
|
24 |
+
// acoustic features
|
25 |
+
"extract_mel": true,
|
26 |
+
"extract_pitch": false,
|
27 |
+
"extract_uv": false,
|
28 |
+
"extract_audio": true,
|
29 |
+
"extract_label": false,
|
30 |
+
"extract_one_hot": false,
|
31 |
+
"extract_amplitude_phase": false,
|
32 |
+
"pitch_extractor": "parselmouth",
|
33 |
+
// Settings for data preprocessing
|
34 |
+
"n_mel": 100,
|
35 |
+
"win_size": 1024,
|
36 |
+
"hop_size": 256,
|
37 |
+
"sample_rate": 24000,
|
38 |
+
"n_fft": 1024,
|
39 |
+
"fmin": 0,
|
40 |
+
"fmax": 12000,
|
41 |
+
"f0_min": 50,
|
42 |
+
"f0_max": 1100,
|
43 |
+
"pitch_bin": 256,
|
44 |
+
"pitch_max": 1100.0,
|
45 |
+
"pitch_min": 50.0,
|
46 |
+
"is_mu_law": false,
|
47 |
+
"bits": 8,
|
48 |
+
"cut_mel_frame": 32,
|
49 |
+
// Directory names of processed data or extracted features
|
50 |
+
"spk2id": "singers.json",
|
51 |
+
// Features used for model training
|
52 |
+
"use_mel": true,
|
53 |
+
"use_frame_pitch": false,
|
54 |
+
"use_uv": false,
|
55 |
+
"use_audio": true,
|
56 |
+
"use_label": false,
|
57 |
+
"use_one_hot": false,
|
58 |
+
"train_file": "train.json",
|
59 |
+
"valid_file": "test.json"
|
60 |
+
},
|
61 |
+
"train": {
|
62 |
+
"random_seed": 114514,
|
63 |
+
"batch_size": 64,
|
64 |
+
"gradient_accumulation_step": 1,
|
65 |
+
"max_epoch": 1000000,
|
66 |
+
"save_checkpoint_stride": [
|
67 |
+
20
|
68 |
+
],
|
69 |
+
"run_eval": [
|
70 |
+
true
|
71 |
+
],
|
72 |
+
"sampler": {
|
73 |
+
"holistic_shuffle": true,
|
74 |
+
"drop_last": true
|
75 |
+
},
|
76 |
+
"dataloader": {
|
77 |
+
"num_worker": 4,
|
78 |
+
"pin_memory": true
|
79 |
+
},
|
80 |
+
"tracker": [
|
81 |
+
"tensorboard"
|
82 |
+
],
|
83 |
+
}
|
84 |
+
}
|
egs/datasets/README.md
ADDED
@@ -0,0 +1,381 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Datasets Format
|
2 |
+
|
3 |
+
Amphion supports the following academic datasets (sorted alphabetically):
|
4 |
+
|
5 |
+
- [Datasets Format](#datasets-format)
|
6 |
+
- [AudioCaps](#audiocaps)
|
7 |
+
- [CSD](#csd)
|
8 |
+
- [KiSing](#kising)
|
9 |
+
- [LibriTTS](#libritts)
|
10 |
+
- [LJSpeech](#ljspeech)
|
11 |
+
- [M4Singer](#m4singer)
|
12 |
+
- [NUS-48E](#nus-48e)
|
13 |
+
- [Opencpop](#opencpop)
|
14 |
+
- [OpenSinger](#opensinger)
|
15 |
+
- [Opera](#opera)
|
16 |
+
- [PopBuTFy](#popbutfy)
|
17 |
+
- [PopCS](#popcs)
|
18 |
+
- [PJS](#pjs)
|
19 |
+
- [SVCC](#svcc)
|
20 |
+
- [VCTK](#vctk)
|
21 |
+
|
22 |
+
The download link and the file structure tree of each dataset are shown below.
|
23 |
+
|
24 |
+
## AudioCaps
|
25 |
+
|
26 |
+
AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information. You can download the dataset [here](https://github.com/cdjkim/audiocaps). The file structure tree is like:
|
27 |
+
|
28 |
+
```plaintext
|
29 |
+
[AudioCaps dataset path]
|
30 |
+
┣ AudioCpas
|
31 |
+
┃ ┣ wav
|
32 |
+
┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
|
33 |
+
┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
|
34 |
+
┃ ┃ ┣ ...
|
35 |
+
```
|
36 |
+
|
37 |
+
## CSD
|
38 |
+
|
39 |
+
The official CSD dataset can be downloaded [here](https://zenodo.org/records/4785016). The file structure tree is like:
|
40 |
+
|
41 |
+
```plaintext
|
42 |
+
[CSD dataset path]
|
43 |
+
┣ english
|
44 |
+
┣ korean
|
45 |
+
┣ utterances
|
46 |
+
┃ ┣ en001a
|
47 |
+
┃ ┃ ┣ {UtteranceID}.wav
|
48 |
+
┃ ┣ en001b
|
49 |
+
┃ ┣ en002a
|
50 |
+
┃ ┣ en002b
|
51 |
+
┃ ┣ ...
|
52 |
+
┣ README
|
53 |
+
```
|
54 |
+
|
55 |
+
## KiSing
|
56 |
+
|
57 |
+
The official KiSing dataset can be downloaded [here](http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/). The file structure tree is like:
|
58 |
+
|
59 |
+
```plaintext
|
60 |
+
[KiSing dataset path]
|
61 |
+
┣ clean
|
62 |
+
┃ ┣ 421
|
63 |
+
┃ ┣ 422
|
64 |
+
┃ ┣ ...
|
65 |
+
```
|
66 |
+
|
67 |
+
## LibriTTS
|
68 |
+
|
69 |
+
The official LibriTTS dataset can be downloaded [here](https://www.openslr.org/60/). The file structure tree is like:
|
70 |
+
|
71 |
+
```plaintext
|
72 |
+
[LibriTTS dataset path]
|
73 |
+
┣ BOOKS.txt
|
74 |
+
┣ CHAPTERS.txt
|
75 |
+
┣ eval_sentences10.tsv
|
76 |
+
┣ LICENSE.txt
|
77 |
+
┣ NOTE.txt
|
78 |
+
┣ reader_book.tsv
|
79 |
+
┣ README_librispeech.txt
|
80 |
+
┣ README_libritts.txt
|
81 |
+
┣ speakers.tsv
|
82 |
+
┣ SPEAKERS.txt
|
83 |
+
┣ dev-clean (Subset)
|
84 |
+
┃ ┣ 1272 {Speaker_ID}
|
85 |
+
┃ ┃ ┣ 128104 {Chapter_ID}
|
86 |
+
┃ ┃ ┃ ┣ 1272_128104_000001_000000.normalized.txt
|
87 |
+
┃ ┃ ┃ ┣ 1272_128104_000001_000000.original.txt
|
88 |
+
┃ ┃ ┃ ┣ 1272_128104_000001_000000.wav
|
89 |
+
┃ ┃ ┃ ┣ ...
|
90 |
+
┃ ┃ ┃ ┣ 1272_128104.book.tsv
|
91 |
+
┃ ┃ ┃ ┣ 1272_128104.trans.tsv
|
92 |
+
┃ ┃ ┣ ...
|
93 |
+
┃ ┣ ...
|
94 |
+
┣ dev-other (Subset)
|
95 |
+
┃ ┣ 116 {Speaker_ID}
|
96 |
+
┃ ┃ ┣ 288045 {Chapter_ID}
|
97 |
+
┃ ┃ ┃ ┣ 116_288045_000003_000000.normalized.txt
|
98 |
+
┃ ┃ ┃ ┣ 116_288045_000003_000000.original.txt
|
99 |
+
┃ ┃ ┃ ┣ 116_288045_000003_000000.wav
|
100 |
+
┃ ┃ ┃ ┣ ...
|
101 |
+
┃ ┃ ┃ ┣ 116_288045.book.tsv
|
102 |
+
┃ ┃ ┃ ┣ 116_288045.trans.tsv
|
103 |
+
┃ ┃ ┣ ...
|
104 |
+
┃ ┣ ...
|
105 |
+
┃ ┣ ...
|
106 |
+
┣ test-clean (Subset)
|
107 |
+
┃ ┣ {Speaker_ID}
|
108 |
+
┃ ┃ ┣ {Chapter_ID}
|
109 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
|
110 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
|
111 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
|
112 |
+
┃ ┃ ┃ ┣ ...
|
113 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
|
114 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
|
115 |
+
┃ ┃ ┣ ...
|
116 |
+
┃ ┣ ...
|
117 |
+
┣ test-other
|
118 |
+
┃ ┣ {Speaker_ID}
|
119 |
+
┃ ┃ ┣ {Chapter_ID}
|
120 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
|
121 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
|
122 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
|
123 |
+
┃ ┃ ┃ ┣ ...
|
124 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
|
125 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
|
126 |
+
┃ ┃ ┣ ...
|
127 |
+
┃ ┣ ...
|
128 |
+
┣ train-clean-100
|
129 |
+
┃ ┣ {Speaker_ID}
|
130 |
+
┃ ┃ ┣ {Chapter_ID}
|
131 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
|
132 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
|
133 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
|
134 |
+
┃ ┃ ┃ ┣ ...
|
135 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
|
136 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
|
137 |
+
┃ ┃ ┣ ...
|
138 |
+
┃ ┣ ...
|
139 |
+
┣ train-clean-360
|
140 |
+
┃ ┣ {Speaker_ID}
|
141 |
+
┃ ┃ ┣ {Chapter_ID}
|
142 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
|
143 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
|
144 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
|
145 |
+
┃ ┃ ┃ ┣ ...
|
146 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
|
147 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
|
148 |
+
┃ ┃ ┣ ...
|
149 |
+
┃ ┣ ...
|
150 |
+
┣ train-other-500
|
151 |
+
┃ ┣ {Speaker_ID}
|
152 |
+
┃ ┃ ┣ {Chapter_ID}
|
153 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
|
154 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
|
155 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
|
156 |
+
┃ ┃ ┃ ┣ ...
|
157 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
|
158 |
+
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
|
159 |
+
┃ ┃ ┣ ...
|
160 |
+
┃ ┣ ...
|
161 |
+
```
|
162 |
+
|
163 |
+
|
164 |
+
## LJSpeech
|
165 |
+
|
166 |
+
The official LJSpeech dataset can be downloaded [here](https://keithito.com/LJ-Speech-Dataset/). The file structure tree is like:
|
167 |
+
|
168 |
+
```plaintext
|
169 |
+
[LJSpeech dataset path]
|
170 |
+
┣ metadata.csv
|
171 |
+
┣ wavs
|
172 |
+
┃ ┣ LJ001-0001.wav
|
173 |
+
┃ ┣ LJ001-0002.wav
|
174 |
+
┃ ┣ ...
|
175 |
+
┣ README
|
176 |
+
```
|
177 |
+
|
178 |
+
## M4Singer
|
179 |
+
|
180 |
+
The official M4Singer dataset can be downloaded [here](https://drive.google.com/file/d/1xC37E59EWRRFFLdG3aJkVqwtLDgtFNqW/view). The file structure tree is like:
|
181 |
+
|
182 |
+
```plaintext
|
183 |
+
[M4Singer dataset path]
|
184 |
+
┣ {Singer_1}#{Song_1}
|
185 |
+
┃ ┣ 0000.mid
|
186 |
+
┃ ┣ 0000.TextGrid
|
187 |
+
┃ ┣ 0000.wav
|
188 |
+
┃ ┣ ...
|
189 |
+
┣ {Singer_1}#{Song_2}
|
190 |
+
┣ ...
|
191 |
+
┣ {Singer_2}#{Song_1}
|
192 |
+
┣ {Singer_2}#{Song_2}
|
193 |
+
┣ ...
|
194 |
+
┗ meta.json
|
195 |
+
```
|
196 |
+
|
197 |
+
## NUS-48E
|
198 |
+
|
199 |
+
The official NUS-48E dataset can be downloaded [here](https://drive.google.com/drive/folders/12pP9uUl0HTVANU3IPLnumTJiRjPtVUMx). The file structure tree is like:
|
200 |
+
|
201 |
+
```plaintext
|
202 |
+
[NUS-48E dataset path]
|
203 |
+
┣ {SpeakerID}
|
204 |
+
┃ ┣ read
|
205 |
+
┃ ┃ ┣ {SongID}.txt
|
206 |
+
┃ ┃ ┣ {SongID}.wav
|
207 |
+
┃ ┃ ┣ ...
|
208 |
+
┃ ┣ sing
|
209 |
+
┃ ┃ ┣ {SongID}.txt
|
210 |
+
┃ ┃ ┣ {SongID}.wav
|
211 |
+
┃ ┃ ┣ ...
|
212 |
+
┣ ...
|
213 |
+
┣ README.txt
|
214 |
+
|
215 |
+
```
|
216 |
+
|
217 |
+
## Opencpop
|
218 |
+
|
219 |
+
The official Opencpop dataset can be downloaded [here](https://wenet.org.cn/opencpop/). The file structure tree is like:
|
220 |
+
|
221 |
+
```plaintext
|
222 |
+
[Opencpop dataset path]
|
223 |
+
┣ midis
|
224 |
+
┃ ┣ 2001.midi
|
225 |
+
┃ ┣ 2002.midi
|
226 |
+
┃ ┣ 2003.midi
|
227 |
+
┃ ┣ ...
|
228 |
+
┣ segments
|
229 |
+
┃ ┣ wavs
|
230 |
+
┃ ┃ ┣ 2001000001.wav
|
231 |
+
┃ ┃ ┣ 2001000002.wav
|
232 |
+
┃ ┃ ┣ 2001000003.wav
|
233 |
+
┃ ┃ ┣ ...
|
234 |
+
┃ ┣ test.txt
|
235 |
+
┃ ┣ train.txt
|
236 |
+
┃ ┗ transcriptions.txt
|
237 |
+
┣ textgrids
|
238 |
+
┃ ┣ 2001.TextGrid
|
239 |
+
┃ ┣ 2002.TextGrid
|
240 |
+
┃ ┣ 2003.TextGrid
|
241 |
+
┃ ┣ ...
|
242 |
+
┣ wavs
|
243 |
+
┃ ┣ 2001.wav
|
244 |
+
┃ ┣ 2002.wav
|
245 |
+
┃ ┣ 2003.wav
|
246 |
+
┃ ┣ ...
|
247 |
+
┣ TERMS_OF_ACCESS
|
248 |
+
┗ readme.md
|
249 |
+
```
|
250 |
+
|
251 |
+
## OpenSinger
|
252 |
+
|
253 |
+
The official OpenSinger dataset can be downloaded [here](https://drive.google.com/file/d/1EofoZxvalgMjZqzUEuEdleHIZ6SHtNuK/view). The file structure tree is like:
|
254 |
+
|
255 |
+
```plaintext
|
256 |
+
[OpenSinger dataset path]
|
257 |
+
┣ ManRaw
|
258 |
+
┃ ┣ {Singer_1}_{Song_1}
|
259 |
+
┃ ┃ ┣ {Singer_1}_{Song_1}_0.lab
|
260 |
+
┃ ┃ ┣ {Singer_1}_{Song_1}_0.txt
|
261 |
+
┃ ┃ ┣ {Singer_1}_{Song_1}_0.wav
|
262 |
+
┃ ┃ ┣ ...
|
263 |
+
┃ ┣ {Singer_1}_{Song_2}
|
264 |
+
┃ ┣ ...
|
265 |
+
┣ WomanRaw
|
266 |
+
┣ LICENSE
|
267 |
+
┗ README.md
|
268 |
+
```
|
269 |
+
|
270 |
+
## Opera
|
271 |
+
|
272 |
+
The official Opera dataset can be downloaded [here](http://isophonics.net/SingingVoiceDataset). The file structure tree is like:
|
273 |
+
|
274 |
+
```plaintext
|
275 |
+
[Opera dataset path]
|
276 |
+
┣ monophonic
|
277 |
+
┃ ┣ chinese
|
278 |
+
┃ ┃ ┣ {Gender}_{SingerID}
|
279 |
+
┃ ┃ ┃ ┣ {Emotion}_{SongID}.wav
|
280 |
+
┃ ┃ ┃ ┣ ...
|
281 |
+
┃ ┃ ┣ ...
|
282 |
+
┃ ┣ western
|
283 |
+
┣ polyphonic
|
284 |
+
┃ ┣ chinese
|
285 |
+
┃ ┣ western
|
286 |
+
┣ CrossculturalDataSet.xlsx
|
287 |
+
```
|
288 |
+
|
289 |
+
## PopBuTFy
|
290 |
+
|
291 |
+
The official PopBuTFy dataset can be downloaded [here](https://github.com/MoonInTheRiver/NeuralSVB). The file structure tree is like:
|
292 |
+
|
293 |
+
```plaintext
|
294 |
+
[PopBuTFy dataset path]
|
295 |
+
┣ data
|
296 |
+
┃ ┣ {SingerID}#singing#{SongName}_Amateur
|
297 |
+
┃ ┃ ┣ {SingerID}#singing#{SongName}_Amateur_{UtteranceID}.mp3
|
298 |
+
┃ ┃ ┣ ...
|
299 |
+
┃ ┣ {SingerID}#singing#{SongName}_Professional
|
300 |
+
┃ ┃ ┣ {SingerID}#singing#{SongName}_Professional_{UtteranceID}.mp3
|
301 |
+
┃ ┃ ┣ ...
|
302 |
+
┣ text_labels
|
303 |
+
┗ TERMS_OF_ACCESS
|
304 |
+
```
|
305 |
+
|
306 |
+
## PopCS
|
307 |
+
|
308 |
+
The official PopCS dataset can be downloaded [here](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md). The file structure tree is like:
|
309 |
+
|
310 |
+
```plaintext
|
311 |
+
[PopCS dataset path]
|
312 |
+
┣ popcs
|
313 |
+
┃ ┣ popcs-{SongName}
|
314 |
+
┃ ┃ ┣ {UtteranceID}_ph.txt
|
315 |
+
┃ ┃ ┣ {UtteranceID}_wf0.wav
|
316 |
+
┃ ┃ ┣ {UtteranceID}.TextGrid
|
317 |
+
┃ ┃ ┣ {UtteranceID}.txt
|
318 |
+
┃ ┃ ┣ ...
|
319 |
+
┃ ┣ ...
|
320 |
+
┗ TERMS_OF_ACCESS
|
321 |
+
```
|
322 |
+
|
323 |
+
## PJS
|
324 |
+
|
325 |
+
The official PJS dataset can be downloaded [here](https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus). The file structure tree is like:
|
326 |
+
|
327 |
+
```plaintext
|
328 |
+
[PJS dataset path]
|
329 |
+
┣ PJS_corpus_ver1.1
|
330 |
+
┃ ┣ background_noise
|
331 |
+
┃ ┣ pjs{SongID}
|
332 |
+
┃ ┃ ┣ pjs{SongID}_song.wav
|
333 |
+
┃ ┃ ┣ pjs{SongID}_speech.wav
|
334 |
+
┃ ┃ ┣ pjs{SongID}.lab
|
335 |
+
┃ ┃ ┣ pjs{SongID}.mid
|
336 |
+
┃ ┃ ┣ pjs{SongID}.musicxml
|
337 |
+
┃ ┃ ┣ pjs{SongID}.txt
|
338 |
+
┃ ┣ ...
|
339 |
+
```
|
340 |
+
|
341 |
+
## SVCC
|
342 |
+
|
343 |
+
The official SVCC dataset can be downloaded [here](https://github.com/lesterphillip/SVCC23_FastSVC/tree/main/egs/generate_dataset). The file structure tree is like:
|
344 |
+
|
345 |
+
```plaintext
|
346 |
+
[SVCC dataset path]
|
347 |
+
┣ Data
|
348 |
+
┃ ┣ CDF1
|
349 |
+
┃ ┃ ┣ 10001.wav
|
350 |
+
┃ ┃ ┣ 10002.wav
|
351 |
+
┃ ┃ ┣ ...
|
352 |
+
┃ ┣ CDM1
|
353 |
+
┃ ┣ IDF1
|
354 |
+
┃ ┣ IDM1
|
355 |
+
┗ README.md
|
356 |
+
```
|
357 |
+
|
358 |
+
## VCTK
|
359 |
+
|
360 |
+
The official VCTK dataset can be downloaded [here](https://datashare.ed.ac.uk/handle/10283/3443). The file structure tree is like:
|
361 |
+
|
362 |
+
```plaintext
|
363 |
+
[VCTK dataset path]
|
364 |
+
┣ txt
|
365 |
+
┃ ┣ {Speaker_1}
|
366 |
+
┃ ┃ ┣ {Speaker_1}_001.txt
|
367 |
+
┃ ┃ ┣ {Speaker_1}_002.txt
|
368 |
+
┃ ┃ ┣ ...
|
369 |
+
┃ ┣ {Speaker_2}
|
370 |
+
┃ ┣ ...
|
371 |
+
┣ wav48_silence_trimmed
|
372 |
+
┃ ┣ {Speaker_1}
|
373 |
+
┃ ┃ ┣ {Speaker_1}_001_mic1.flac
|
374 |
+
┃ ┃ ┣ {Speaker_1}_001_mic2.flac
|
375 |
+
┃ ┃ ┣ {Speaker_1}_002_mic1.flac
|
376 |
+
┃ ┃ ┣ ...
|
377 |
+
┃ ┣ {Speaker_2}
|
378 |
+
┃ ┣ ...
|
379 |
+
┣ speaker-info.txt
|
380 |
+
┗ update.txt
|
381 |
+
```
|
egs/metrics/README.md
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Amphion Evaluation Recipe
|
2 |
+
|
3 |
+
## Supported Evaluation Metrics
|
4 |
+
|
5 |
+
Currently, Amphion Evaluation supports the following objective metrics:
|
6 |
+
|
7 |
+
- **F0 Modeling**:
|
8 |
+
- F0 Pearson Coefficients (FPC)
|
9 |
+
- F0 Periodicity Root Mean Square Error (PeriodicityRMSE)
|
10 |
+
- F0 Root Mean Square Error (F0RMSE)
|
11 |
+
- Voiced/Unvoiced F1 Score (V/UV F1)
|
12 |
+
- **Energy Modeling**:
|
13 |
+
- Energy Root Mean Square Error (EnergyRMSE)
|
14 |
+
- Energy Pearson Coefficients (EnergyPC)
|
15 |
+
- **Intelligibility**:
|
16 |
+
- Character Error Rate (CER) based on [Whisper](https://github.com/openai/whisper)
|
17 |
+
- Word Error Rate (WER) based on [Whisper](https://github.com/openai/whisper)
|
18 |
+
- **Spectrogram Distortion**:
|
19 |
+
- Frechet Audio Distance (FAD)
|
20 |
+
- Mel Cepstral Distortion (MCD)
|
21 |
+
- Multi-Resolution STFT Distance (MSTFT)
|
22 |
+
- Perceptual Evaluation of Speech Quality (PESQ)
|
23 |
+
- Short Time Objective Intelligibility (STOI)
|
24 |
+
- Scale Invariant Signal to Distortion Ratio (SISDR)
|
25 |
+
- Scale Invariant Signal to Noise Ratio (SISNR)
|
26 |
+
- **Speaker Similarity**:
|
27 |
+
- Cosine similarity based on [RawNet3](https://github.com/Jungjee/RawNet)
|
28 |
+
- Cosine similarity based on [WeSpeaker](https://github.com/wenet-e2e/wespeaker) (👨‍💻 under development)
|
29 |
+
|
30 |
+
We provide a recipe to demonstrate how to objectively evaluate your generated audios. There are three steps in total:
|
31 |
+
|
32 |
+
1. Pretrained Models Preparation
|
33 |
+
2. Audio Data Preparation
|
34 |
+
3. Evaluation
|
35 |
+
|
36 |
+
## 1. Pretrained Models Preparation
|
37 |
+
|
38 |
+
If you want to calculate `RawNet3` based speaker similarity, you need to download the pretrained model first, as illustrated [here](../../pretrained/README.md).
|
39 |
+
|
40 |
+
## 2. Audio Data Preparation
|
41 |
+
|
42 |
+
Prepare the reference audios and the generated audios in two folders: `ref_dir` contains the reference audios and `gen_dir` contains the generated audios. Here is an example:
|
43 |
+
|
44 |
+
```plaintext
|
45 |
+
┣ {ref_dir}
|
46 |
+
┃ ┣ sample1.wav
|
47 |
+
┃ ┣ sample2.wav
|
48 |
+
┣ {gen_dir}
|
49 |
+
┃ ┣ sample1.wav
|
50 |
+
┃ ┣ sample2.wav
|
51 |
+
```
|
52 |
+
|
53 |
+
You have to make sure that the pairwise **reference audio and generated audio are named the same**, as illustrated above (sample1 to sample1, sample2 to sample2).
|
54 |
+
|
55 |
+
## 3. Evaluation
|
56 |
+
|
57 |
+
Run `run.sh` with the reference folder, generated folder, dump folder, and metrics specified.
|
58 |
+
|
59 |
+
```bash
|
60 |
+
cd Amphion
|
61 |
+
sh egs/metrics/run.sh \
|
62 |
+
--reference_folder [Your path to the reference audios] \
|
63 |
+
--generated_folder [Your path to the generated audios] \
|
64 |
+
--dump_folder [Your path to dump the objective results] \
|
65 |
+
--metrics [The metrics you need] \
|
66 |
+
--fs [Optional. To calculate all metrics in the specified sampling rate]
|
67 |
+
```
|
68 |
+
|
69 |
+
As for the metrics, an example is provided below:
|
70 |
+
|
71 |
+
```bash
|
72 |
+
--metrics "mcd pesq fad"
|
73 |
+
```
|
74 |
+
|
75 |
+
All currently available metrics keywords are listed below:
|
76 |
+
|
77 |
+
| Keys | Description |
|
78 |
+
| --------------------- | ------------------------------------------ |
|
79 |
+
| `fpc` | F0 Pearson Coefficients |
|
80 |
+
| `f0_periodicity_rmse` | F0 Periodicity Root Mean Square Error |
|
81 |
+
| `f0rmse` | F0 Root Mean Square Error |
|
82 |
+
| `v_uv_f1` | Voiced/Unvoiced F1 Score |
|
83 |
+
| `energy_rmse` | Energy Root Mean Square Error |
|
84 |
+
| `energy_pc` | Energy Pearson Coefficients |
|
85 |
+
| `cer` | Character Error Rate |
|
86 |
+
| `wer` | Word Error Rate |
|
87 |
+
| `speaker_similarity` | Cos Similarity based on RawNet3 |
|
88 |
+
| `fad` | Frechet Audio Distance |
|
89 |
+
| `mcd` | Mel Cepstral Distortion |
|
90 |
+
| `mstft` | Multi-Resolution STFT Distance |
|
91 |
+
| `pesq` | Perceptual Evaluation of Speech Quality |
|
92 |
+
| `si_sdr` | Scale Invariant Signal to Distortion Ratio |
|
93 |
+
| `si_snr` | Scale Invariant Signal to Noise Ratio |
|
94 |
+
| `stoi` | Short Time Objective Intelligibility |
|
egs/metrics/run.sh
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
######## Build Experiment Environment ###########
|
7 |
+
# Resolve the directory holding this script and the repository root two
# levels above it. Everything is quoted so paths containing spaces work.
exp_dir=$(cd "$(dirname "$0")" && pwd)
work_dir=$(dirname "$(dirname "$exp_dir")")

export WORK_DIR="$work_dir"
export PYTHONPATH="$work_dir"
export PYTHONIOENCODING=UTF-8

######## Parse the Given Parameters from the Command ###########
# Stop immediately if getopt rejects the command line; it has already
# printed its own diagnostic in that case.
options=$(getopt -o c:n:s --long gpu:,reference_folder:,generated_folder:,dump_folder:,metrics:,fs: -- "$@") || exit 1
eval set -- "$options"

while true; do
    case $1 in
        # Visible GPU devices (defaults to "0" below when not given)
        --gpu) shift; gpu=$1 ; shift ;;
        # Reference Audio Folder
        --reference_folder) shift; ref_dir=$1 ; shift ;;
        # Generated Audio Folder
        --generated_folder) shift; deg_dir=$1 ; shift ;;
        # Result Dumping Folder
        --dump_folder) shift; dump_dir=$1 ; shift ;;
        # Metrics to Compute
        --metrics) shift; metrics=$1 ; shift ;;
        # Sampling Rate
        --fs) shift; fs=$1 ; shift ;;

        --) shift ; break ;;
        # Any option without an arm above must terminate the script; without
        # the exit this loop would spin forever on the same "$1".
        *) echo "Invalid option: $1" >&2; exit 1 ;;
    esac
done

# Use GPU 0 unless the caller picked devices explicitly; an empty
# CUDA_VISIBLE_DEVICES would hide every GPU from the Python process.
if [ -z "$gpu" ]; then
    gpu="0"
fi

# Fail early with a readable message instead of handing empty values to Python.
if [ -z "$ref_dir" ] || [ -z "$deg_dir" ] || [ -z "$dump_dir" ] || [ -z "$metrics" ]; then
    echo "Usage: sh $0 --reference_folder DIR --generated_folder DIR --dump_folder DIR --metrics \"m1 m2 ...\" [--fs RATE] [--gpu IDS]" >&2
    exit 1
fi

# --fs is optional: forward it only when a value was supplied. The positional
# parameters are reused as the optional-argument list (POSIX sh has no arrays).
if [ -n "$fs" ]; then
    set -- --fs "$fs"
else
    set --
fi

######## Calculate Objective Metrics ###########
# $metrics is deliberately left unquoted so that a space-separated list such
# as "mcd pesq fad" is split into separate words, as the original script did.
CUDA_VISIBLE_DEVICES=$gpu python "$work_dir"/bins/calc_metrics.py \
    --ref_dir "$ref_dir" \
    --deg_dir "$deg_dir" \
    --dump_dir "$dump_dir" \
    --metrics $metrics \
    "$@"
|
egs/svc/DiffComoSVC/README.md
ADDED
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Accelerating Diffusion-based Singing Voice Conversion through Consistency Distillation
|
2 |
+
<br>
|
3 |
+
<div align="center">
|
4 |
+
<img src="../../../imgs/svc/DiffComoSVC.png" width="90%">
|
5 |
+
</div>
|
6 |
+
<br>
|
7 |
+
|
8 |
+
This is an implementation of [Consistency Models](https://arxiv.org/abs/2303.01469) for accelerating diffusion-based singing voice conversion. The overall architecture follows "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio), with only a slight modification to the acoustic model. Specifically:
|
9 |
+
|
10 |
+
* The acoustic model consists of a conformer, which generates a coarse spectrogram, and a diffusion decoder based on a Bidirectional Non-Causal Dilated CNN, which refines that coarse spectrogram. This is similar to [CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency Model](https://comospeech.github.io/).
|
11 |
+
* To accelerate the diffusion model, we apply consistency distillation from [Consistency Models](https://arxiv.org/abs/2303.01469). For the teacher model, the diffusion schedule of the diffusion decoder follows [Karras diffusion](https://arxiv.org/abs/2206.00364). When distilling the teacher model, the condition encoder and the conformer part of the acoustic model are frozen, while the diffusion decoder is updated via exponential moving average. See the figure above for details.
|
12 |
+
|
13 |
+
There are five stages in total:
|
14 |
+
|
15 |
+
1. Data preparation
|
16 |
+
2. Features extraction
|
17 |
+
3. Teacher Model Training
|
18 |
+
4. Consistency Distillation
|
19 |
+
5. Inference/conversion
|
20 |
+
|
21 |
+
## 1. Data Preparation
|
22 |
+
|
23 |
+
### Dataset Download
|
24 |
+
|
25 |
+
By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
|
26 |
+
|
27 |
+
### Configuration
|
28 |
+
|
29 |
+
Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
|
30 |
+
|
31 |
+
```json
|
32 |
+
"dataset": [
|
33 |
+
"m4singer",
|
34 |
+
"opencpop",
|
35 |
+
"opensinger",
|
36 |
+
"svcc",
|
37 |
+
"vctk"
|
38 |
+
],
|
39 |
+
"dataset_path": {
|
40 |
+
// TODO: Fill in your dataset path
|
41 |
+
"m4singer": "[M4Singer dataset path]",
|
42 |
+
"opencpop": "[Opencpop dataset path]",
|
43 |
+
"opensinger": "[OpenSinger dataset path]",
|
44 |
+
"svcc": "[SVCC dataset path]",
|
45 |
+
"vctk": "[VCTK dataset path]"
|
46 |
+
},
|
47 |
+
```
|
48 |
+
|
49 |
+
## 2. Features Extraction
|
50 |
+
|
51 |
+
### Content-based Pretrained Models Download
|
52 |
+
|
53 |
+
By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
|
54 |
+
|
55 |
+
### Configuration
|
56 |
+
|
57 |
+
Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
|
58 |
+
|
59 |
+
```json
|
60 |
+
// TODO: Fill in the output log path
|
61 |
+
"log_dir": "[Your path to save logs and checkpoints]",
|
62 |
+
"preprocess": {
|
63 |
+
// TODO: Fill in the output data path
|
64 |
+
"processed_dir": "[Your path to save processed data]",
|
65 |
+
...
|
66 |
+
},
|
67 |
+
```
|
68 |
+
|
69 |
+
### Run
|
70 |
+
|
71 |
+
Run the `run.sh` as the preprocess stage (set `--stage 1`).
|
72 |
+
|
73 |
+
```bash
|
74 |
+
cd Amphion
|
75 |
+
sh egs/svc/DiffComoSVC/run.sh --stage 1
|
76 |
+
```
|
77 |
+
|
78 |
+
Note: `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
|
79 |
+
|
80 |
+
## 3. Teacher Model Training
|
81 |
+
|
82 |
+
### Configuration
|
83 |
+
|
84 |
+
Set the `distill` in `config/comosvc.json` to `false` for teacher model training. You can also specify the detailed configuration for the conformer encoder and the diffusion process here:
|
85 |
+
|
86 |
+
```JSON
|
87 |
+
"comosvc":{
|
88 |
+
"distill": false,
|
89 |
+
// conformer encoder
|
90 |
+
"input_dim": 384,
|
91 |
+
"output_dim": 100,
|
92 |
+
"n_heads": 2,
|
93 |
+
"n_layers": 6,
|
94 |
+
"filter_channels":512,
|
95 |
+
// karras diffusion
|
96 |
+
"P_mean": -1.2,
|
97 |
+
"P_std": 1.2,
|
98 |
+
"sigma_data": 0.5,
|
99 |
+
"sigma_min": 0.002,
|
100 |
+
"sigma_max": 80,
|
101 |
+
"rho": 7,
|
102 |
+
"n_timesteps": 40,
|
103 |
+
},
|
104 |
+
```
|
105 |
+
|
106 |
+
We provide the default hyperparameters in the `exp_config.json`. They can work on a single NVIDIA GPU with 24 GB of memory. You can adjust them based on your GPU machines.
|
107 |
+
|
108 |
+
```json
|
109 |
+
"train": {
|
110 |
+
"batch_size": 32,
|
111 |
+
...
|
112 |
+
"adamw": {
|
113 |
+
"lr": 2.0e-4
|
114 |
+
},
|
115 |
+
...
|
116 |
+
}
|
117 |
+
```
|
118 |
+
|
119 |
+
### Run
|
120 |
+
|
121 |
+
Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.
|
122 |
+
|
123 |
+
```bash
|
124 |
+
cd Amphion
|
125 |
+
sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
|
126 |
+
```
|
127 |
+
|
128 |
+
Note: `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can specify it when running `run.sh`, for example:
|
129 |
+
|
130 |
+
```bash
|
131 |
+
cd Amphion
|
132 |
+
sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
|
133 |
+
```
|
134 |
+
|
135 |
+
## 4. Consistency Distillation
|
136 |
+
|
137 |
+
### Configuration
|
138 |
+
|
139 |
+
Set the `distill` in `config/comosvc.json` to `true` for consistency distillation, and specify the `teacher_model_path` of the trained teacher model. You can also specify the detailed configuration for the conformer encoder and the diffusion process here:
|
140 |
+
|
141 |
+
```JSON
|
142 |
+
"model": {
|
143 |
+
"teacher_model_path":"[Your_teacher_model_checkpoint].bin",
|
144 |
+
...
|
145 |
+
"comosvc":{
|
146 |
+
"distill": true,
|
147 |
+
// conformer encoder
|
148 |
+
"input_dim": 384,
|
149 |
+
"output_dim": 100,
|
150 |
+
"n_heads": 2,
|
151 |
+
"n_layers": 6,
|
152 |
+
"filter_channels":512,
|
153 |
+
// karras diffusion
|
154 |
+
"P_mean": -1.2,
|
155 |
+
"P_std": 1.2,
|
156 |
+
"sigma_data": 0.5,
|
157 |
+
"sigma_min": 0.002,
|
158 |
+
"sigma_max": 80,
|
159 |
+
"rho": 7,
|
160 |
+
"n_timesteps": 40,
|
161 |
+
},
|
162 |
+
```
|
163 |
+
|
164 |
+
We provide the default hyperparameters in the `exp_config.json`. They can work on a single NVIDIA GPU with 24 GB of memory. You can adjust them based on your GPU machines.
|
165 |
+
|
166 |
+
```json
|
167 |
+
"train": {
|
168 |
+
"batch_size": 32,
|
169 |
+
...
|
170 |
+
"adamw": {
|
171 |
+
"lr": 2.0e-4
|
172 |
+
},
|
173 |
+
...
|
174 |
+
}
|
175 |
+
```
|
176 |
+
|
177 |
+
### Run
|
178 |
+
|
179 |
+
Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.
|
180 |
+
|
181 |
+
```bash
|
182 |
+
cd Amphion
|
183 |
+
sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
|
184 |
+
```
|
185 |
+
|
186 |
+
Note: `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can specify it when running `run.sh`, for example:
|
187 |
+
|
188 |
+
```bash
|
189 |
+
cd Amphion
|
190 |
+
sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
|
191 |
+
```
|
192 |
+
|
193 |
+
## 5. Inference/Conversion
|
194 |
+
|
195 |
+
### Pretrained Vocoder Download
|
196 |
+
|
197 |
+
We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
|
198 |
+
|
199 |
+
### Run
|
200 |
+
|
201 |
+
For inference/conversion, you need to specify the following configurations when running `run.sh`:
|
202 |
+
|
203 |
+
| Parameters | Description | Example |
|
204 |
+
| --------------------------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
|
205 |
+
| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
|
206 |
+
| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
|
207 |
+
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
|
208 |
+
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
|
209 |
+
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
|
210 |
+
|
211 |
+
For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
|
212 |
+
|
213 |
+
```bash
|
214 |
+
cd Amphion
|
215 |
+
sh egs/svc/DiffComoSVC/run.sh --stage 3 --gpu "0" \
|
216 |
+
--infer_expt_dir [Your path to save logs and checkpoints]/[YourExptName] \
|
217 |
+
--infer_output_dir [Your path to save logs and checkpoints]/[YourExptName]/result \
|
218 |
+
--infer_source_audio_dir [Your Audios Folder] \
|
219 |
+
--infer_target_speaker "opencpop_female1" \
|
220 |
+
--infer_key_shift "autoshift"
|
221 |
+
```
|
222 |
+
In particular, you can configure the number of inference steps for the teacher model by setting `inference` in `exp_config.json` (the student model always uses one-step sampling):
|
223 |
+
```json
|
224 |
+
"inference": {
|
225 |
+
"comosvc": {
|
226 |
+
"inference_steps": 40
|
227 |
+
}
|
228 |
+
}
|
229 |
+
```
|
230 |
+
|
231 |
+
# Reference
|
232 |
+
https://github.com/zhenye234/CoMoSpeech
|
233 |
+
|
234 |
+
https://github.com/openai/consistency_models
|
egs/svc/DiffComoSVC/exp_config.json
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/comosvc.json",
|
3 |
+
"model_type": "DiffComoSVC",
|
4 |
+
"dataset": [
|
5 |
+
"m4singer",
|
6 |
+
"opencpop",
|
7 |
+
"opensinger",
|
8 |
+
"svcc",
|
9 |
+
"vctk"
|
10 |
+
],
|
11 |
+
"dataset_path": {
|
12 |
+
// TODO: Fill in your dataset path
|
13 |
+
"m4singer": "[M4Singer dataset path]",
|
14 |
+
"opencpop": "[Opencpop dataset path]",
|
15 |
+
"opensinger": "[OpenSinger dataset path]",
|
16 |
+
"svcc": "[SVCC dataset path]",
|
17 |
+
"vctk": "[VCTK dataset path]"
|
18 |
+
},
|
19 |
+
// TODO: Fill in the output log path
|
20 |
+
"log_dir": "[Your path to save logs and checkpoints]",
|
21 |
+
"preprocess": {
|
22 |
+
// TODO: Fill in the output data path
|
23 |
+
"processed_dir": "[Your path to save processed data]",
|
24 |
+
// Config for features extraction
|
25 |
+
"extract_mel": true,
|
26 |
+
"extract_pitch": true,
|
27 |
+
"extract_energy": true,
|
28 |
+
"extract_whisper_feature": true,
|
29 |
+
"extract_contentvec_feature": true,
|
30 |
+
"extract_wenet_feature": false,
|
31 |
+
"whisper_batch_size": 30, // decrease it if your GPU is out of memory
|
32 |
+
"contentvec_batch_size": 1,
|
33 |
+
// Fill in the content-based pretrained model's path
|
34 |
+
"contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
|
35 |
+
"wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
|
36 |
+
"wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
|
37 |
+
"whisper_model": "medium",
|
38 |
+
"whisper_model_path": "pretrained/whisper/medium.pt",
|
39 |
+
// Config for features usage
|
40 |
+
"use_mel": true,
|
41 |
+
"use_min_max_norm_mel": true,
|
42 |
+
"use_frame_pitch": true,
|
43 |
+
"use_frame_energy": true,
|
44 |
+
"use_spkid": true,
|
45 |
+
"use_whisper": true,
|
46 |
+
"use_contentvec": true,
|
47 |
+
"use_wenet": false,
|
48 |
+
"n_mel": 100,
|
49 |
+
"sample_rate": 24000
|
50 |
+
},
|
51 |
+
"model": {
|
52 |
+
"teacher_model_path":"[Your_teacher_model_checkpoint].bin",
|
53 |
+
"condition_encoder": {
|
54 |
+
// Config for features usage
|
55 |
+
"use_whisper": true,
|
56 |
+
"use_contentvec": true,
|
57 |
+
"use_wenet": false,
|
58 |
+
"whisper_dim": 1024,
|
59 |
+
"contentvec_dim": 256,
|
60 |
+
"wenet_dim": 512,
|
61 |
+
"use_singer_encoder": false,
|
62 |
+
"pitch_min": 50,
|
63 |
+
"pitch_max": 1100
|
64 |
+
},
|
65 |
+
"comosvc":{
|
66 |
+
"distill": false,
|
67 |
+
// conformer encoder
|
68 |
+
"input_dim": 384,
|
69 |
+
"output_dim": 100,
|
70 |
+
"n_heads": 2,
|
71 |
+
"n_layers": 6,
|
72 |
+
"filter_channels":512,
|
73 |
+
"dropout":0.1,
|
74 |
+
// karras diffusion
|
75 |
+
"P_mean": -1.2,
|
76 |
+
"P_std": 1.2,
|
77 |
+
"sigma_data": 0.5,
|
78 |
+
"sigma_min": 0.002,
|
79 |
+
"sigma_max": 80,
|
80 |
+
"rho": 7,
|
81 |
+
"n_timesteps": 40,
|
82 |
+
},
|
83 |
+
"diffusion": {
|
84 |
+
// Diffusion steps encoder
|
85 |
+
"step_encoder": {
|
86 |
+
"dim_raw_embedding": 128,
|
87 |
+
"dim_hidden_layer": 512,
|
88 |
+
"activation": "SiLU",
|
89 |
+
"num_layer": 2,
|
90 |
+
"max_period": 10000
|
91 |
+
},
|
92 |
+
// Diffusion decoder
|
93 |
+
"model_type": "bidilconv",
|
94 |
+
// bidilconv, unet2d, TODO: unet1d
|
95 |
+
"bidilconv": {
|
96 |
+
"base_channel": 384,
|
97 |
+
"n_res_block": 20,
|
98 |
+
"conv_kernel_size": 3,
|
99 |
+
"dilation_cycle_length": 4,
|
100 |
+
// specially, 1 means no dilation
|
101 |
+
"conditioner_size": 100
|
102 |
+
}
|
103 |
+
}
|
104 |
+
},
|
105 |
+
"train": {
|
106 |
+
"batch_size": 64,
|
107 |
+
"gradient_accumulation_step": 1,
|
108 |
+
"max_epoch": -1, // -1 means no limit
|
109 |
+
"save_checkpoint_stride": [
|
110 |
+
50,
|
111 |
+
50
|
112 |
+
],
|
113 |
+
"keep_last": [
|
114 |
+
5,
|
115 |
+
-1
|
116 |
+
],
|
117 |
+
"run_eval": [
|
118 |
+
false,
|
119 |
+
true
|
120 |
+
],
|
121 |
+
"adamw": {
|
122 |
+
"lr": 4.0e-4
|
123 |
+
},
|
124 |
+
"reducelronplateau": {
|
125 |
+
"factor": 0.8,
|
126 |
+
"patience": 10,
|
127 |
+
"min_lr": 1.0e-4
|
128 |
+
},
|
129 |
+
"dataloader": {
|
130 |
+
"num_worker": 8,
|
131 |
+
"pin_memory": true
|
132 |
+
},
|
133 |
+
"sampler": {
|
134 |
+
"holistic_shuffle": false,
|
135 |
+
"drop_last": true
|
136 |
+
}
|
137 |
+
},
|
138 |
+
"inference": {
|
139 |
+
"comosvc": {
|
140 |
+
"inference_steps": 40
|
141 |
+
}
|
142 |
+
}
|
143 |
+
}
|
egs/svc/DiffComoSVC/run.sh
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
../_template/run.sh
|
egs/svc/MultipleContentsSVC/README.md
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion
|
2 |
+
|
3 |
+
[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160)
|
4 |
+
[![demo](https://img.shields.io/badge/SVC-Demo-red)](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html)
|
5 |
+
|
6 |
+
<br>
|
7 |
+
<div align="center">
|
8 |
+
<img src="../../../imgs/svc/MultipleContentsSVC.png" width="85%">
|
9 |
+
</div>
|
10 |
+
<br>
|
11 |
+
|
12 |
+
This is the official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Specifically,
|
13 |
+
|
14 |
+
- The multiple content features are from [Whisper](https://github.com/openai/whisper) and [ContentVec](https://github.com/auspicious3000/contentvec).
|
15 |
+
- The acoustic model is based on Bidirectional Non-Causal Dilated CNN (called `DiffWaveNetSVC` in Amphion), which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
|
16 |
+
- The vocoder uses the [BigVGAN](https://github.com/NVIDIA/BigVGAN) architecture, and we fine-tuned it on over 120 hours of singing voice data.
|
17 |
+
|
18 |
+
There are four stages in total:
|
19 |
+
|
20 |
+
1. Data preparation
|
21 |
+
2. Features extraction
|
22 |
+
3. Training
|
23 |
+
4. Inference/conversion
|
24 |
+
|
25 |
+
> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
|
26 |
+
> ```bash
|
27 |
+
> cd Amphion
|
28 |
+
> ```
|
29 |
+
|
30 |
+
## 1. Data Preparation
|
31 |
+
|
32 |
+
### Dataset Download
|
33 |
+
|
34 |
+
By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
|
35 |
+
|
36 |
+
### Configuration
|
37 |
+
|
38 |
+
Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
|
39 |
+
|
40 |
+
```json
|
41 |
+
"dataset": [
|
42 |
+
"m4singer",
|
43 |
+
"opencpop",
|
44 |
+
"opensinger",
|
45 |
+
"svcc",
|
46 |
+
"vctk"
|
47 |
+
],
|
48 |
+
"dataset_path": {
|
49 |
+
// TODO: Fill in your dataset path
|
50 |
+
"m4singer": "[M4Singer dataset path]",
|
51 |
+
"opencpop": "[Opencpop dataset path]",
|
52 |
+
"opensinger": "[OpenSinger dataset path]",
|
53 |
+
"svcc": "[SVCC dataset path]",
|
54 |
+
"vctk": "[VCTK dataset path]"
|
55 |
+
},
|
56 |
+
```
|
57 |
+
|
58 |
+
## 2. Features Extraction
|
59 |
+
|
60 |
+
### Content-based Pretrained Models Download
|
61 |
+
|
62 |
+
By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
|
63 |
+
|
64 |
+
### Configuration
|
65 |
+
|
66 |
+
Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
|
67 |
+
|
68 |
+
```json
|
69 |
+
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
|
70 |
+
"log_dir": "ckpts/svc",
|
71 |
+
"preprocess": {
|
72 |
+
// TODO: Fill in the output data path. The default value is "Amphion/data"
|
73 |
+
"processed_dir": "data",
|
74 |
+
...
|
75 |
+
},
|
76 |
+
```
|
77 |
+
|
78 |
+
### Run
|
79 |
+
|
80 |
+
Run the `run.sh` as the preprocess stage (set `--stage 1`).
|
81 |
+
|
82 |
+
```bash
|
83 |
+
sh egs/svc/MultipleContentsSVC/run.sh --stage 1
|
84 |
+
```
|
85 |
+
|
86 |
+
> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
|
87 |
+
|
88 |
+
## 3. Training
|
89 |
+
|
90 |
+
### Configuration
|
91 |
+
|
92 |
+
We provide the default hyperparameters in the `exp_config.json`. They can work on a single NVIDIA-24g GPU. You can adjust them based on your GPU machines.
|
93 |
+
|
94 |
+
```json
|
95 |
+
"train": {
|
96 |
+
"batch_size": 32,
|
97 |
+
...
|
98 |
+
"adamw": {
|
99 |
+
"lr": 2.0e-4
|
100 |
+
},
|
101 |
+
...
|
102 |
+
}
|
103 |
+
```
|
104 |
+
|
105 |
+
### Run
|
106 |
+
|
107 |
+
Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
|
108 |
+
|
109 |
+
```bash
|
110 |
+
sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName]
|
111 |
+
```
|
112 |
+
|
113 |
+
> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
|
114 |
+
|
115 |
+
## 4. Inference/Conversion
|
116 |
+
|
117 |
+
### Pretrained Vocoder Download
|
118 |
+
|
119 |
+
We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
|
120 |
+
|
121 |
+
### Run
|
122 |
+
|
123 |
+
For inference/conversion, you need to specify the following configurations when running `run.sh`:
|
124 |
+
|
125 |
+
| Parameters | Description | Example |
|
126 |
+
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
127 |
+
| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
|
128 |
+
| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
|
129 |
+
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
|
130 |
+
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
|
131 |
+
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
|
132 |
+
|
133 |
+
For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
|
134 |
+
|
135 |
+
```bash
|
136 |
+
sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \
|
137 |
+
--infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
|
138 |
+
--infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
|
139 |
+
--infer_source_audio_dir [Your Audios Folder] \
|
140 |
+
--infer_target_speaker "opencpop_female1" \
|
141 |
+
--infer_key_shift "autoshift"
|
142 |
+
```
|
143 |
+
|
144 |
+
## Citations
|
145 |
+
|
146 |
+
```bibtex
|
147 |
+
@article{zhang2023leveraging,
|
148 |
+
title={Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion},
|
149 |
+
author={Zhang, Xueyao and Gu, Yicheng and Chen, Haopeng and Fang, Zihao and Zou, Lexiao and Xue, Liumeng and Wu, Zhizheng},
|
150 |
+
journal={Machine Learning for Audio Workshop, NeurIPS 2023},
|
151 |
+
year={2023}
|
152 |
+
}
|
153 |
+
```
|
egs/svc/MultipleContentsSVC/exp_config.json
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/diffusion.json",
|
3 |
+
"model_type": "DiffWaveNetSVC",
|
4 |
+
"dataset": [
|
5 |
+
"m4singer",
|
6 |
+
"opencpop",
|
7 |
+
"opensinger",
|
8 |
+
"svcc",
|
9 |
+
"vctk"
|
10 |
+
],
|
11 |
+
"dataset_path": {
|
12 |
+
// TODO: Fill in your dataset path
|
13 |
+
"m4singer": "[M4Singer dataset path]",
|
14 |
+
"opencpop": "[Opencpop dataset path]",
|
15 |
+
"opensinger": "[OpenSinger dataset path]",
|
16 |
+
"svcc": "[SVCC dataset path]",
|
17 |
+
"vctk": "[VCTK dataset path]"
|
18 |
+
},
|
19 |
+
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
|
20 |
+
"log_dir": "ckpts/svc",
|
21 |
+
"preprocess": {
|
22 |
+
// TODO: Fill in the output data path. The default value is "Amphion/data"
|
23 |
+
"processed_dir": "data",
|
24 |
+
// Config for features extraction
|
25 |
+
"extract_mel": true,
|
26 |
+
"extract_pitch": true,
|
27 |
+
"extract_energy": true,
|
28 |
+
"extract_whisper_feature": true,
|
29 |
+
"extract_contentvec_feature": true,
|
30 |
+
"extract_wenet_feature": false,
|
31 |
+
"whisper_batch_size": 30, // decrease it if your GPU is out of memory
|
32 |
+
"contentvec_batch_size": 1,
|
33 |
+
// Fill in the content-based pretrained model's path
|
34 |
+
"contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
|
35 |
+
"wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
|
36 |
+
"wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
|
37 |
+
"whisper_model": "medium",
|
38 |
+
"whisper_model_path": "pretrained/whisper/medium.pt",
|
39 |
+
// Config for features usage
|
40 |
+
"use_mel": true,
|
41 |
+
"use_min_max_norm_mel": true,
|
42 |
+
"use_frame_pitch": true,
|
43 |
+
"use_frame_energy": true,
|
44 |
+
"use_spkid": true,
|
45 |
+
"use_whisper": true,
|
46 |
+
"use_contentvec": true,
|
47 |
+
"use_wenet": false,
|
48 |
+
"n_mel": 100,
|
49 |
+
"sample_rate": 24000
|
50 |
+
},
|
51 |
+
"model": {
|
52 |
+
"condition_encoder": {
|
53 |
+
// Config for features usage
|
54 |
+
"use_whisper": true,
|
55 |
+
"use_contentvec": true,
|
56 |
+
"use_wenet": false,
|
57 |
+
"whisper_dim": 1024,
|
58 |
+
"contentvec_dim": 256,
|
59 |
+
"wenet_dim": 512,
|
60 |
+
"use_singer_encoder": false,
|
61 |
+
"pitch_min": 50,
|
62 |
+
"pitch_max": 1100
|
63 |
+
},
|
64 |
+
"diffusion": {
|
65 |
+
"scheduler": "ddpm",
|
66 |
+
"scheduler_settings": {
|
67 |
+
"num_train_timesteps": 1000,
|
68 |
+
"beta_start": 1.0e-4,
|
69 |
+
"beta_end": 0.02,
|
70 |
+
"beta_schedule": "linear"
|
71 |
+
},
|
72 |
+
// Diffusion steps encoder
|
73 |
+
"step_encoder": {
|
74 |
+
"dim_raw_embedding": 128,
|
75 |
+
"dim_hidden_layer": 512,
|
76 |
+
"activation": "SiLU",
|
77 |
+
"num_layer": 2,
|
78 |
+
"max_period": 10000
|
79 |
+
},
|
80 |
+
// Diffusion decoder
|
81 |
+
"model_type": "bidilconv",
|
82 |
+
// bidilconv, unet2d, TODO: unet1d
|
83 |
+
"bidilconv": {
|
84 |
+
"base_channel": 512,
|
85 |
+
"n_res_block": 40,
|
86 |
+
"conv_kernel_size": 3,
|
87 |
+
"dilation_cycle_length": 4,
|
88 |
+
// specially, 1 means no dilation
|
89 |
+
"conditioner_size": 384
|
90 |
+
}
|
91 |
+
}
|
92 |
+
},
|
93 |
+
"train": {
|
94 |
+
"batch_size": 32,
|
95 |
+
"gradient_accumulation_step": 1,
|
96 |
+
"max_epoch": -1, // -1 means no limit
|
97 |
+
"save_checkpoint_stride": [
|
98 |
+
3,
|
99 |
+
50
|
100 |
+
],
|
101 |
+
"keep_last": [
|
102 |
+
3,
|
103 |
+
2
|
104 |
+
],
|
105 |
+
"run_eval": [
|
106 |
+
true,
|
107 |
+
true
|
108 |
+
],
|
109 |
+
"adamw": {
|
110 |
+
"lr": 2.0e-4
|
111 |
+
},
|
112 |
+
"reducelronplateau": {
|
113 |
+
"factor": 0.8,
|
114 |
+
"patience": 30,
|
115 |
+
"min_lr": 1.0e-4
|
116 |
+
},
|
117 |
+
"dataloader": {
|
118 |
+
"num_worker": 8,
|
119 |
+
"pin_memory": true
|
120 |
+
},
|
121 |
+
"sampler": {
|
122 |
+
"holistic_shuffle": false,
|
123 |
+
"drop_last": true
|
124 |
+
}
|
125 |
+
}
|
126 |
+
}
|
egs/svc/MultipleContentsSVC/run.sh
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
../_template/run.sh
|
egs/svc/README.md
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Amphion Singing Voice Conversion (SVC) Recipe
|
2 |
+
|
3 |
+
## Quick Start
|
4 |
+
|
5 |
+
We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html).
|
6 |
+
|
7 |
+
## Supported Model Architectures
|
8 |
+
|
9 |
+
The main idea of SVC is to first disentangle the speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target, which usually utilizes an acoustic decoder and a subsequent waveform synthesizer (vocoder):
|
10 |
+
|
11 |
+
<br>
|
12 |
+
<div align="center">
|
13 |
+
<img src="../../imgs/svc/pipeline.png" width="70%">
|
14 |
+
</div>
|
15 |
+
<br>
|
16 |
+
|
17 |
+
Until now, Amphion SVC has supported the following features and models:
|
18 |
+
|
19 |
+
- **Speaker-agnostic Representations**:
|
20 |
+
- Content Features: Sourcing from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec).
|
21 |
+
- Prosody Features: F0 and energy.
|
22 |
+
- **Speaker Embeddings**:
|
23 |
+
- Speaker Look-Up Table.
|
24 |
+
- Reference Encoder (👨💻 developing): It can be used for zero-shot SVC.
|
25 |
+
- **Acoustic Decoders**:
|
26 |
+
- Diffusion-based models:
|
27 |
+
- **[DiffWaveNetSVC](MultipleContentsSVC)**: The encoder is based on Bidirectional Non-Causal Dilated CNN, which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
|
28 |
+
- **[DiffComoSVC](DiffComoSVC)** (👨💻 developing): The diffusion framework is based on [Consistency Model](https://proceedings.mlr.press/v202/song23a.html). It can significantly accelerate the inference process of the diffusion model.
|
29 |
+
- Transformer-based models:
|
30 |
+
- **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture.
|
31 |
+
- VAE- and Flow-based models:
|
32 |
+
- **[VitsSVC](VitsSVC)**: It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
|
33 |
+
- **Waveform Synthesizers (Vocoders)**:
|
34 |
+
- The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md).
|
egs/svc/TransformerSVC/README.md
ADDED
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Transformer for Singing Voice Conversion
|
2 |
+
|
3 |
+
This is an implementation of **vanilla transformer encoder**/**conformer** as acoustic model for singing voice conversion.
|
4 |
+
|
5 |
+
There are four stages in total:
|
6 |
+
|
7 |
+
1. Data preparation
|
8 |
+
2. Features extraction
|
9 |
+
3. Training
|
10 |
+
4. Inference/conversion
|
11 |
+
|
12 |
+
> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
|
13 |
+
> ```bash
|
14 |
+
> cd Amphion
|
15 |
+
> ```
|
16 |
+
|
17 |
+
## 1. Data Preparation
|
18 |
+
|
19 |
+
### Dataset Download
|
20 |
+
|
21 |
+
By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
|
22 |
+
|
23 |
+
### Configuration
|
24 |
+
|
25 |
+
Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
|
26 |
+
|
27 |
+
```json
|
28 |
+
"dataset": [
|
29 |
+
"m4singer",
|
30 |
+
"opencpop",
|
31 |
+
"opensinger",
|
32 |
+
"svcc",
|
33 |
+
"vctk"
|
34 |
+
],
|
35 |
+
"dataset_path": {
|
36 |
+
// TODO: Fill in your dataset path
|
37 |
+
"m4singer": "[M4Singer dataset path]",
|
38 |
+
"opencpop": "[Opencpop dataset path]",
|
39 |
+
"opensinger": "[OpenSinger dataset path]",
|
40 |
+
"svcc": "[SVCC dataset path]",
|
41 |
+
"vctk": "[VCTK dataset path]"
|
42 |
+
},
|
43 |
+
```
|
44 |
+
|
45 |
+
## 2. Features Extraction
|
46 |
+
|
47 |
+
### Content-based Pretrained Models Download
|
48 |
+
|
49 |
+
By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
|
50 |
+
|
51 |
+
### Configuration
|
52 |
+
|
53 |
+
Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
|
54 |
+
|
55 |
+
```json
|
56 |
+
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
|
57 |
+
"log_dir": "ckpts/svc",
|
58 |
+
"preprocess": {
|
59 |
+
// TODO: Fill in the output data path. The default value is "Amphion/data"
|
60 |
+
"processed_dir": "data",
|
61 |
+
...
|
62 |
+
},
|
63 |
+
```
|
64 |
+
|
65 |
+
### Run
|
66 |
+
|
67 |
+
Run the `run.sh` as the preprocess stage (set `--stage 1`).
|
68 |
+
|
69 |
+
```bash
|
70 |
+
sh egs/svc/TransformerSVC/run.sh --stage 1
|
71 |
+
```
|
72 |
+
|
73 |
+
> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
|
74 |
+
|
75 |
+
## 3. Training
|
76 |
+
|
77 |
+
### Configuration
|
78 |
+
Specify the detailed configuration for transformer block in `exp_config.json`. For key `type`, `conformer` and `transformer` are supported:
|
79 |
+
```json
|
80 |
+
"model": {
|
81 |
+
...
|
82 |
+
"transformer":{
|
83 |
+
// 'conformer' or 'transformer'
|
84 |
+
"type": "conformer",
|
85 |
+
"input_dim": 384,
|
86 |
+
"output_dim": 100,
|
87 |
+
"n_heads": 2,
|
88 |
+
"n_layers": 6,
|
89 |
+
"filter_channels":512,
|
90 |
+
"dropout":0.1,
|
91 |
+
}
|
92 |
+
}
|
93 |
+
```
|
94 |
+
We provide the default hyperparameters in the `exp_config.json`. They can work on a single NVIDIA-24g GPU. You can adjust them based on your GPU machines.
|
95 |
+
|
96 |
+
```json
|
97 |
+
"train": {
|
98 |
+
"batch_size": 32,
|
99 |
+
...
|
100 |
+
"adamw": {
|
101 |
+
"lr": 2.0e-4
|
102 |
+
},
|
103 |
+
...
|
104 |
+
}
|
105 |
+
```
|
106 |
+
|
107 |
+
### Run
|
108 |
+
|
109 |
+
Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
|
110 |
+
|
111 |
+
```bash
|
112 |
+
sh egs/svc/TransformerSVC/run.sh --stage 2 --name [YourExptName]
|
113 |
+
```
|
114 |
+
|
115 |
+
> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
|
116 |
+
|
117 |
+
## 4. Inference/Conversion
|
118 |
+
|
119 |
+
### Pretrained Vocoder Download
|
120 |
+
|
121 |
+
We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
|
122 |
+
|
123 |
+
### Run
|
124 |
+
|
125 |
+
For inference/conversion, you need to specify the following configurations when running `run.sh`:
|
126 |
+
|
127 |
+
| Parameters | Description | Example |
|
128 |
+
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
129 |
+
| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
|
130 |
+
| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
|
131 |
+
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
|
132 |
+
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
|
133 |
+
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
|
134 |
+
|
135 |
+
For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
|
136 |
+
|
137 |
+
```bash
|
138 |
+
cd Amphion
|
139 |
+
sh egs/svc/TransformerSVC/run.sh --stage 3 --gpu "0" \
|
140 |
+
--infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
|
141 |
+
--infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
|
142 |
+
--infer_source_audio_dir [Your Audios Folder] \
|
143 |
+
--infer_target_speaker "opencpop_female1" \
|
144 |
+
--infer_key_shift "autoshift"
|
145 |
+
```
|
146 |
+
|
147 |
+
## Citations
|
148 |
+
|
149 |
+
```bibtex
|
150 |
+
@inproceedings{transformer,
|
151 |
+
author = {Ashish Vaswani and
|
152 |
+
Noam Shazeer and
|
153 |
+
Niki Parmar and
|
154 |
+
Jakob Uszkoreit and
|
155 |
+
Llion Jones and
|
156 |
+
Aidan N. Gomez and
|
157 |
+
Lukasz Kaiser and
|
158 |
+
Illia Polosukhin},
|
159 |
+
title = {Attention is All you Need},
|
160 |
+
booktitle = {{NIPS}},
|
161 |
+
pages = {5998--6008},
|
162 |
+
year = {2017}
|
163 |
+
}
|
164 |
+
```
|
egs/svc/TransformerSVC/exp_config.json
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/transformer.json",
|
3 |
+
"model_type": "TransformerSVC",
|
4 |
+
"dataset": [
|
5 |
+
"m4singer",
|
6 |
+
"opencpop",
|
7 |
+
"opensinger",
|
8 |
+
"svcc",
|
9 |
+
"vctk"
|
10 |
+
],
|
11 |
+
"dataset_path": {
|
12 |
+
// TODO: Fill in your dataset path
|
13 |
+
"m4singer": "[M4Singer dataset path]",
|
14 |
+
"opencpop": "[Opencpop dataset path]",
|
15 |
+
"opensinger": "[OpenSinger dataset path]",
|
16 |
+
"svcc": "[SVCC dataset path]",
|
17 |
+
"vctk": "[VCTK dataset path]"
|
18 |
+
},
|
19 |
+
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
|
20 |
+
"log_dir": "ckpts/svc",
|
21 |
+
"preprocess": {
|
22 |
+
// TODO: Fill in the output data path. The default value is "Amphion/data"
|
23 |
+
"processed_dir": "data",
|
24 |
+
// Config for features extraction
|
25 |
+
"extract_mel": true,
|
26 |
+
"extract_pitch": true,
|
27 |
+
"extract_energy": true,
|
28 |
+
"extract_whisper_feature": true,
|
29 |
+
"extract_contentvec_feature": true,
|
30 |
+
"extract_wenet_feature": false,
|
31 |
+
"whisper_batch_size": 30, // decrease it if your GPU is out of memory
|
32 |
+
"contentvec_batch_size": 1,
|
33 |
+
// Fill in the content-based pretrained model's path
|
34 |
+
"contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
|
35 |
+
"wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
|
36 |
+
"wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
|
37 |
+
"whisper_model": "medium",
|
38 |
+
"whisper_model_path": "pretrained/whisper/medium.pt",
|
39 |
+
// Config for features usage
|
40 |
+
"use_mel": true,
|
41 |
+
"use_min_max_norm_mel": true,
|
42 |
+
"use_frame_pitch": true,
|
43 |
+
"use_frame_energy": true,
|
44 |
+
"use_spkid": true,
|
45 |
+
"use_whisper": true,
|
46 |
+
"use_contentvec": true,
|
47 |
+
"use_wenet": false,
|
48 |
+
"n_mel": 100,
|
49 |
+
"sample_rate": 24000
|
50 |
+
},
|
51 |
+
"model": {
|
52 |
+
"condition_encoder": {
|
53 |
+
// Config for features usage
|
54 |
+
"use_whisper": true,
|
55 |
+
"use_contentvec": true,
|
56 |
+
"use_wenet": false,
|
57 |
+
"whisper_dim": 1024,
|
58 |
+
"contentvec_dim": 256,
|
59 |
+
"wenet_dim": 512,
|
60 |
+
"use_singer_encoder": false,
|
61 |
+
"pitch_min": 50,
|
62 |
+
"pitch_max": 1100
|
63 |
+
},
|
64 |
+
"transformer": {
|
65 |
+
// 'conformer' or 'transformer'
|
66 |
+
"type": "conformer",
|
67 |
+
"input_dim": 384,
|
68 |
+
"output_dim": 100,
|
69 |
+
"n_heads": 2,
|
70 |
+
"n_layers": 6,
|
71 |
+
"filter_channels": 512,
|
72 |
+
"dropout": 0.1,
|
73 |
+
}
|
74 |
+
},
|
75 |
+
"train": {
|
76 |
+
"batch_size": 64,
|
77 |
+
"gradient_accumulation_step": 1,
|
78 |
+
"max_epoch": -1, // -1 means no limit
|
79 |
+
"save_checkpoint_stride": [
|
80 |
+
50,
|
81 |
+
50
|
82 |
+
],
|
83 |
+
"keep_last": [
|
84 |
+
5,
|
85 |
+
-1
|
86 |
+
],
|
87 |
+
"run_eval": [
|
88 |
+
false,
|
89 |
+
true
|
90 |
+
],
|
91 |
+
"adamw": {
|
92 |
+
"lr": 4.0e-4
|
93 |
+
},
|
94 |
+
"reducelronplateau": {
|
95 |
+
"factor": 0.8,
|
96 |
+
"patience": 10,
|
97 |
+
"min_lr": 1.0e-4
|
98 |
+
},
|
99 |
+
"dataloader": {
|
100 |
+
"num_worker": 8,
|
101 |
+
"pin_memory": true
|
102 |
+
},
|
103 |
+
"sampler": {
|
104 |
+
"holistic_shuffle": false,
|
105 |
+
"drop_last": true
|
106 |
+
}
|
107 |
+
}
|
108 |
+
}
|
egs/svc/TransformerSVC/run.sh
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
../_template/run.sh
|
egs/svc/VitsSVC/README.md
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# VITS for Singing Voice Conversion
|
2 |
+
|
3 |
+
This is an implementation of VITS as acoustic model for end-to-end singing voice conversion. Adapted from [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc), SoftVC content encoder is used to extract content features from the source audio. These feature vectors are directly fed into VITS without the need for conversion to a text-based intermediate representation.
|
4 |
+
|
5 |
+
There are four stages in total:
|
6 |
+
|
7 |
+
1. Data preparation
|
8 |
+
2. Features extraction
|
9 |
+
3. Training
|
10 |
+
4. Inference/conversion
|
11 |
+
|
12 |
+
> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
|
13 |
+
> ```bash
|
14 |
+
> cd Amphion
|
15 |
+
> ```
|
16 |
+
|
17 |
+
## 1. Data Preparation
|
18 |
+
|
19 |
+
### Dataset Download
|
20 |
+
|
21 |
+
By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
|
22 |
+
|
23 |
+
### Configuration
|
24 |
+
|
25 |
+
Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
|
26 |
+
|
27 |
+
```json
|
28 |
+
"dataset": [
|
29 |
+
"m4singer",
|
30 |
+
"opencpop",
|
31 |
+
"opensinger",
|
32 |
+
"svcc",
|
33 |
+
"vctk"
|
34 |
+
],
|
35 |
+
"dataset_path": {
|
36 |
+
// TODO: Fill in your dataset path
|
37 |
+
"m4singer": "[M4Singer dataset path]",
|
38 |
+
"opencpop": "[Opencpop dataset path]",
|
39 |
+
"opensinger": "[OpenSinger dataset path]",
|
40 |
+
"svcc": "[SVCC dataset path]",
|
41 |
+
"vctk": "[VCTK dataset path]"
|
42 |
+
},
|
43 |
+
```
|
44 |
+
|
45 |
+
## 2. Features Extraction
|
46 |
+
|
47 |
+
### Content-based Pretrained Models Download
|
48 |
+
|
49 |
+
By default, we utilize ContentVec and Whisper to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
|
50 |
+
|
51 |
+
### Configuration
|
52 |
+
|
53 |
+
Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
|
54 |
+
|
55 |
+
```json
|
56 |
+
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
|
57 |
+
"log_dir": "ckpts/svc",
|
58 |
+
"preprocess": {
|
59 |
+
// TODO: Fill in the output data path. The default value is "Amphion/data"
|
60 |
+
"processed_dir": "data",
|
61 |
+
...
|
62 |
+
},
|
63 |
+
```
|
64 |
+
|
65 |
+
### Run
|
66 |
+
|
67 |
+
Run the `run.sh` as the preprocess stage (set `--stage 1`).
|
68 |
+
|
69 |
+
```bash
|
70 |
+
sh egs/svc/VitsSVC/run.sh --stage 1
|
71 |
+
```
|
72 |
+
|
73 |
+
> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
|
74 |
+
|
75 |
+
## 3. Training
|
76 |
+
|
77 |
+
### Configuration
|
78 |
+
|
79 |
+
We provide the default hyperparameters in the `exp_config.json`. They can work on a single NVIDIA 24 GB GPU. You can adjust them based on your GPU machines.
|
80 |
+
|
81 |
+
```json
|
82 |
+
"train": {
|
83 |
+
"batch_size": 32,
|
84 |
+
...
|
85 |
+
"adamw": {
|
86 |
+
"lr": 2.0e-4
|
87 |
+
},
|
88 |
+
...
|
89 |
+
}
|
90 |
+
```
|
91 |
+
|
92 |
+
### Run
|
93 |
+
|
94 |
+
Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
|
95 |
+
|
96 |
+
```bash
|
97 |
+
sh egs/svc/VitsSVC/run.sh --stage 2 --name [YourExptName]
|
98 |
+
```
|
99 |
+
|
100 |
+
> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
|
101 |
+
|
102 |
+
## 4. Inference/Conversion
|
103 |
+
|
104 |
+
### Run
|
105 |
+
|
106 |
+
For inference/conversion, you need to specify the following configurations when running `run.sh`:
|
107 |
+
|
108 |
+
| Parameters | Description | Example |
|
109 |
+
| --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
110 |
+
| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
|
111 |
+
| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
|
112 |
+
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
|
113 |
+
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
|
114 |
+
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
|
115 |
+
|
116 |
+
For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
|
117 |
+
|
118 |
+
```bash
|
119 |
+
sh egs/svc/VitsSVC/run.sh --stage 3 --gpu "0" \
|
120 |
+
--infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
|
121 |
+
--infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
|
122 |
+
--infer_source_audio_dir [Your Audios Folder] \
|
123 |
+
--infer_target_speaker "opencpop_female1" \
|
124 |
+
--infer_key_shift "autoshift"
|
125 |
+
```
|
egs/svc/VitsSVC/exp_config.json
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/vitssvc.json",
|
3 |
+
"model_type": "VitsSVC",
|
4 |
+
"dataset": [
|
5 |
+
"m4singer",
|
6 |
+
"opencpop",
|
7 |
+
"opensinger",
|
8 |
+
"svcc",
|
9 |
+
"vctk"
|
10 |
+
],
|
11 |
+
"dataset_path": {
|
12 |
+
// TODO: Fill in your dataset path
|
13 |
+
"m4singer": "[M4Singer dataset path]",
|
14 |
+
"opencpop": "[Opencpop dataset path]",
|
15 |
+
"opensinger": "[OpenSinger dataset path]",
|
16 |
+
"svcc": "[SVCC dataset path]",
|
17 |
+
"vctk": "[VCTK dataset path]"
|
18 |
+
},
|
19 |
+
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
|
20 |
+
"log_dir": "ckpts/svc",
|
21 |
+
"preprocess": {
|
22 |
+
// TODO: Fill in the output data path. The default value is "Amphion/data"
|
23 |
+
"processed_dir": "data",
|
24 |
+
|
25 |
+
"f0_min": 50,
|
26 |
+
"f0_max": 1100,
|
27 |
+
// f0_bin in sovits
|
28 |
+
"pitch_bin": 256,
|
29 |
+
// filter_length in sovits
|
30 |
+
"n_fft": 2048,
|
31 |
+
// hop_length in sovits
|
32 |
+
"hop_size": 512,
|
33 |
+
// win_length in sovits
|
34 |
+
"win_size": 2048,
|
35 |
+
"segment_size": 8192,
|
36 |
+
"n_mel": 100,
|
37 |
+
"sample_rate": 44100,
|
38 |
+
|
39 |
+
// Config for features extraction
|
40 |
+
"extract_mel": true,
|
41 |
+
"extract_pitch": true,
|
42 |
+
"pitch_extractor": "parselmouth",
|
43 |
+
"extract_energy": false,
|
44 |
+
"extract_uv": true,
|
45 |
+
"extract_linear_spec": true,
|
46 |
+
"extract_audio": true,
|
47 |
+
// contentvec
|
48 |
+
"extract_contentvec_feature": true,
|
49 |
+
"contentvec_sample_rate": 16000,
|
50 |
+
"contentvec_batch_size": 1,
|
51 |
+
"contentvec_frameshift": 0.02,
|
52 |
+
// whisper
|
53 |
+
"extract_whisper_feature": true,
|
54 |
+
"whisper_sample_rate": 16000,
|
55 |
+
"whisper_frameshift": 0.01,
|
56 |
+
"whisper_downsample_rate": 2,
|
57 |
+
// Fill in the content-based pretrained model's path
|
58 |
+
"contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
|
59 |
+
"wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
|
60 |
+
"wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
|
61 |
+
"whisper_model": "medium",
|
62 |
+
"whisper_model_path": "pretrained/whisper/medium.pt",
|
63 |
+
// Config for features usage
|
64 |
+
"use_mel": true,
|
65 |
+
"use_frame_pitch": true,
|
66 |
+
"use_uv": true,
|
67 |
+
"use_spkid": true,
|
68 |
+
"use_contentvec": true,
|
69 |
+
"use_whisper": true,
|
70 |
+
"use_text": false,
|
71 |
+
"use_phone": false,
|
72 |
+
|
73 |
+
// Extract content features using dataloader
|
74 |
+
"pin_memory": true,
|
75 |
+
"num_workers": 8,
|
76 |
+
"content_feature_batch_size": 16,
|
77 |
+
// Meta file
|
78 |
+
"train_file": "train.json",
|
79 |
+
"valid_file": "test.json",
|
80 |
+
"spk2id": "singers.json",
|
81 |
+
"utt2spk": "utt2singer"
|
82 |
+
},
|
83 |
+
"model": {
|
84 |
+
"condition_encoder": {
|
85 |
+
// Config for features usage
|
86 |
+
"merge_mode": "add",
|
87 |
+
"input_melody_dim": 1,
|
88 |
+
"use_log_f0": true,
|
89 |
+
"n_bins_melody": 256,
|
90 |
+
//# Quantization (0 for not quantization)
|
91 |
+
"output_melody_dim": 192,
|
92 |
+
|
93 |
+
"use_contentvec": true,
|
94 |
+
"use_whisper": true,
|
95 |
+
"use_mert": false,
|
96 |
+
"use_wenet": false,
|
97 |
+
"whisper_dim": 1024,
|
98 |
+
"contentvec_dim": 256,
|
99 |
+
"content_encoder_dim": 192,
|
100 |
+
"output_singer_dim": 192,
|
101 |
+
"singer_table_size": 512,
|
102 |
+
"output_content_dim": 192,
|
103 |
+
"use_spkid": true,
|
104 |
+
|
105 |
+
"pitch_max": 1100.0,
|
106 |
+
"pitch_min": 50.0,
|
107 |
+
},
|
108 |
+
"vits": {
|
109 |
+
"inter_channels": 192,
|
110 |
+
"hidden_channels": 192,
|
111 |
+
"filter_channels": 256,
|
112 |
+
"n_heads": 2,
|
113 |
+
"n_layers": 6,
|
114 |
+
"kernel_size": 3,
|
115 |
+
"p_dropout": 0.1,
|
116 |
+
"ssl_dim": 256,
|
117 |
+
"n_flow_layer": 4,
|
118 |
+
"n_layers_q": 3,
|
119 |
+
"gin_channels": 256,
|
120 |
+
"n_speakers": 512,
|
121 |
+
"use_spectral_norm": false,
|
122 |
+
},
|
123 |
+
"generator": "nsfhifigan",
|
124 |
+
},
|
125 |
+
"train": {
|
126 |
+
"batch_size": 32,
|
127 |
+
"learning_rate": 2e-4,
|
128 |
+
"gradient_accumulation_step": 1,
|
129 |
+
"max_epoch": -1, // -1 means no limit
|
130 |
+
"save_checkpoint_stride": [
|
131 |
+
3,
|
132 |
+
50
|
133 |
+
],
|
134 |
+
"keep_last": [
|
135 |
+
3,
|
136 |
+
2
|
137 |
+
],
|
138 |
+
"run_eval": [
|
139 |
+
true,
|
140 |
+
true
|
141 |
+
],
|
142 |
+
"adamw": {
|
143 |
+
"lr": 2.0e-4
|
144 |
+
},
|
145 |
+
"reducelronplateau": {
|
146 |
+
"factor": 0.8,
|
147 |
+
"patience": 30,
|
148 |
+
"min_lr": 1.0e-4
|
149 |
+
},
|
150 |
+
"dataloader": {
|
151 |
+
"num_worker": 8,
|
152 |
+
"pin_memory": true
|
153 |
+
},
|
154 |
+
"sampler": {
|
155 |
+
"holistic_shuffle": false,
|
156 |
+
"drop_last": true
|
157 |
+
}
|
158 |
+
},
|
159 |
+
"inference": {
|
160 |
+
"batch_size": 1,
|
161 |
+
}
|
162 |
+
}
|
egs/svc/VitsSVC/run.sh
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
../_template/run.sh
|
egs/svc/_template/run.sh
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
######## Build Experiment Environment ###########
|
7 |
+
exp_dir=$(cd `dirname $0`; pwd)
|
8 |
+
work_dir=$(dirname $(dirname $(dirname $exp_dir)))
|
9 |
+
|
10 |
+
export WORK_DIR=$work_dir
|
11 |
+
export PYTHONPATH=$work_dir
|
12 |
+
export PYTHONIOENCODING=UTF-8
|
13 |
+
|
14 |
+
######## Parse the Given Parameters from the Command ###########
|
15 |
+
options=$(getopt -o c:n:s: --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@") # every short option (-c, -n, -s) takes a value, matching its long form
|
16 |
+
eval set -- "$options"
|
17 |
+
|
18 |
+
while true; do
|
19 |
+
case $1 in
|
20 |
+
# Experimental Configuration File
|
21 |
+
-c | --config) shift; exp_config=$1 ; shift ;;
|
22 |
+
# Experimental Name
|
23 |
+
-n | --name) shift; exp_name=$1 ; shift ;;
|
24 |
+
# Running Stage
|
25 |
+
-s | --stage) shift; running_stage=$1 ; shift ;;
|
26 |
+
# Visible GPU machines. The default value is "0".
|
27 |
+
--gpu) shift; gpu=$1 ; shift ;;
|
28 |
+
|
29 |
+
# [Only for Training] Resume configuration
|
30 |
+
--resume) shift; resume=$1 ; shift ;;
|
31 |
+
# [Only for Training] The specific checkpoint path that you want to resume from.
|
32 |
+
--resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
|
33 |
+
# [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
|
34 |
+
--resume_type) shift; resume_type=$1 ; shift ;;
|
35 |
+
|
36 |
+
# [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
|
37 |
+
--infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
|
38 |
+
# [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
|
39 |
+
--infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
|
40 |
+
# [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
|
41 |
+
--infer_source_file) shift; infer_source_file=$1 ; shift ;;
|
42 |
+
--infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
|
43 |
+
# [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1".
|
44 |
+
--infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
|
45 |
+
# [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift".
|
46 |
+
--infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
|
47 |
+
# [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
|
48 |
+
--infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;
|
49 |
+
|
50 |
+
--) shift ; break ;;
|
51 |
+
*) echo "Invalid option: $1"; exit 1 ;; # abort on unknown options instead of looping forever
|
52 |
+
esac
|
53 |
+
done
|
54 |
+
|
55 |
+
|
56 |
+
### Value check ###
|
57 |
+
if [ -z "$running_stage" ]; then
|
58 |
+
echo "[Error] Please specify the running stage"
|
59 |
+
exit 1
|
60 |
+
fi
|
61 |
+
|
62 |
+
if [ -z "$exp_config" ]; then
|
63 |
+
exp_config="${exp_dir}"/exp_config.json
|
64 |
+
fi
|
65 |
+
echo "Exprimental Configuration File: $exp_config"
|
66 |
+
|
67 |
+
if [ -z "$gpu" ]; then
|
68 |
+
gpu="0"
|
69 |
+
fi
|
70 |
+
|
71 |
+
######## Features Extraction ###########
|
72 |
+
if [ $running_stage -eq 1 ]; then
|
73 |
+
CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
|
74 |
+
--config $exp_config \
|
75 |
+
--num_workers 4
|
76 |
+
fi
|
77 |
+
|
78 |
+
######## Training ###########
|
79 |
+
if [ $running_stage -eq 2 ]; then
|
80 |
+
if [ -z "$exp_name" ]; then
|
81 |
+
echo "[Error] Please specify the experiments name"
|
82 |
+
exit 1
|
83 |
+
fi
|
84 |
+
echo "Exprimental Name: $exp_name"
|
85 |
+
|
86 |
+
if [ "$resume" = true ]; then
|
87 |
+
echo "Automatically resume from the experimental dir..."
|
88 |
+
CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
|
89 |
+
--config "$exp_config" \
|
90 |
+
--exp_name "$exp_name" \
|
91 |
+
--log_level info \
|
92 |
+
--resume
|
93 |
+
else
|
94 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \
|
95 |
+
--config "$exp_config" \
|
96 |
+
--exp_name "$exp_name" \
|
97 |
+
--log_level info \
|
98 |
+
--resume_from_ckpt_path "$resume_from_ckpt_path" \
|
99 |
+
--resume_type "$resume_type"
|
100 |
+
fi
|
101 |
+
fi
|
102 |
+
|
103 |
+
######## Inference/Conversion ###########
|
104 |
+
if [ $running_stage -eq 3 ]; then
|
105 |
+
if [ -z "$infer_expt_dir" ]; then
|
106 |
+
echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
|
107 |
+
exit 1
|
108 |
+
fi
|
109 |
+
|
110 |
+
if [ -z "$infer_output_dir" ]; then
|
111 |
+
infer_output_dir="$infer_expt_dir/result" # default: "result" folder inside the experiment dir
|
112 |
+
fi
|
113 |
+
|
114 |
+
if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
|
115 |
+
echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
|
116 |
+
exit 1
|
117 |
+
fi
|
118 |
+
|
119 |
+
if [ -z "$infer_source_file" ]; then
|
120 |
+
infer_source=$infer_source_audio_dir
|
121 |
+
fi
|
122 |
+
|
123 |
+
if [ -z "$infer_source_audio_dir" ]; then
|
124 |
+
infer_source=$infer_source_file
|
125 |
+
fi
|
126 |
+
|
127 |
+
if [ -z "$infer_target_speaker" ]; then
|
128 |
+
echo "[Error] Please specify the target speaker. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1""
|
129 |
+
exit 1
|
130 |
+
fi
|
131 |
+
|
132 |
+
if [ -z "$infer_key_shift" ]; then
|
133 |
+
infer_key_shift="autoshift"
|
134 |
+
fi
|
135 |
+
|
136 |
+
if [ -z "$infer_vocoder_dir" ]; then
|
137 |
+
infer_vocoder_dir="$work_dir"/pretrained/bigvgan
|
138 |
+
echo "[Warning] You don't specify the infer_vocoder_dir. It is set $infer_vocoder_dir by default. Make sure that you have followed Amphoion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
|
139 |
+
fi
|
140 |
+
|
141 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
|
142 |
+
--config $exp_config \
|
143 |
+
--acoustics_dir $infer_expt_dir \
|
144 |
+
--vocoder_dir $infer_vocoder_dir \
|
145 |
+
--target_singer $infer_target_speaker \
|
146 |
+
--trans_key $infer_key_shift \
|
147 |
+
--source $infer_source \
|
148 |
+
--output_dir $infer_output_dir \
|
149 |
+
--log_level debug
|
150 |
+
fi
|
egs/tta/README.md
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Amphion Text-to-Audio (TTA) Recipe
|
2 |
+
|
3 |
+
## Quick Start
|
4 |
+
|
5 |
+
We provide a **[beginner recipe](RECIPE.md)** to demonstrate how to train a cutting edge TTA model. Specifically, it is designed as a latent diffusion model like [AudioLDM](https://arxiv.org/abs/2301.12503), [Make-an-Audio](https://arxiv.org/abs/2301.12661), and [AUDIT](https://arxiv.org/abs/2304.00830).
|
6 |
+
|
7 |
+
## Supported Model Architectures
|
8 |
+
|
9 |
+
Until now, Amphion has supported a latent diffusion based text-to-audio model:
|
10 |
+
|
11 |
+
<br>
|
12 |
+
<div align="center">
|
13 |
+
<img src="../../imgs/tta/DiffusionTTA.png" width="65%">
|
14 |
+
</div>
|
15 |
+
<br>
|
16 |
+
|
17 |
+
Similar to [AUDIT](https://arxiv.org/abs/2304.00830), we implement it in two-stage training:
|
18 |
+
1. Training the VAE which is called `AutoencoderKL` in Amphion.
|
19 |
+
2. Training the conditional latent diffusion model which is called `AudioLDM` in Amphion.
|
egs/tta/RECIPE.md
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Text-to-Audio with Latent Diffusion Model
|
2 |
+
|
3 |
+
This is the quick tour for training a text-to-audio model with the popular and powerful generative model: [Latent Diffusion Model](https://arxiv.org/abs/2112.10752). Specifically, this recipe is also the official implementation of the text-to-audio generation part of our NeurIPS 2023 paper "[AUDIT: Audio Editing by Following Instructions with Latent Diffusion Models](https://arxiv.org/abs/2304.00830)". You can check the last part of [AUDIT demos](https://audit-demo.github.io/) to see some text-to-audio examples.
|
4 |
+
|
5 |
+
<br>
|
6 |
+
<div align="center">
|
7 |
+
<img src="../../imgs/tta/DiffusionTTA.png" width="65%">
|
8 |
+
</div>
|
9 |
+
<br>
|
10 |
+
|
11 |
+
We train this latent diffusion model in two stages:
|
12 |
+
1. In the first stage, we aim to obtain a high-quality VAE (called `AutoencoderKL` in Amphion), so that we can project
|
13 |
+
the input mel-spectrograms to an efficient, low-dimensional latent space. Specifically, we train the VAE with GAN loss to improve the reconstruction quality.
|
14 |
+
2. In the second stage, we aim to obtain a text-controllable diffusion model (called `AudioLDM` in Amphion). We use a U-Net-based diffusion model, with a T5 encoder as the text encoder.
|
15 |
+
|
16 |
+
There are four stages in total for training the text-to-audio model:
|
17 |
+
|
18 |
+
1. Data preparation and processing
|
19 |
+
2. Train the VAE model
|
20 |
+
3. Train the latent diffusion model
|
21 |
+
4. Inference
|
22 |
+
|
23 |
+
> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
|
24 |
+
> ```bash
|
25 |
+
> cd Amphion
|
26 |
+
> ```
|
27 |
+
|
28 |
+
## Overview
|
29 |
+
|
30 |
+
```sh
|
31 |
+
# Train the VAE model
|
32 |
+
sh egs/tta/autoencoderkl/run_train.sh
|
33 |
+
|
34 |
+
# Train the latent diffusion model
|
35 |
+
sh egs/tta/audioldm/run_train.sh
|
36 |
+
|
37 |
+
# Inference
|
38 |
+
sh egs/tta/audioldm/run_inference.sh
|
39 |
+
```
|
40 |
+
|
41 |
+
## 1. Data preparation and processing
|
42 |
+
|
43 |
+
### Dataset Download
|
44 |
+
|
45 |
+
We take [AudioCaps](https://audiocaps.github.io/) as an example. AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information. You can download the dataset [here](https://github.com/cdjkim/audiocaps).
|
46 |
+
|
47 |
+
<!-- How to download AudioCaps is detailed [here](../datasets/README.md) -->
|
48 |
+
<!-- You can downlaod the dataset [here](https://github.com/cdjkim/audiocaps). -->
|
49 |
+
|
50 |
+
### Data Processing
|
51 |
+
|
52 |
+
- Download AudioCaps dataset to `[Your path to save tta dataset]` and modify `preprocess.processed_dir` in `egs/tta/.../exp_config.json`.
|
53 |
+
|
54 |
+
```json
|
55 |
+
{
|
56 |
+
"dataset": [
|
57 |
+
"AudioCaps"
|
58 |
+
],
|
59 |
+
"preprocess": {
|
60 |
+
// Specify the output root path to save the processed data
|
61 |
+
"processed_dir": "[Your path to save tta dataset]",
|
62 |
+
...
|
63 |
+
}
|
64 |
+
}
|
65 |
+
```
|
66 |
+
|
67 |
+
The folder structure of your downloaded data should be similar to:
|
68 |
+
|
69 |
+
```plaintext
|
70 |
+
.../[Your path to save tta dataset]
|
71 |
+
┣ AudioCaps
|
72 |
+
┃ ┣ wav
|
73 |
+
┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
|
74 |
+
┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
|
75 |
+
┃ ┃ ┣ ...
|
76 |
+
```
|
77 |
+
|
78 |
+
- Then you may process the data into mel-spectrograms and save them in `.npy` format. If you use the data we provide, we have processed all the wav data.
|
79 |
+
|
80 |
+
- Generate a JSON file to save the metadata. The JSON file looks like:
|
81 |
+
|
82 |
+
```json
|
83 |
+
[
|
84 |
+
{
|
85 |
+
"Dataset": "AudioCaps",
|
86 |
+
"Uid": "---1_cCGK4M_0_10000",
|
87 |
+
"Caption": "Idling car, train blows horn and passes"
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"Dataset": "AudioCaps",
|
91 |
+
"Uid": "---lTs1dxhU_30000_40000",
|
92 |
+
"Caption": "A racing vehicle engine is heard passing by"
|
93 |
+
},
|
94 |
+
...
|
95 |
+
]
|
96 |
+
```
|
97 |
+
- Finally, the folder structure is like:
|
98 |
+
|
99 |
+
```plaintext
|
100 |
+
.../[Your path to save tta dataset]
|
101 |
+
┣ AudioCaps
|
102 |
+
┃ ┣ wav
|
103 |
+
┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
|
104 |
+
┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
|
105 |
+
┃ ┃ ┣ ...
|
106 |
+
┃ ┣ mel
|
107 |
+
┃ ┃ ┣ ---1_cCGK4M_0_10000.npy
|
108 |
+
┃ ┃ ┣ ---lTs1dxhU_30000_40000.npy
|
109 |
+
┃ ┃ ┣ ...
|
110 |
+
┃ ┣ train.json
|
111 |
+
┃ ┣ valid.json
|
112 |
+
┃ ┣ ...
|
113 |
+
```
|
114 |
+
|
115 |
+
## 2. Training the VAE Model
|
116 |
+
|
117 |
+
The first stage model is a VAE model trained with GAN loss (called `AutoencoderKL` in Amphion). Run the following command:
|
118 |
+
|
119 |
+
```sh
|
120 |
+
sh egs/tta/autoencoderkl/run_train.sh
|
121 |
+
```
|
122 |
+
|
123 |
+
## 3. Training the Latent Diffusion Model
|
124 |
+
|
125 |
+
The second stage model is a conditional diffusion model with a T5 text encoder (called `AudioLDM` in Amphion). Run the following command:
|
126 |
+
|
127 |
+
```sh
|
128 |
+
sh egs/tta/audioldm/run_train.sh
|
129 |
+
```
|
130 |
+
|
131 |
+
## 4. Inference
|
132 |
+
|
133 |
+
Now you can generate audio with your pre-trained latent diffusion model. Run the following command and modify the `text` argument.
|
134 |
+
|
135 |
+
```sh
|
136 |
+
sh egs/tta/audioldm/run_inference.sh \
|
137 |
+
--text "A man is whistling"
|
138 |
+
```
|
139 |
+
|
140 |
+
## Citations
|
141 |
+
|
142 |
+
```bibtex
|
143 |
+
@article{wang2023audit,
|
144 |
+
title={AUDIT: Audio Editing by Following Instructions with Latent Diffusion Models},
|
145 |
+
author={Wang, Yuancheng and Ju, Zeqian and Tan, Xu and He, Lei and Wu, Zhizheng and Bian, Jiang and Zhao, Sheng},
|
146 |
+
journal={NeurIPS 2023},
|
147 |
+
year={2023}
|
148 |
+
}
|
149 |
+
|
150 |
+
@article{liu2023audioldm,
|
151 |
+
title={{AudioLDM}: Text-to-Audio Generation with Latent Diffusion Models},
|
152 |
+
author={Liu, Haohe and Chen, Zehua and Yuan, Yi and Mei, Xinhao and Liu, Xubo and Mandic, Danilo and Wang, Wenwu and Plumbley, Mark D},
|
153 |
+
journal={Proceedings of the International Conference on Machine Learning},
|
154 |
+
year={2023}
|
155 |
+
}
|
156 |
+
```
|
egs/tta/audioldm/exp_config.json
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "egs/tta/audioldm/exp_config_base.json",
|
3 |
+
"dataset": [
|
4 |
+
"AudioCaps"
|
5 |
+
],
|
6 |
+
"preprocess": {
|
7 |
+
// Specify the output root path to save the processed data
|
8 |
+
"processed_dir": "data",
|
9 |
+
// For example: "/home/TTADataset/processed_data"
|
10 |
+
|
11 |
+
// feature
|
12 |
+
"use_spkid": false,
|
13 |
+
"use_uv": false,
|
14 |
+
"use_frame_pitch": false,
|
15 |
+
"use_phone_pitch": false,
|
16 |
+
"use_frame_energy": false,
|
17 |
+
"use_phone_energy": false,
|
18 |
+
"use_mel": false,
|
19 |
+
"use_audio": false,
|
20 |
+
"use_label": false,
|
21 |
+
"use_one_hot": false,
|
22 |
+
// feature for text to audio
|
23 |
+
"use_caption": true,
|
24 |
+
"use_melspec": true,
|
25 |
+
"use_wav": false,
|
26 |
+
// feature dir
|
27 |
+
"melspec_dir": "mel",
|
28 |
+
"wav_dir": "wav"
|
29 |
+
},
|
30 |
+
// Specify the output root path to save model ckpts and logs
|
31 |
+
"log_dir": "ckpts/tta",
|
32 |
+
// For example: "/home/TTADataset/processed_data/logs"
|
33 |
+
|
34 |
+
// model
|
35 |
+
"model": {
|
36 |
+
"audioldm": {
|
37 |
+
"image_size": 32,
|
38 |
+
"in_channels": 4,
|
39 |
+
"out_channels": 4,
|
40 |
+
"model_channels": 256,
|
41 |
+
"attention_resolutions": [4, 2, 1],
|
42 |
+
"num_res_blocks": 2,
|
43 |
+
"channel_mult": [1, 2, 4],
|
44 |
+
"num_heads": 8,
|
45 |
+
"use_spatial_transformer": true,
|
46 |
+
"transformer_depth": 1,
|
47 |
+
"context_dim": 768,
|
48 |
+
"use_checkpoint": true,
|
49 |
+
"legacy": false
|
50 |
+
},
|
51 |
+
"autoencoderkl": {
|
52 |
+
"ch": 128,
|
53 |
+
"ch_mult": [1,1,2,2,4],
|
54 |
+
"num_res_blocks": 2,
|
55 |
+
"in_channels": 1,
|
56 |
+
"z_channels": 4,
|
57 |
+
"out_ch": 1,
|
58 |
+
"double_z": true
|
59 |
+
},
|
60 |
+
"noise_scheduler": {
|
61 |
+
"num_train_timesteps": 1000,
|
62 |
+
"beta_start": 0.00085,
|
63 |
+
"beta_end": 0.012,
|
64 |
+
"beta_schedule": "scaled_linear",
|
65 |
+
"clip_sample": false,
|
66 |
+
"steps_offset": 1,
|
67 |
+
"set_alpha_to_one": false,
|
68 |
+
"skip_prk_steps": true,
|
69 |
+
"prediction_type": "epsilon"
|
70 |
+
},
|
71 |
+
"autoencoder_path": "ckpts/tta/autoencoder_kl_debug/checkpoints/step-0445000_loss-0.3306.pt"
|
72 |
+
},
|
73 |
+
|
74 |
+
// train
|
75 |
+
"train": {
|
76 |
+
"adam": {
|
77 |
+
"lr": 5.0e-5
|
78 |
+
},
|
79 |
+
"ddp": false,
|
80 |
+
"random_seed": 12345,
|
81 |
+
"batch_size": 12,
|
82 |
+
"epochs": 50000,
|
83 |
+
"max_steps": 1000000,
|
84 |
+
"total_training_steps": 800000,
|
85 |
+
"save_summary_steps": 1000,
|
86 |
+
"save_checkpoints_steps": 5000,
|
87 |
+
"valid_interval": 5000,
|
88 |
+
"keep_checkpoint_max": 100
|
89 |
+
}
|
90 |
+
}
|
egs/tta/audioldm/exp_config_base.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/audioldm.json",
|
3 |
+
"model_type": "AudioLDM",
|
4 |
+
"dataset": [
|
5 |
+
"AudioCaps"
|
6 |
+
],
|
7 |
+
"preprocess": {
|
8 |
+
"train_file": "train.json",
|
9 |
+
"valid_file": "valid.json"
|
10 |
+
}
|
11 |
+
}
|
egs/tta/audioldm/exp_config_latent_4_10_78.json
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "egs/tta/audioldm/exp_config_base.json",
|
3 |
+
"dataset": [
|
4 |
+
"AudioCaps"
|
5 |
+
],
|
6 |
+
"preprocess": {
|
7 |
+
// Specify the output root path to save the processed data
|
8 |
+
"processed_dir": "data",
|
9 |
+
|
10 |
+
// feature
|
11 |
+
"use_spkid": false,
|
12 |
+
"use_uv": false,
|
13 |
+
"use_frame_pitch": false,
|
14 |
+
"use_phone_pitch": false,
|
15 |
+
"use_frame_energy": false,
|
16 |
+
"use_phone_energy": false,
|
17 |
+
"use_mel": false,
|
18 |
+
"use_audio": false,
|
19 |
+
"use_label": false,
|
20 |
+
"use_one_hot": false,
|
21 |
+
// feature for text to audio
|
22 |
+
"use_caption": true,
|
23 |
+
"use_melspec": true,
|
24 |
+
"use_wav": false,
|
25 |
+
// feature dir
|
26 |
+
"melspec_dir": "mel",
|
27 |
+
"wav_dir": "wav"
|
28 |
+
},
|
29 |
+
// Specify the output root path to save model ckpts and logs
|
30 |
+
"log_dir": "ckpts/tta",
|
31 |
+
|
32 |
+
// model
|
33 |
+
"model": {
|
34 |
+
"audioldm": {
|
35 |
+
"image_size": 32,
|
36 |
+
"in_channels": 4,
|
37 |
+
"out_channels": 4,
|
38 |
+
"model_channels": 256,
|
39 |
+
"attention_resolutions": [4, 2, 1],
|
40 |
+
"num_res_blocks": 2,
|
41 |
+
"channel_mult": [1, 2, 4],
|
42 |
+
"num_heads": 8,
|
43 |
+
"use_spatial_transformer": true,
|
44 |
+
"transformer_depth": 1,
|
45 |
+
"context_dim": 768,
|
46 |
+
"use_checkpoint": true,
|
47 |
+
"legacy": false
|
48 |
+
},
|
49 |
+
"autoencoderkl": {
|
50 |
+
"ch": 128,
|
51 |
+
"ch_mult": [1,2,2,4],
|
52 |
+
"num_res_blocks": 2,
|
53 |
+
"in_channels": 1,
|
54 |
+
"z_channels": 4,
|
55 |
+
"out_ch": 1,
|
56 |
+
"double_z": true
|
57 |
+
},
|
58 |
+
"noise_scheduler": {
|
59 |
+
"num_train_timesteps": 1000,
|
60 |
+
"beta_start": 0.00085,
|
61 |
+
"beta_end": 0.012,
|
62 |
+
"beta_schedule": "scaled_linear",
|
63 |
+
"clip_sample": false,
|
64 |
+
"steps_offset": 1,
|
65 |
+
"set_alpha_to_one": false,
|
66 |
+
"skip_prk_steps": true,
|
67 |
+
"prediction_type": "epsilon"
|
68 |
+
},
|
69 |
+
"autoencoder_path": "ckpts/tta/autoencoder_kl_debug_latent_size_4_10_78/checkpoints/step-0390000_loss-0.2876.pt"
|
70 |
+
},
|
71 |
+
|
72 |
+
// train
|
73 |
+
"train": {
|
74 |
+
"adam": {
|
75 |
+
"lr": 2.0e-5
|
76 |
+
},
|
77 |
+
"ddp": false,
|
78 |
+
"random_seed": 12345,
|
79 |
+
"batch_size": 12,
|
80 |
+
"epochs": 50000,
|
81 |
+
"max_steps": 1000000,
|
82 |
+
"total_training_steps": 800000,
|
83 |
+
"save_summary_steps": 1000,
|
84 |
+
"save_checkpoints_steps": 5000,
|
85 |
+
"valid_interval": 5000,
|
86 |
+
"keep_checkpoint_max": 100
|
87 |
+
}
|
88 |
+
}
|
egs/tta/audioldm/run_inference.sh
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
######## Build Experiment Environment ###########
|
7 |
+
exp_dir=$(cd `dirname $0`; pwd)
|
8 |
+
work_dir=$(dirname $(dirname $(dirname $exp_dir)))
|
9 |
+
|
10 |
+
export WORK_DIR=$work_dir
|
11 |
+
export PYTHONPATH=$work_dir
|
12 |
+
export PYTHONIOENCODING=UTF-8
|
13 |
+
|
14 |
+
######## Set Experiment Configuration ###########
|
15 |
+
exp_config="$exp_dir/exp_config.json"
|
16 |
+
exp_name="audioldm_debug_latent_size_4_5_39"
|
17 |
+
checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_5_39/checkpoints/step-0570000_loss-0.2521.pt"
|
18 |
+
output_dir="$work_dir/temp"
|
19 |
+
vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json"
|
20 |
+
vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000"
|
21 |
+
num_steps=200
|
22 |
+
guidance_scale=4.0
|
23 |
+
|
24 |
+
export CUDA_VISIBLE_DEVICES="0"
|
25 |
+
|
26 |
+
######## Parse Command Line Arguments ###########
|
27 |
+
while [[ $# -gt 0 ]]
|
28 |
+
do
|
29 |
+
key="$1"
|
30 |
+
|
31 |
+
case $key in
|
32 |
+
--text)
|
33 |
+
text="$2"
|
34 |
+
shift # past argument
|
35 |
+
shift # past value
|
36 |
+
;;
|
37 |
+
*) # unknown option
|
38 |
+
shift # past argument
|
39 |
+
;;
|
40 |
+
esac
|
41 |
+
done
|
42 |
+
|
43 |
+
######## Run inference ###########
|
44 |
+
python "${work_dir}"/bins/tta/inference.py \
|
45 |
+
--config=$exp_config \
|
46 |
+
--checkpoint_path=$checkpoint_path \
|
47 |
+
--text="$text" \
|
48 |
+
--vocoder_path=$vocoder_path \
|
49 |
+
--vocoder_config_path=$vocoder_config_path \
|
50 |
+
--num_steps=$num_steps \
|
51 |
+
--guidance_scale=$guidance_scale \
|
52 |
+
--output_dir=$output_dir
|
egs/tta/audioldm/run_inference_latent_4_10_78.sh
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
######## Build Experiment Environment ###########
|
7 |
+
exp_dir=$(cd `dirname $0`; pwd)
|
8 |
+
work_dir=$(dirname $(dirname $(dirname $exp_dir)))
|
9 |
+
|
10 |
+
export WORK_DIR=$work_dir
|
11 |
+
export PYTHONPATH=$work_dir
|
12 |
+
export PYTHONIOENCODING=UTF-8
|
13 |
+
|
14 |
+
######## Set Experiment Configuration ###########
|
15 |
+
exp_config="$exp_dir/exp_config_latent_4_10_78.json"  # same config file that run_train_latent_4_10_78.sh trains with
|
16 |
+
exp_name="audioldm_debug_latent_size_4_10_78"
|
17 |
+
checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_10_78/checkpoints/step-0325000_loss-0.1936.pt"
|
18 |
+
output_dir="$work_dir/temp"
|
19 |
+
vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json"
|
20 |
+
vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000"
|
21 |
+
num_steps=200
|
22 |
+
guidance_scale=4.0
|
23 |
+
|
24 |
+
export CUDA_VISIBLE_DEVICES="0"
|
25 |
+
|
26 |
+
######## Parse Command Line Arguments ###########
|
27 |
+
while [[ $# -gt 0 ]]
|
28 |
+
do
|
29 |
+
key="$1"
|
30 |
+
|
31 |
+
case $key in
|
32 |
+
--text)
|
33 |
+
text="$2"
|
34 |
+
shift # past argument
|
35 |
+
shift # past value
|
36 |
+
;;
|
37 |
+
*) # unknown option
|
38 |
+
shift # past argument
|
39 |
+
;;
|
40 |
+
esac
|
41 |
+
done
|
42 |
+
|
43 |
+
######## Run inference ###########
|
44 |
+
python "${work_dir}"/bins/tta/inference.py \
|
45 |
+
--config=$exp_config \
|
46 |
+
--checkpoint_path=$checkpoint_path \
|
47 |
+
--text="${text:-A man is whistling}" \
|
48 |
+
--vocoder_path=$vocoder_path \
|
49 |
+
--vocoder_config_path=$vocoder_config_path \
|
50 |
+
--num_steps=$num_steps \
|
51 |
+
--guidance_scale=$guidance_scale \
|
52 |
+
--output_dir=$output_dir
|
egs/tta/audioldm/run_train.sh
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
######## Build Experiment Environment ###########
|
7 |
+
exp_dir=$(cd `dirname $0`; pwd)
|
8 |
+
work_dir=$(dirname $(dirname $(dirname $exp_dir)))
|
9 |
+
|
10 |
+
export WORK_DIR=$work_dir
|
11 |
+
export PYTHONPATH=$work_dir
|
12 |
+
export PYTHONIOENCODING=UTF-8
|
13 |
+
|
14 |
+
######## Set Experiment Configuration ###########
|
15 |
+
exp_config="$exp_dir/exp_config.json"
|
16 |
+
exp_name="audioldm_debug_latent_size_4_5_39"
|
17 |
+
|
18 |
+
num_workers=8
|
19 |
+
export CUDA_VISIBLE_DEVICES="0"
|
20 |
+
|
21 |
+
######## Train Model ###########
|
22 |
+
python "${work_dir}"/bins/tta/train_tta.py \
|
23 |
+
--config=$exp_config \
|
24 |
+
--num_workers=$num_workers \
|
25 |
+
--exp_name=$exp_name \
|
26 |
+
--stdout_interval=25
|
egs/tta/audioldm/run_train_latent_4_10_78.sh
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
######## Build Experiment Environment ###########
|
7 |
+
exp_dir=$(cd `dirname $0`; pwd)
|
8 |
+
work_dir=$(dirname $(dirname $(dirname $exp_dir)))
|
9 |
+
|
10 |
+
export WORK_DIR=$work_dir
|
11 |
+
export PYTHONPATH=$work_dir
|
12 |
+
export PYTHONIOENCODING=UTF-8
|
13 |
+
|
14 |
+
######## Set Experiment Configuration ###########
|
15 |
+
exp_config="$exp_dir/exp_config_latent_4_10_78.json"
|
16 |
+
exp_name="audioldm_debug_latent_size_4_10_78"
|
17 |
+
|
18 |
+
num_workers=8
|
19 |
+
export CUDA_VISIBLE_DEVICES="0"
|
20 |
+
|
21 |
+
######## Train Model ###########
|
22 |
+
python "${work_dir}"/bins/tta/train_tta.py \
|
23 |
+
--config=$exp_config \
|
24 |
+
--num_workers=$num_workers \
|
25 |
+
--exp_name=$exp_name \
|
26 |
+
--stdout_interval=25
|
egs/tta/autoencoderkl/exp_config.json
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "egs/tta/autoencoderkl/exp_config_base.json",
|
3 |
+
"dataset": [
|
4 |
+
"AudioCaps"
|
5 |
+
],
|
6 |
+
"preprocess": {
|
7 |
+
// Specify the output root path to save the processed data
|
8 |
+
"processed_dir": "data",
|
9 |
+
|
10 |
+
// feature
|
11 |
+
"use_spk": false,
|
12 |
+
"use_spkid": false,
|
13 |
+
"use_uv": false,
|
14 |
+
"use_frame_pitch": false,
|
15 |
+
"use_phone_pitch": false,
|
16 |
+
"use_frame_energy": false,
|
17 |
+
"use_phone_energy": false,
|
18 |
+
"use_mel": false,
|
19 |
+
"use_audio": false,
|
20 |
+
"use_label": false,
|
21 |
+
"use_one_hot": false,
|
22 |
+
// feature for text to audio
|
23 |
+
"use_caption": true,
|
24 |
+
"use_melspec": true,
|
25 |
+
"use_wav": false,
|
26 |
+
// feature dir
|
27 |
+
"melspec_dir": "mel",
|
28 |
+
"wav_dir": "wav"
|
29 |
+
},
|
30 |
+
// Specify the output root path to save model ckpts and logs
|
31 |
+
"log_dir": "ckpts/tta",
|
32 |
+
|
33 |
+
// train
|
34 |
+
"train": {
|
35 |
+
"adam": {
|
36 |
+
"lr": 4.0e-5
|
37 |
+
},
|
38 |
+
"ddp": false,
|
39 |
+
"random_seed": 12345,
|
40 |
+
"batch_size": 12,
|
41 |
+
"epochs": 50000,
|
42 |
+
"max_steps": 1000000,
|
43 |
+
"total_training_steps": 800000,
|
44 |
+
"save_summary_steps": 1000,
|
45 |
+
"save_checkpoints_steps": 5000,
|
46 |
+
"valid_interval": 5000,
|
47 |
+
"keep_checkpoint_max": 100
|
48 |
+
}
|
49 |
+
}
|
egs/tta/autoencoderkl/exp_config_base.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/autoencoderkl.json",
|
3 |
+
"model_type": "AutoencoderKL",
|
4 |
+
"dataset": [
|
5 |
+
"AudioCaps"
|
6 |
+
],
|
7 |
+
"preprocess": {
|
8 |
+
"train_file": "train.json",
|
9 |
+
"valid_file": "vaild.json"
|
10 |
+
}
|
11 |
+
}
|
egs/tta/autoencoderkl/exp_config_latent_4_10_78.json
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "egs/tta/autoencoderkl/exp_config_base.json",
|
3 |
+
"dataset": [
|
4 |
+
"AudioCaps"
|
5 |
+
],
|
6 |
+
"preprocess": {
|
7 |
+
// Specify the output root path to save the processed data
|
8 |
+
"processed_dir": "data",
|
9 |
+
|
10 |
+
// feature
|
11 |
+
"use_spkid": false,
|
12 |
+
"use_uv": false,
|
13 |
+
"use_frame_pitch": false,
|
14 |
+
"use_phone_pitch": false,
|
15 |
+
"use_frame_energy": false,
|
16 |
+
"use_phone_energy": false,
|
17 |
+
"use_mel": false,
|
18 |
+
"use_audio": false,
|
19 |
+
"use_label": false,
|
20 |
+
"use_one_hot": false,
|
21 |
+
// feature for text to audio
|
22 |
+
"use_caption": true,
|
23 |
+
"use_melspec": true,
|
24 |
+
"use_wav": false,
|
25 |
+
// feature dir
|
26 |
+
"melspec_dir": "mel",
|
27 |
+
"wav_dir": "wav"
|
28 |
+
},
|
29 |
+
// Specify the output root path to save model ckpts and logs
|
30 |
+
"log_dir": "ckpts/tta",
|
31 |
+
|
32 |
+
"model": {
|
33 |
+
"autoencoderkl": {
|
34 |
+
"ch": 128,
|
35 |
+
"ch_mult": [1,2,2,4],
|
36 |
+
"num_res_blocks": 2,
|
37 |
+
"in_channels": 1,
|
38 |
+
"z_channels": 4,
|
39 |
+
"out_ch": 1,
|
40 |
+
"double_z": true
|
41 |
+
}
|
42 |
+
},
|
43 |
+
// train
|
44 |
+
"train": {
|
45 |
+
"adam": {
|
46 |
+
"lr": 4.0e-5
|
47 |
+
},
|
48 |
+
"ddp": false,
|
49 |
+
"random_seed": 12345,
|
50 |
+
"batch_size": 12,
|
51 |
+
"epochs": 50000,
|
52 |
+
"max_steps": 1000000,
|
53 |
+
"total_training_steps": 800000,
|
54 |
+
"save_summary_steps": 1000,
|
55 |
+
"save_checkpoints_steps": 5000,
|
56 |
+
"valid_interval": 5000,
|
57 |
+
"keep_checkpoint_max": 100
|
58 |
+
}
|
59 |
+
}
|
egs/tta/autoencoderkl/run_train.sh
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
######## Build Experiment Environment ###########
|
7 |
+
exp_dir=$(cd `dirname $0`; pwd)
|
8 |
+
work_dir=$(dirname $(dirname $(dirname $exp_dir)))
|
9 |
+
|
10 |
+
export WORK_DIR=$work_dir
|
11 |
+
export PYTHONPATH=$work_dir
|
12 |
+
export PYTHONIOENCODING=UTF-8
|
13 |
+
|
14 |
+
######## Set Experiment Configuration ###########
|
15 |
+
exp_config="$exp_dir/exp_config.json"
|
16 |
+
exp_name="autoencoder_kl_debug"
|
17 |
+
|
18 |
+
num_workers=8
|
19 |
+
export CUDA_VISIBLE_DEVICES="0"
|
20 |
+
|
21 |
+
######## Train Model ###########
|
22 |
+
python "${work_dir}"/bins/tta/train_tta.py \
|
23 |
+
--config=$exp_config \
|
24 |
+
--num_workers=$num_workers \
|
25 |
+
--exp_name=$exp_name \
|
26 |
+
--stdout_interval=25
|
egs/tta/autoencoderkl/run_train_latent_4_10_78.sh
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
######## Build Experiment Environment ###########
|
7 |
+
exp_dir=$(cd `dirname $0`; pwd)
|
8 |
+
work_dir=$(dirname $(dirname $(dirname $exp_dir)))
|
9 |
+
|
10 |
+
export WORK_DIR=$work_dir
|
11 |
+
export PYTHONPATH=$work_dir
|
12 |
+
export PYTHONIOENCODING=UTF-8
|
13 |
+
|
14 |
+
######## Set Experiment Configuration ###########
|
15 |
+
exp_config="$exp_dir/exp_config_latent_4_10_78.json"
|
16 |
+
exp_name="autoencoder_kl_debug_latent_size_4_10_78"
|
17 |
+
|
18 |
+
num_workers=8
|
19 |
+
export CUDA_VISIBLE_DEVICES="0"
|
20 |
+
|
21 |
+
######## Train Model ###########
|
22 |
+
python "${work_dir}"/bins/tta/train_tta.py \
|
23 |
+
--config=$exp_config \
|
24 |
+
--num_workers=$num_workers \
|
25 |
+
--exp_name=$exp_name \
|
26 |
+
--stdout_interval=25
|
egs/tts/FastSpeech2/README.md
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# FastSpeech2 Recipe
|
3 |
+
|
4 |
+
In this recipe, we will show how to train [FastSpeech2](https://openreview.net/forum?id=piLPYqxtWuA) using Amphion's infrastructure. FastSpeech2 is a non-autoregressive TTS architecture that utilizes feed-forward Transformer blocks.
|
5 |
+
|
6 |
+
There are four stages in total:
|
7 |
+
|
8 |
+
1. Data preparation
|
9 |
+
2. Features extraction
|
10 |
+
3. Training
|
11 |
+
4. Inference
|
12 |
+
|
13 |
+
> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
|
14 |
+
> ```bash
|
15 |
+
> cd Amphion
|
16 |
+
> ```
|
17 |
+
|
18 |
+
## 1. Data Preparation
|
19 |
+
|
20 |
+
### Dataset Download
|
21 |
+
You can use commonly used TTS datasets to train a TTS model, e.g., LJSpeech, VCTK, LibriTTS, etc. We strongly recommend using LJSpeech when training a TTS model for the first time. How to download the datasets is detailed [here](../../datasets/README.md).
|
22 |
+
|
23 |
+
### Configuration
|
24 |
+
|
25 |
+
After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
|
26 |
+
|
27 |
+
```json
|
28 |
+
"dataset": [
|
29 |
+
"LJSpeech",
|
30 |
+
],
|
31 |
+
"dataset_path": {
|
32 |
+
// TODO: Fill in your dataset path
|
33 |
+
"LJSpeech": "[LJSpeech dataset path]",
|
34 |
+
},
|
35 |
+
```
|
36 |
+
|
37 |
+
## 2. Features Extraction
|
38 |
+
|
39 |
+
### Configuration
|
40 |
+
|
41 |
+
Specify the `processed_dir` and the `log_dir` for saving the processed data and the checkpoints in `exp_config.json`:
|
42 |
+
|
43 |
+
```json
|
44 |
+
// TODO: Fill in the output log path
|
45 |
+
"log_dir": "ckpts/tts",
|
46 |
+
"preprocess": {
|
47 |
+
// TODO: Fill in the output data path
|
48 |
+
"processed_dir": "data",
|
49 |
+
...
|
50 |
+
},
|
51 |
+
```
|
52 |
+
|
53 |
+
### Run
|
54 |
+
|
55 |
+
Run the `run.sh` as the preprocess stage (set `--stage 1`):
|
56 |
+
|
57 |
+
```bash
|
58 |
+
sh egs/tts/FastSpeech2/run.sh --stage 1
|
59 |
+
```
|
60 |
+
|
61 |
+
## 3. Training
|
62 |
+
|
63 |
+
### Configuration
|
64 |
+
|
65 |
+
We provide the default hyperparameters in the `exp_config.json`. They can work on a single NVIDIA 24 GB GPU. You can adjust them based on your GPU machines.
|
66 |
+
|
67 |
+
```
|
68 |
+
"train": {
|
69 |
+
"batch_size": 16,
|
70 |
+
}
|
71 |
+
```
|
72 |
+
|
73 |
+
### Run
|
74 |
+
|
75 |
+
Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `ckpts/tts/[YourExptName]`.
|
76 |
+
|
77 |
+
```bash
|
78 |
+
sh egs/tts/FastSpeech2/run.sh --stage 2 --name [YourExptName]
|
79 |
+
```
|
80 |
+
|
81 |
+
> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, e.g., `--gpu "0,1,2,3"`.
|
82 |
+
|
83 |
+
|
84 |
+
## 4. Inference
|
85 |
+
|
86 |
+
### Configuration
|
87 |
+
|
88 |
+
For inference, you need to specify the following configurations when running `run.sh`:
|
89 |
+
|
90 |
+
|
91 |
+
| Parameters | Description | Example |
|
92 |
+
| --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
93 |
+
| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `ckpts/tts/[YourExptName]` |
|
94 |
+
| `--infer_output_dir` | The output directory to save inferred audios. | `ckpts/tts/[YourExptName]/result` |
|
95 |
+
| `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. |
|
96 |
+
| `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`. |
|
97 |
+
| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from test set as template testing set. |
|
98 |
+
| `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" |
|
99 |
+
|
100 |
+
### Run
|
101 |
+
For example, if you want to generate speech for the whole testing set split from LJSpeech, just run:
|
102 |
+
|
103 |
+
```bash
|
104 |
+
sh egs/tts/FastSpeech2/run.sh --stage 3 \
|
105 |
+
--infer_expt_dir ckpts/tts/[YourExptName] \
|
106 |
+
--infer_output_dir ckpts/tts/[YourExptName]/result \
|
107 |
+
--infer_mode "batch" \
|
108 |
+
--infer_dataset "LJSpeech" \
|
109 |
+
--infer_testing_set "test"
|
110 |
+
```
|
111 |
+
|
112 |
+
Or, if you want to generate a single clip of speech from a given text, just run:
|
113 |
+
|
114 |
+
```bash
|
115 |
+
sh egs/tts/FastSpeech2/run.sh --stage 3 \
|
116 |
+
--infer_expt_dir ckpts/tts/[YourExptName] \
|
117 |
+
--infer_output_dir ckpts/tts/[YourExptName]/result \
|
118 |
+
--infer_mode "single" \
|
119 |
+
--infer_text "This is a clip of generated speech with the given text from a TTS model."
|
120 |
+
```
|
121 |
+
|
122 |
+
We will release a pre-trained FastSpeech2 model trained on LJSpeech. So you can download the pre-trained model and generate speech following the above inference instruction.
|
123 |
+
|
124 |
+
|
125 |
+
```bibtex
|
126 |
+
@inproceedings{ren2020fastspeech,
|
127 |
+
title={FastSpeech 2: Fast and High-Quality End-to-End Text to Speech},
|
128 |
+
author={Ren, Yi and Hu, Chenxu and Tan, Xu and Qin, Tao and Zhao, Sheng and Zhao, Zhou and Liu, Tie-Yan},
|
129 |
+
booktitle={International Conference on Learning Representations},
|
130 |
+
year={2020}
|
131 |
+
}
|
132 |
+
```
|
egs/tts/FastSpeech2/exp_config.json
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/fs2.json",
|
3 |
+
"model_type": "FastSpeech2",
|
4 |
+
"dataset": [
|
5 |
+
"LJSpeech"
|
6 |
+
],
|
7 |
+
"dataset_path": {
|
8 |
+
// TODO: Fill in your dataset path
|
9 |
+
"LJSpeech": "[LJSpeech dataset path]"
|
10 |
+
},
|
11 |
+
// TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts"
|
12 |
+
"log_dir": "ckpts/tts",
|
13 |
+
"preprocess": {
|
14 |
+
// TODO: Fill in the output data path. The default value is "Amphion/data"
|
15 |
+
"processed_dir": "data",
|
16 |
+
"sample_rate": 22050,
|
17 |
+
},
|
18 |
+
"train": {
|
19 |
+
"batch_size": 16,
|
20 |
+
}
|
21 |
+
}
|
egs/tts/FastSpeech2/prepare_mfa.sh
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#!/bin/bash
|
7 |
+
mkdir mfa
|
8 |
+
cd mfa
|
9 |
+
wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.1.0-beta.2/montreal-forced-aligner_linux.tar.gz
|
10 |
+
tar -zxvf montreal-forced-aligner_linux.tar.gz
|
11 |
+
cd mfa
|
12 |
+
mkdir lexicon
|
13 |
+
cd lexicon
|
14 |
+
wget http://www.openslr.org/resources/11/librispeech-lexicon.txt
|
egs/tts/FastSpeech2/run.sh
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
######## Build Experiment Environment ###########
|
7 |
+
exp_dir=$(cd `dirname $0`; pwd)
|
8 |
+
work_dir=$(dirname $(dirname $(dirname $exp_dir)))
|
9 |
+
|
10 |
+
export WORK_DIR=$work_dir
|
11 |
+
export PYTHONPATH=$work_dir
|
12 |
+
export PYTHONIOENCODING=UTF-8
|
13 |
+
|
14 |
+
cd $work_dir/modules/monotonic_align
|
15 |
+
mkdir -p monotonic_align
|
16 |
+
python setup.py build_ext --inplace
|
17 |
+
cd $work_dir
|
18 |
+
|
19 |
+
mfa_dir=$work_dir/mfa
|
20 |
+
echo $mfa_dir
|
21 |
+
|
22 |
+
######## Parse the Given Parameters from the Command ###########
|
23 |
+
# options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir:,name:,stage: -- "$@")
|
24 |
+
options=$(getopt -o c:n:s: --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_mode:,infer_dataset:,infer_testing_set:,infer_text:,name:,stage: -- "$@")  # "s:" because -s/--stage takes a value
|
25 |
+
eval set -- "$options"
|
26 |
+
|
27 |
+
while true; do
|
28 |
+
case $1 in
|
29 |
+
# Experimental Configuration File
|
30 |
+
-c | --config) shift; exp_config=$1 ; shift ;;
|
31 |
+
# Experimental Name
|
32 |
+
-n | --name) shift; exp_name=$1 ; shift ;;
|
33 |
+
# Running Stage
|
34 |
+
-s | --stage) shift; running_stage=$1 ; shift ;;
|
35 |
+
# Visible GPU machines. The default value is "0".
|
36 |
+
--gpu) shift; gpu=$1 ; shift ;;
|
37 |
+
|
38 |
+
# [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
|
39 |
+
--infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
|
40 |
+
# [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
|
41 |
+
--infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
|
42 |
+
# [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generate a single clip of speech.
|
43 |
+
--infer_mode) shift; infer_mode=$1 ; shift ;;
|
44 |
+
# [Only for Inference] The inference dataset. It is only used when the inference mode is "batch".
|
45 |
+
--infer_dataset) shift; infer_dataset=$1 ; shift ;;
|
46 |
+
# [Only for Inference] The inference testing set. It is only used when the inference mode is "batch". It can be "test" set split from the dataset, or "golden_test" carefully selected from the testing set.
|
47 |
+
--infer_testing_set) shift; infer_testing_set=$1 ; shift ;;
|
48 |
+
# [Only for Inference] The text to be synthesized from. It is only used when the inference mode is "single".
|
49 |
+
--infer_text) shift; infer_text=$1 ; shift ;;
|
50 |
+
|
51 |
+
--) shift ; break ;;
|
52 |
+
*) echo "Invalid option: $1"; exit 1 ;;  # abort on unknown options instead of echoing "exit 1" and looping forever
|
53 |
+
esac
|
54 |
+
done
|
55 |
+
|
56 |
+
|
57 |
+
### Value check ###
|
58 |
+
if [ -z "$running_stage" ]; then
|
59 |
+
echo "[Error] Please specify the running stage"
|
60 |
+
exit 1
|
61 |
+
fi
|
62 |
+
|
63 |
+
if [ -z "$exp_config" ]; then
|
64 |
+
exp_config="${exp_dir}"/exp_config.json
|
65 |
+
fi
|
66 |
+
echo "Exprimental Configuration File: $exp_config"
|
67 |
+
|
68 |
+
if [ -z "$gpu" ]; then
|
69 |
+
gpu="0"
|
70 |
+
fi
|
71 |
+
|
72 |
+
######## Features Extraction ###########
|
73 |
+
if [ $running_stage -eq 1 ]; then
|
74 |
+
if [ ! -d "$mfa_dir" ]; then
|
75 |
+
bash ${exp_dir}/prepare_mfa.sh
|
76 |
+
fi
|
77 |
+
CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/tts/preprocess.py \
|
78 |
+
--config=$exp_config \
|
79 |
+
--num_workers=4 \
|
80 |
+
--prepare_alignment=true
|
81 |
+
fi
|
82 |
+
|
83 |
+
######## Training ###########
|
84 |
+
if [ $running_stage -eq 2 ]; then
|
85 |
+
if [ -z "$exp_name" ]; then
|
86 |
+
echo "[Error] Please specify the experiments name"
|
87 |
+
exit 1
|
88 |
+
fi
|
89 |
+
echo "Exprimental Name: $exp_name"
|
90 |
+
|
91 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/tts/train.py \
|
92 |
+
--config $exp_config \
|
93 |
+
--exp_name $exp_name \
|
94 |
+
--log_level debug
|
95 |
+
fi
|
96 |
+
|
97 |
+
######## Inference ###########
|
98 |
+
if [ $running_stage -eq 3 ]; then
|
99 |
+
if [ -z "$infer_expt_dir" ]; then
|
100 |
+
echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
|
101 |
+
exit 1
|
102 |
+
fi
|
103 |
+
|
104 |
+
if [ -z "$infer_output_dir" ]; then
|
105 |
+
infer_output_dir="$infer_expt_dir/result"  # default to result/ under the experiment dir; $expt_dir was never defined
|
106 |
+
fi
|
107 |
+
|
108 |
+
if [ -z "$infer_mode" ]; then
|
109 |
+
echo "[Error] Please specify the inference mode, e.g., \"batch\", \"single\""  # inner quotes escaped so they are actually printed
|
110 |
+
exit 1
|
111 |
+
fi
|
112 |
+
|
113 |
+
if [ "$infer_mode" = "batch" ] && [ -z "$infer_dataset" ]; then
|
114 |
+
echo "[Error] Please specify the dataset used in inference when the inference mode is batch"
|
115 |
+
exit 1
|
116 |
+
fi
|
117 |
+
|
118 |
+
if [ "$infer_mode" = "batch" ] && [ -z "$infer_testing_set" ]; then
|
119 |
+
echo "[Error] Please specify the testing set used in inference when the inference mode is batch"
|
120 |
+
exit 1
|
121 |
+
fi
|
122 |
+
|
123 |
+
if [ "$infer_mode" = "single" ] && [ -z "$infer_text" ]; then
|
124 |
+
echo "[Error] Please specify the text to be synthesized when the inference mode is single"
|
125 |
+
exit 1
|
126 |
+
fi
|
127 |
+
|
128 |
+
if [ "$infer_mode" = "single" ]; then
|
129 |
+
echo 'Text: ' ${infer_text}
|
130 |
+
infer_dataset=None
|
131 |
+
infer_testing_set=None
|
132 |
+
elif [ "$infer_mode" = "batch" ]; then
|
133 |
+
infer_text=''
|
134 |
+
fi
|
135 |
+
|
136 |
+
|
137 |
+
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \
|
138 |
+
--config $exp_config \
|
139 |
+
--acoustics_dir $infer_expt_dir \
|
140 |
+
--output_dir $infer_output_dir \
|
141 |
+
--mode $infer_mode \
|
142 |
+
--dataset $infer_dataset \
|
143 |
+
--testing_set $infer_testing_set \
|
144 |
+
--text "$infer_text" \
|
145 |
+
--log_level debug \
|
146 |
+
--vocoder_dir /mntnfs/lee_data1/chenxi/processed_data/ljspeech/model_ckpt/hifigan/checkpoints
|
147 |
+
|
148 |
+
|
149 |
+
|
150 |
+
fi
|
egs/tts/NaturalSpeech2/exp_config.json
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "egs/tts/NaturalSpeech2/exp_config_base.json",
|
3 |
+
"dataset": [
|
4 |
+
"LibriTTS"
|
5 |
+
],
|
6 |
+
"preprocess": {
|
7 |
+
// Specify the output root path to save the processed data
|
8 |
+
"processed_dir": "[LibriTTS dataset path]",
|
9 |
+
"train_file": "train.json",
|
10 |
+
"valid_file": "test.json",
|
11 |
+
"read_metadata": true,
|
12 |
+
"metadata_dir": "metadata"
|
13 |
+
},
|
14 |
+
// Specify the output root path to save model ckpts and logs
|
15 |
+
"log_dir": "ckpts/tts",
|
16 |
+
"train": {
|
17 |
+
// New trainer and Accelerator
|
18 |
+
"gradient_accumulation_step": 1,
|
19 |
+
"tracker": ["tensorboard"],
|
20 |
+
"max_epoch": 5000,
|
21 |
+
"save_checkpoint_stride": [1],
|
22 |
+
"keep_last": [1000],
|
23 |
+
"run_eval": [true],
|
24 |
+
"dataloader": {
|
25 |
+
"num_worker": 16,
|
26 |
+
"pin_memory": true
|
27 |
+
},
|
28 |
+
"adam": {
|
29 |
+
"lr": 1.0e-4
|
30 |
+
},
|
31 |
+
"use_dynamic_batchsize": true,
|
32 |
+
"batch_size": 8,
|
33 |
+
"max_tokens": 7500,
|
34 |
+
"max_sentences": 32,
|
35 |
+
"lr_warmup_steps": 5000,
|
36 |
+
"lr_scheduler": "cosine",
|
37 |
+
"num_train_steps": 800000
|
38 |
+
}
|
39 |
+
}
|