ChrisPreston committed
Commit · 93f4bab
1 Parent(s): ebe9a08
Upload 95 files

This view is limited to 50 files because it contains too many changes.
- .gitattributes +1 -0
- aqua/clean_model_ckpt_steps_100000.ckpt +3 -0
- aqua/config.yaml +457 -0
- checkpoints/0102_xiaoma_pe/config.yaml +172 -0
- checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt +3 -0
- checkpoints/hubert/hubert.onnx +3 -0
- checkpoints/hubert/hubert_soft.pt +3 -0
- checkpoints/nsf_hifigan/NOTICE.txt +74 -0
- checkpoints/nsf_hifigan/config.json +38 -0
- checkpoints/nsf_hifigan/model +3 -0
- infer.py +81 -0
- infer_tools/__pycache__/f0_static.cpython-38.pyc +0 -0
- infer_tools/__pycache__/infer_tool.cpython-38.pyc +0 -0
- infer_tools/__pycache__/infer_tool_beta.cpython-38.pyc +0 -0
- infer_tools/__pycache__/slicer.cpython-38.pyc +0 -0
- infer_tools/__pycache__/trans_key.cpython-38.pyc +0 -0
- infer_tools/f0_static.py +116 -0
- infer_tools/f0_temp.json +0 -0
- infer_tools/infer_tool.py +201 -0
- infer_tools/infer_tool_beta.py +229 -0
- infer_tools/slicer.py +142 -0
- infer_tools/trans_key.py +67 -0
- modules/__pycache__/encoder.cpython-310.pyc +0 -0
- modules/__pycache__/encoder.cpython-38.pyc +0 -0
- modules/commons/__pycache__/common_layers.cpython-310.pyc +0 -0
- modules/commons/__pycache__/common_layers.cpython-38.pyc +0 -0
- modules/commons/__pycache__/ssim.cpython-310.pyc +0 -0
- modules/commons/__pycache__/ssim.cpython-38.pyc +0 -0
- modules/commons/common_layers.py +675 -0
- modules/commons/ssim.py +84 -0
- modules/diff/__pycache__/diffusion.cpython-310.pyc +0 -0
- modules/diff/__pycache__/diffusion.cpython-38.pyc +0 -0
- modules/diff/__pycache__/net.cpython-310.pyc +0 -0
- modules/diff/__pycache__/net.cpython-38.pyc +0 -0
- modules/diff/diffusion.py +312 -0
- modules/diff/net.py +135 -0
- modules/encoder.py +208 -0
- modules/hubert/__pycache__/cn_hubert.cpython-38.pyc +0 -0
- modules/hubert/__pycache__/hubert_model.cpython-38.pyc +0 -0
- modules/hubert/__pycache__/hubert_onnx.cpython-38.pyc +0 -0
- modules/hubert/cn_hubert.py +40 -0
- modules/hubert/hubert_model.py +243 -0
- modules/hubert/hubert_onnx.py +19 -0
- modules/nsf_hifigan/__pycache__/env.cpython-310.pyc +0 -0
- modules/nsf_hifigan/__pycache__/env.cpython-38.pyc +0 -0
- modules/nsf_hifigan/__pycache__/models.cpython-310.pyc +0 -0
- modules/nsf_hifigan/__pycache__/models.cpython-38.pyc +0 -0
- modules/nsf_hifigan/__pycache__/nvSTFT.cpython-310.pyc +0 -0
- modules/nsf_hifigan/__pycache__/nvSTFT.cpython-38.pyc +0 -0
- modules/nsf_hifigan/__pycache__/utils.cpython-310.pyc +0 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoints/nsf_hifigan/model filter=lfs diff=lfs merge=lfs -text
aqua/clean_model_ckpt_steps_100000.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:14d1e9bf1dde30fcb397ebf91e61e77fc34cf22f6d1d6fd112eba57113a75795
size 227124201
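The three lines above are not the checkpoint itself but a Git LFS pointer: the real 227 MB file lives in LFS storage and is fetched with `git lfs pull` after cloning. As a minimal sketch (not part of this commit), such a pointer can be read as plain key-value pairs:

```python
# Sketch only: parse a git-lfs spec-v1 pointer file into a dict.
# Assumes the three-line "key value" layout shown above.
def read_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    # e.g. {"version": "https://...", "oid": "sha256:14d1...", "size": "227124201"}
    return fields
```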
aqua/config.yaml
ADDED
@@ -0,0 +1,457 @@
K_step: 1000
accumulate_grad_batches: 1
audio_num_mel_bins: 128
audio_sample_rate: 44100
binarization_args:
  shuffle: false
  with_align: true
  with_f0: true
  with_hubert: true
  with_spk_embed: false
  with_wav: false
binarizer_cls: preprocessing.SVCpre.SVCBinarizer
binary_data_dir: data/binary/aquapre
check_val_every_n_epoch: 10
choose_test_manually: false
clip_grad_norm: 1
config_path: F:\diff-svc-main\training\config_nsf.yaml
content_cond_steps: []
cwt_add_f0_loss: false
cwt_hidden_size: 128
cwt_layers: 2
cwt_loss: l1
cwt_std_scale: 0.8
datasets:
- opencpop
debug: false
dec_ffn_kernel_size: 9
dec_layers: 4
decay_steps: 20000
decoder_type: fft
dict_dir: ''
diff_decoder_type: wavenet
diff_loss_type: l2
dilation_cycle_length: 4
dropout: 0.1
ds_workers: 4
dur_enc_hidden_stride_kernel:
- 0,2,3
- 0,2,3
- 0,1,3
dur_loss: mse
dur_predictor_kernel: 3
dur_predictor_layers: 5
enc_ffn_kernel_size: 9
enc_layers: 4
encoder_K: 8
encoder_type: fft
endless_ds: false
f0_bin: 256
f0_max: 1100.0
f0_min: 40.0
f0_static: '{"28.0": 0.07, "29.0": 0.03, "31.0": 0.05, "32.0": 0.08, "33.0": 0.12,
  "34.0": 0.02, "35.0": 0.06, "36.0": 0.02, "37.0": 0.01, "38.0": 0.1, "39.0": 0.05,
  "40.0": 0.09, "41.0": 0.14, "42.0": 0.16, "43.0": 0.03, "44.0": 0.42, "45.0": 0.74,
  "46.0": 1.13, "47.0": 1.49, "48.0": 1.76, "49.0": 2.59, "50.0": 3.03, "51.0": 2.71,
  "52.0": 1.93, "53.0": 1.11, "54.0": 0.78, "55.0": 3.33, "56.0": 20.38, "57.0": 69.6,
  "58.0": 167.04, "59.0": 245.1, "60.0": 318.87, "61.0": 373.41, "62.0": 434.86, "63.0":
  415.63, "64.0": 448.97, "65.0": 452.99, "66.0": 474.88, "67.0": 471.54, "68.0":
  455.78, "69.0": 421.71, "70.0": 372.06, "71.0": 323.85, "72.0": 292.8, "73.0": 238.94,
  "74.0": 190.5, "75.0": 132.86, "76.0": 88.03, "77.0": 53.16, "78.0": 32.96, "79.0":
  23.66, "80.0": 14.74, "81.0": 8.54, "82.0": 5.0, "83.0": 3.32, "84.0": 2.29, "85.0":
  0.91, "total_time": 6576.43}'
ffn_act: gelu
ffn_padding: SAME
fft_size: 2048
fmax: 16000
fmin: 40
fs2_ckpt: ''
gaussian_start: true
gen_dir_name: ''
gen_tgt_spk_id: -1
hidden_size: 256
hop_size: 512
hubert_gpu: true
hubert_path: checkpoints/hubert/hubert_soft.pt
infer: false
keep_bins: 128
lambda_commit: 0.25
lambda_energy: 0.0
lambda_f0: 1.0
lambda_ph_dur: 0.3
lambda_sent_dur: 1.0
lambda_uv: 1.0
lambda_word_dur: 1.0
load_ckpt: ''
log_interval: 100
loud_norm: false
lr: 0.0008
max_beta: 0.02
max_epochs: 3000
max_eval_sentences: 1
max_eval_tokens: 60000
max_frames: 42000
max_input_tokens: 6000
max_sentences: 88
max_tokens: 128000
max_updates: 1000000
mel_loss: ssim:0.5|l1:0.5
mel_vmax: 1.5
mel_vmin: -6.0
min_level_db: -120
no_fs2: true
norm_type: gn
num_ckpt_keep: 10
num_heads: 2
num_sanity_val_steps: 1
num_spk: 1
num_test_samples: 0
num_valid_plots: 10
optimizer_adam_beta1: 0.9
optimizer_adam_beta2: 0.98
out_wav_norm: false
pe_ckpt: checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
pe_enable: false
perform_enhance: true
pitch_ar: false
pitch_enc_hidden_stride_kernel:
- 0,2,5
- 0,2,5
- 0,2,5
pitch_extractor: parselmouth
pitch_loss: l2
pitch_norm: log
pitch_type: frame
pndm_speedup: 10
pre_align_args:
  allow_no_txt: false
  denoise: false
  forced_align: mfa
  txt_processor: zh_g2pM
  use_sox: true
  use_tone: false
pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
predictor_dropout: 0.5
predictor_grad: 0.1
predictor_hidden: -1
predictor_kernel: 5
predictor_layers: 5
prenet_dropout: 0.5
prenet_hidden_size: 256
pretrain_fs_ckpt: ''
processed_data_dir: xxx
profile_infer: false
raw_data_dir: data/raw/aquapre
ref_norm_layer: bn
rel_pos: true
reset_phone_dict: true
residual_channels: 512
residual_layers: 20
save_best: false
save_ckpt: true
save_codes:
- configs
- modules
- src
- utils
save_f0: true
save_gt: false
schedule_type: linear
seed: 1234
sort_by_len: true
speaker_id: aqua
spec_max:
- 0.18377557396888733
- -0.33469653129577637
- -0.3073468506336212
- -0.21027648448944092
- 0.23178215324878693
- 0.5297451019287109
- 0.7021887898445129
- 0.7711099982261658
- 0.7912386059761047
- 0.6609739065170288
- 0.649876058101654
- 0.6327046751976013
- 0.6892049908638
- 0.6026111841201782
- 0.6834777593612671
- 0.7417489886283875
- 0.6040375828742981
- 0.5854794383049011
- 0.7123280167579651
- 0.5886657238006592
- 0.6135984063148499
- 0.5388530492782593
- 0.5932422280311584
- 0.535581111907959
- 0.57913738489151
- 0.6827316880226135
- 0.6265526413917542
- 0.6557696461677551
- 0.6586976647377014
- 0.5687282085418701
- 0.6218562722206116
- 0.6349128484725952
- 0.6176865100860596
- 0.6212958097457886
- 0.6277656555175781
- 0.5551338195800781
- 0.6126622557640076
- 0.5821346640586853
- 0.577056348323822
- 0.5649800300598145
- 0.5984634757041931
- 0.4873456656932831
- 0.47209471464157104
- 0.4387756586074829
- 0.4690910577774048
- 0.4616055190563202
- 0.3555675446987152
- 0.3898852467536926
- 0.3676068186759949
- 0.4632047414779663
- 0.37983986735343933
- 0.3877682685852051
- 0.3099276125431061
- 0.3261813223361969
- 0.34168118238449097
- 0.3004901111125946
- 0.3512653112411499
- 0.2647061347961426
- 0.2685043215751648
- 0.20390087366104126
- 0.1825377196073532
- 0.22067485749721527
- 0.20306138694286346
- 0.12710601091384888
- 0.10927848517894745
- 0.1117628887295723
- 0.14148156344890594
- 0.122605100274086
- 0.08032718300819397
- 0.12159623205661774
- -0.04923255369067192
- -0.07824847847223282
- 0.03441360592842102
- 0.07093964517116547
- -0.1269683688879013
- 0.0027632638812065125
- -0.045093610882759094
- -0.04115259647369385
- 0.029067598283290863
- -0.009453626349568367
- -0.0470033697783947
- -0.04894810542464256
- -0.06236470118165016
- -0.20086997747421265
- -0.2363593578338623
- -0.17289961874485016
- -0.219277486205101
- -0.2934815585613251
- -0.30551621317863464
- -0.2513120770454407
- -0.26792851090431213
- -0.33068278431892395
- -0.37532031536102295
- -0.365634560585022
- -0.3379015326499939
- -0.26979681849479675
- -0.20316314697265625
- -0.2109878957271576
- -0.16927000880241394
- -0.1698305308818817
- -0.2739156186580658
- -0.2700604200363159
- -0.32284122705459595
- -0.44529229402542114
- -0.4002469480037689
- -0.2441970407962799
- -0.19795942306518555
- -0.2462945580482483
- -0.0673084482550621
- -0.22117790579795837
- -0.21418607234954834
- -0.39467209577560425
- -0.4388139843940735
- -0.3227368891239166
- -0.30530503392219543
- -0.3201104998588562
- -0.39839836955070496
- -0.464596688747406
- -0.5399728417396545
- -0.5515261292457581
- -0.520453691482544
- -0.6714966893196106
- -0.6414765119552612
- -0.6108742356300354
- -0.6762520670890808
- -0.7067146301269531
- -0.7586700320243835
- -0.6640384793281555
spec_min:
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.989471912384033
- -4.999994277954102
spk_cond_steps: []
stop_token_weight: 5.0
task_cls: training.task.SVC_task.SVCTask
test_ids: []
test_input_dir: ''
test_num: 0
test_prefixes:
- test
test_set_name: test
timesteps: 1000
train_set_name: train
use_cn_hubert: false
use_crepe: true
use_denoise: false
use_energy_embed: false
use_gt_dur: false
use_gt_f0: false
use_midi: false
use_nsf: true
use_pitch_embed: true
use_pos_embed: true
use_spk_embed: false
use_spk_id: false
use_split_spk_id: false
use_uv: false
use_var_enc: false
use_vec: false
val_check_interval: 2000
valid_num: 0
valid_set_name: valid
vocoder: network.vocoders.nsf_hifigan.NsfHifiGAN
vocoder_ckpt: checkpoints/nsf_hifigan/model
warmup_updates: 2000
wav2spec_eps: 1e-6
weight_decay: 0
win_size: 2048
work_dir: checkpoints/aquapre
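One detail of this config worth noting: `f0_static` is not a YAML mapping but a JSON document stored as a single string, which is exactly how `infer_tool.Svc.evaluate_key` later consumes it via `json.loads`. A minimal sketch (not part of the commit) of reading it back:

```python
# Sketch only: decode the JSON-in-YAML f0_static histogram from aqua/config.yaml.
import json
import yaml

with open("aqua/config.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

f0_static = json.loads(cfg["f0_static"])  # keys are MIDI pitches, values are seconds
print(f0_static["total_time"])            # 6576.43, total voiced time in the dataset
```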
checkpoints/0102_xiaoma_pe/config.yaml
ADDED
@@ -0,0 +1,172 @@
accumulate_grad_batches: 1
audio_num_mel_bins: 80
audio_sample_rate: 24000
base_config:
- configs/tts/lj/fs2.yaml
binarization_args:
  shuffle: false
  with_align: true
  with_f0: true
  with_f0cwt: true
  with_spk_embed: true
  with_txt: true
  with_wav: false
binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
binary_data_dir: data/binary/xiaoma1022_24k_128hop
check_val_every_n_epoch: 10
clip_grad_norm: 1
cwt_add_f0_loss: false
cwt_hidden_size: 128
cwt_layers: 2
cwt_loss: l1
cwt_std_scale: 0.8
debug: false
dec_ffn_kernel_size: 9
dec_layers: 4
decoder_type: fft
dict_dir: ''
dropout: 0.1
ds_workers: 4
dur_enc_hidden_stride_kernel:
- 0,2,3
- 0,2,3
- 0,1,3
dur_loss: mse
dur_predictor_kernel: 3
dur_predictor_layers: 2
enc_ffn_kernel_size: 9
enc_layers: 4
encoder_K: 8
encoder_type: fft
endless_ds: true
ffn_act: gelu
ffn_padding: SAME
fft_size: 512
fmax: 12000
fmin: 30
gen_dir_name: ''
hidden_size: 256
hop_size: 128
infer: false
lambda_commit: 0.25
lambda_energy: 0.1
lambda_f0: 1.0
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_uv: 1.0
lambda_word_dur: 1.0
load_ckpt: ''
log_interval: 100
loud_norm: false
lr: 2.0
max_epochs: 1000
max_eval_sentences: 1
max_eval_tokens: 60000
max_frames: 5000
max_input_tokens: 1550
max_sentences: 100000
max_tokens: 20000
max_updates: 60000
mel_loss: l1
mel_vmax: 1.5
mel_vmin: -6
min_level_db: -120
norm_type: gn
num_ckpt_keep: 3
num_heads: 2
num_sanity_val_steps: 5
num_spk: 1
num_test_samples: 20
num_valid_plots: 10
optimizer_adam_beta1: 0.9
optimizer_adam_beta2: 0.98
out_wav_norm: false
pitch_ar: false
pitch_enc_hidden_stride_kernel:
- 0,2,5
- 0,2,5
- 0,2,5
pitch_extractor_conv_layers: 2
pitch_loss: l1
pitch_norm: log
pitch_type: frame
pre_align_args:
  allow_no_txt: false
  denoise: false
  forced_align: mfa
  txt_processor: en
  use_sox: false
  use_tone: true
pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
predictor_dropout: 0.5
predictor_grad: 0.1
predictor_hidden: -1
predictor_kernel: 5
predictor_layers: 2
prenet_dropout: 0.5
prenet_hidden_size: 256
pretrain_fs_ckpt: ''
processed_data_dir: data/processed/ljspeech
profile_infer: false
raw_data_dir: data/raw/LJSpeech-1.1
ref_norm_layer: bn
reset_phone_dict: true
save_best: false
save_ckpt: true
save_codes:
- configs
- modules
- tasks
- utils
- usr
save_f0: false
save_gt: false
seed: 1234
sort_by_len: true
stop_token_weight: 5.0
task_cls: tasks.tts.pe.PitchExtractionTask
test_ids:
- 68
- 70
- 74
- 87
- 110
- 172
- 190
- 215
- 231
- 294
- 316
- 324
- 402
- 422
- 485
- 500
- 505
- 508
- 509
- 519
test_input_dir: ''
test_num: 523
test_set_name: test
train_set_name: train
use_denoise: false
use_energy_embed: false
use_gt_dur: false
use_gt_f0: false
use_pitch_embed: true
use_pos_embed: true
use_spk_embed: false
use_spk_id: false
use_split_spk_id: false
use_uv: true
use_var_enc: false
val_check_interval: 2000
valid_num: 348
valid_set_name: valid
vocoder: pwg
vocoder_ckpt: ''
warmup_updates: 2000
weight_decay: 0
win_size: 512
work_dir: checkpoints/0102_xiaoma_pe
checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1863f12324e43783089ab933edeeb969106b851e30d71019ebbaa9b82099d82a
size 39141959
checkpoints/hubert/hubert.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c72bad89da99152077bf8157ff75beca7c6dc966ea01a6a0fb3777f99e77aa9b
size 378353321
checkpoints/hubert/hubert_soft.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e82e7d079df05fe3aa535f6f7d42d309bdae1d2a53324e2b2386c56721f4f649
size 378435957
checkpoints/nsf_hifigan/NOTICE.txt
ADDED
@@ -0,0 +1,74 @@
--- DiffSinger Community Vocoder ---

ARCHITECTURE: NSF-HiFiGAN
RELEASE DATE: 2022-12-11

HYPER PARAMETERS:
 - 44100 sample rate
 - 128 mel bins
 - 512 hop size
 - 2048 window size
 - fmin at 40Hz
 - fmax at 16000Hz


NOTICE:

All model weights in the [DiffSinger Community Vocoder Project](https://openvpi.github.io/vocoders/), including
model weights in this directory, are provided by the [OpenVPI Team](https://github.com/openvpi/), under the
[Attribution-NonCommercial-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-nc-sa/4.0/) license.


ACKNOWLEDGEMENTS:

Training data of this vocoder is provided and permitted by the following organizations, societies and individuals:

孙飒 https://www.qfssr.cn
赤松_Akamatsu https://www.zhibin.club
乐威 https://www.zhibin.club
伯添 https://space.bilibili.com/24087011
雲宇光 https://space.bilibili.com/660675050
橙子言 https://space.bilibili.com/318486464
人衣大人 https://space.bilibili.com/2270344
玖蝶 https://space.bilibili.com/676771003
Yuuko
白夜零BYL https://space.bilibili.com/1605040503
嗷天 https://space.bilibili.com/5675252
洛泠羽 https://space.bilibili.com/347373318
灰条纹的灰猫君 https://space.bilibili.com/2083633
幽寂 https://space.bilibili.com/478860
恶魔王女 https://space.bilibili.com/2475098
AlexYHX 芮晴
绮萱 https://y.qq.com/n/ryqq/singer/003HjD6H4aZn1K
诗芸 https://y.qq.com/n/ryqq/singer/0005NInj142zm0
汐蕾 https://y.qq.com/n/ryqq/singer/0023cWMH1Bq1PJ
1262917464
炜阳
叶卡yolka
幸の夏 https://space.bilibili.com/1017297686
暮色未量 https://space.bilibili.com/272904686
晓寞sama https://space.bilibili.com/3463394
没头绪的节操君
串串BunC https://space.bilibili.com/95817834
落雨 https://space.bilibili.com/1292427
长尾巴的翎艾 https://space.bilibili.com/1638666
声闻计划 https://space.bilibili.com/392812269
唐家大小姐 http://5sing.kugou.com/palmusic/default.html
不伊子

Training machines are provided by:

花儿不哭 https://space.bilibili.com/5760446


TERMS OF REDISTRIBUTIONS:

1. Do not sell this vocoder, or charge any fees from redistributing it, as prohibited by
   the license.
2. Include a copy of the CC BY-NC-SA 4.0 license, or a link referring to it.
3. Include a copy of this notice, or any other notices informing that this vocoder is
   provided by the OpenVPI Team, that this vocoder is licensed under CC BY-NC-SA 4.0, and
   with a complete acknowledgement list as shown above.
4. If you fine-tuned or modified the weights, leave a notice about what has been changed.
5. (Optional) Leave a link to the official release page of the vocoder, and tell users
   that other versions and future updates of this vocoder can be obtained from the website.
checkpoints/nsf_hifigan/config.json
ADDED
@@ -0,0 +1,38 @@
{
    "resblock": "1",
    "num_gpus": 4,
    "batch_size": 10,
    "learning_rate": 0.0002,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,

    "upsample_rates": [8, 8, 2, 2, 2],
    "upsample_kernel_sizes": [16, 16, 4, 4, 4],
    "upsample_initial_channel": 512,
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "discriminator_periods": [3, 5, 7, 11, 17, 23, 37],

    "segment_size": 16384,
    "num_mels": 128,
    "num_freq": 1025,
    "n_fft": 2048,
    "hop_size": 512,
    "win_size": 2048,

    "sampling_rate": 44100,

    "fmin": 40,
    "fmax": 16000,
    "fmax_for_loss": null,

    "num_workers": 16,

    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "world_size": 1
    }
}
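For orientation, the vocoder's timing parameters above fix the mel frame rate used throughout the repo. A quick arithmetic check (not code from this commit):

```python
# hop_size / sampling_rate is the duration of one mel frame; this is the same
# scaling f0_static.static_f0_time applies when converting frame counts to seconds.
sampling_rate = 44100
hop_size = 512
print(hop_size / sampling_rate)   # ≈ 0.0116 s per frame
print(sampling_rate / hop_size)   # ≈ 86.1 frames per second
```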
checkpoints/nsf_hifigan/model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2c576b63b7ed952161b70fad34e0562ace502ce689195520d8a2a6c051de29d6
size 56825430
infer.py
ADDED
@@ -0,0 +1,81 @@
import io
from pathlib import Path

import numpy as np
import soundfile

from infer_tools import infer_tool
from infer_tools import slicer
from infer_tools.infer_tool import Svc
from utils.hparams import hparams


def run_clip(raw_audio_path, svc_model, key, acc, use_crepe, spk_id=0, auto_key=False, out_path=None, slice_db=-40,
             **kwargs):
    print('code version:2023-01-22')

    clean_name = Path(raw_audio_path).name.split(".")[0]
    infer_tool.format_wav(raw_audio_path)
    wav_path = Path(raw_audio_path).with_suffix('.wav')
    key = svc_model.evaluate_key(wav_path, key, auto_key)
    chunks = slicer.cut(wav_path, db_thresh=slice_db)
    audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)

    count = 0
    f0_tst, f0_pred, audio = [], [], []
    for (slice_tag, data) in audio_data:
        print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
        length = int(np.ceil(len(data) / audio_sr * hparams['audio_sample_rate']))
        raw_path = io.BytesIO()
        soundfile.write(raw_path, data, audio_sr, format="wav")
        raw_path.seek(0)
        if slice_tag:
            # silent segment: skip inference and emit zeros of matching length
            print('jump empty segment')
            _f0_tst, _f0_pred, _audio = (
                np.zeros(int(np.ceil(length / hparams['hop_size']))),
                np.zeros(int(np.ceil(length / hparams['hop_size']))),
                np.zeros(length))
        else:
            _f0_tst, _f0_pred, _audio = svc_model.infer(raw_path, spk_id=spk_id, key=key, acc=acc, use_crepe=use_crepe)
        fix_audio = np.zeros(length)
        fix_audio[:] = np.mean(_audio)
        fix_audio[:len(_audio)] = _audio[0 if len(_audio) < len(fix_audio) else len(_audio) - len(fix_audio):]
        f0_tst.extend(_f0_tst)
        f0_pred.extend(_f0_pred)
        audio.extend(list(fix_audio))
        count += 1
    if out_path is None:
        # note: project_name, step and accelerate are module-level globals set in __main__
        out_path = f'./results/{clean_name}_{key}key_{project_name}_{hparams["residual_channels"]}_{hparams["residual_layers"]}_{int(step / 1000)}k_{accelerate}x.{kwargs["format"]}'
    soundfile.write(out_path, audio, hparams["audio_sample_rate"], 'PCM_16', format=out_path.split('.')[-1])
    return np.array(f0_tst), np.array(f0_pred), audio


if __name__ == '__main__':
    # project folder name (the one used for training)
    project_name = "open-aqua"
    model_path = f'./checkpoints/{project_name}/model_ckpt_steps_90000.ckpt'
    config_path = f'./checkpoints/{project_name}/config.yaml'

    # multiple wav/ogg files are supported; place them in the raw folder, with file extensions
    file_names = ["横竖撇点折-main-2key.wav"]
    spk_id = "single"
    # adaptive key shifting (single-speaker models only)
    auto_key = False
    trans = [0]  # pitch shift in semitones (positive or negative); one entry per file above,
    # short lists are padded with the first value
    # acceleration (speedup) factor
    accelerate = 1
    hubert_gpu = True
    wav_format = 'wav'
    step = int(model_path.split("_")[-1].split(".")[0])

    # no changes needed below this line
    infer_tool.mkdir(["./raw", "./results"])
    infer_tool.fill_a_to_b(trans, file_names)

    model = Svc(project_name, config_path, hubert_gpu, model_path, onnx=False)
    for f_name, tran in zip(file_names, trans):
        if "." not in f_name:
            f_name += ".wav"
        audio_path = f"./raw/{f_name}"
        run_clip(raw_audio_path=audio_path, svc_model=model, key=tran, acc=accelerate, use_crepe=False,
                 spk_id=spk_id, auto_key=auto_key, project_name=project_name, format=wav_format)
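Because the default `out_path` in `run_clip` interpolates the module-level globals `project_name`, `step`, and `accelerate` set in `__main__`, calling it from another script is easiest with an explicit `out_path`. A hedged usage sketch (paths and checkpoint names are placeholders, not files guaranteed by this commit):

```python
# Sketch only: call run_clip from outside this script, bypassing the
# global-dependent default output naming by passing out_path explicitly.
from infer import run_clip
from infer_tools.infer_tool import Svc

model = Svc("open-aqua", "./checkpoints/open-aqua/config.yaml",
            hubert_gpu=True, model_path="./checkpoints/open-aqua/model_ckpt_steps_90000.ckpt")
f0_gt, f0_pred, audio = run_clip("./raw/input.wav", model, key=0, acc=1,
                                 use_crepe=False, out_path="./results/output.wav")
```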
infer_tools/__pycache__/f0_static.cpython-38.pyc
ADDED
Binary file (5.12 kB)

infer_tools/__pycache__/infer_tool.cpython-38.pyc
ADDED
Binary file (7.26 kB)

infer_tools/__pycache__/infer_tool_beta.cpython-38.pyc
ADDED
Binary file (7.8 kB)

infer_tools/__pycache__/slicer.cpython-38.pyc
ADDED
Binary file (3.84 kB)

infer_tools/__pycache__/trans_key.cpython-38.pyc
ADDED
Binary file (2 kB)
infer_tools/f0_static.py
ADDED
@@ -0,0 +1,116 @@
import json
import os
import shutil
from functools import reduce
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import yaml
from pylab import xticks, np
from tqdm import tqdm

from modules.vocoders.nsf_hifigan import NsfHifiGAN
from preprocessing.process_pipeline import get_pitch_parselmouth, get_pitch_crepe
from utils.hparams import set_hparams, hparams

head_list = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]


def compare_pitch(f0_static_dict, pitch_time_temp, trans_key=0):
    return sum({k: v * f0_static_dict[str(k + trans_key)] for k, v in pitch_time_temp.items() if
                str(k + trans_key) in f0_static_dict}.values())


def f0_to_pitch(ff):
    f0_pitch = 69 + 12 * np.log2(ff / 440)
    return round(f0_pitch, 0)


def pitch_to_name(pitch):
    return f"{head_list[int(pitch % 12)]}{int(pitch / 12) - 1}"


def get_f0(audio_path, crepe=False):
    wav, mel = NsfHifiGAN.wav2spec(audio_path)
    if crepe:
        f0, pitch_coarse = get_pitch_crepe(wav, mel, hparams)
    else:
        f0, pitch_coarse = get_pitch_parselmouth(wav, mel, hparams)
    return f0


def merge_f0_dict(dict_list):
    def sum_dict(a, b):
        temp = dict()
        for key in a.keys() | b.keys():
            temp[key] = sum([d.get(key, 0) for d in (a, b)])
        return temp

    return reduce(sum_dict, dict_list)


def collect_f0(f0):
    pitch_num = {}
    pitch_list = [f0_to_pitch(x) for x in f0[f0 > 0]]
    for key in pitch_list:
        pitch_num[key] = pitch_num.get(key, 0) + 1
    return pitch_num


def static_f0_time(f0):
    if isinstance(f0, dict):
        pitch_num = merge_f0_dict({k: collect_f0(v) for k, v in f0.items()}.values())
    else:
        pitch_num = collect_f0(f0)
    static_pitch_time = {}
    sort_key = sorted(pitch_num.keys())
    for key in sort_key:
        static_pitch_time[key] = round(pitch_num[key] * hparams['hop_size'] / hparams['audio_sample_rate'], 2)
    return static_pitch_time


def get_end_file(dir_path, end):
    file_lists = []
    for root, dirs, files in os.walk(dir_path):
        files = [f for f in files if f[0] != '.']
        dirs[:] = [d for d in dirs if d[0] != '.']
        for f_file in files:
            if f_file.endswith(end):
                file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
    return file_lists


if __name__ == "__main__":
    # add an f0_static vocal-range statistic to the config file
    config_path = "F:/sovits/diff-svc-main/checkpoints/aquapre/config.yaml"
    hparams = set_hparams(config=config_path, exp_name='', infer=True, reset=True, hparams_str='', print_hparams=False)
    f0_dict = {}
    # collect all wav files under the batch folder
    wav_paths = get_end_file("F:/sovits/diff-svc-main/batch/aquapre", "wav")
    # extract f0 with parselmouth
    with tqdm(total=len(wav_paths)) as p_bar:
        p_bar.set_description('Processing')
        for wav_path in wav_paths:
            f0_dict[wav_path] = get_f0(wav_path, crepe=False)
            p_bar.update(1)
    pitch_time = static_f0_time(f0_dict)
    total_time = round(sum(pitch_time.values()), 2)
    pitch_time["total_time"] = total_time
    print(f"total time: {total_time}s")
    shutil.copy(config_path, f"{Path(config_path).parent}\\back_{Path(config_path).name}")
    with open(config_path, encoding='utf-8') as f:
        _hparams = yaml.safe_load(f)
    _hparams['f0_static'] = json.dumps(pitch_time)
    with open(config_path, 'w', encoding='utf-8') as f:
        yaml.safe_dump(_hparams, f)
    print("a backup of the original config has been created in the same directory: back_config.yaml")
    print("vocal-range statistics saved to the config file; this model can now use automatic key adaptation")
    matplotlib.use('TkAgg')
    plt.title("Dataset vocal-range statistics", fontproperties='SimHei')
    plt.xlabel("Pitch", fontproperties='SimHei')
    plt.ylabel("Duration (s)", fontproperties='SimHei')
    xticks_labels = [pitch_to_name(i) for i in range(36, 96)]
    xticks(np.linspace(36, 96, 60, endpoint=True), xticks_labels)
    plt.plot(pitch_time.keys(), pitch_time.values(), color='dodgerblue')
    plt.show()
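`f0_to_pitch` above is the standard Hz-to-MIDI conversion, 69 + 12·log2(f/440), so A4 = 440 Hz lands exactly on note 69, and `pitch_to_name` then renders the note name. A small worked example:

```python
# 220 Hz -> 57.0 (A3), 440 Hz -> 69.0 (A4), 880 Hz -> 81.0 (A5)
import numpy as np

for hz in (220.0, 440.0, 880.0):
    print(hz, 69 + 12 * np.log2(hz / 440))

head_list = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
print(f"{head_list[69 % 12]}{69 // 12 - 1}")  # "A4", matching pitch_to_name(69)
```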
infer_tools/f0_temp.json
ADDED
The diff for this file is too large to render.
infer_tools/infer_tool.py
ADDED
@@ -0,0 +1,201 @@
import json
import os
import time
from io import BytesIO
from pathlib import Path

import librosa
import numpy as np
import soundfile
import torch

import utils
from infer_tools.f0_static import compare_pitch, static_f0_time
from modules.diff.diffusion import GaussianDiffusion
from modules.diff.net import DiffNet
from modules.vocoders.nsf_hifigan import NsfHifiGAN
from preprocessing.hubertinfer import HubertEncoder
from preprocessing.process_pipeline import File2Batch, get_pitch_parselmouth
from utils.hparams import hparams, set_hparams
from utils.pitch_utils import denorm_f0, norm_interp_f0


def timeit(func):
    def run(*args, **kwargs):
        t = time.time()
        res = func(*args, **kwargs)
        print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
        return res

    return run


def format_wav(audio_path):
    if Path(audio_path).suffix == '.wav':
        return
    raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)
    soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate)


def fill_a_to_b(a, b):
    if len(a) < len(b):
        for _ in range(0, len(b) - len(a)):
            a.append(a[0])


def get_end_file(dir_path, end):
    file_lists = []
    for root, dirs, files in os.walk(dir_path):
        files = [f for f in files if f[0] != '.']
        dirs[:] = [d for d in dirs if d[0] != '.']
        for f_file in files:
            if f_file.endswith(end):
                file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
    return file_lists


def mkdir(paths: list):
    for path in paths:
        if not os.path.exists(path):
            os.mkdir(path)


class Svc:
    def __init__(self, project_name, config_name, hubert_gpu, model_path, onnx=False):
        self.project_name = project_name
        self.DIFF_DECODERS = {
            'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']),
        }

        self.model_path = model_path
        self.dev = torch.device("cuda")

        self._ = set_hparams(config=config_name, exp_name=self.project_name, infer=True,
                             reset=True, hparams_str='', print_hparams=False)

        hparams['hubert_gpu'] = hubert_gpu
        self.hubert = HubertEncoder(hparams['hubert_path'], onnx=onnx)
        self.model = GaussianDiffusion(
            phone_encoder=self.hubert,
            out_dims=hparams['audio_num_mel_bins'],
            denoise_fn=self.DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'],
            K_step=hparams['K_step'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
        )
        utils.load_ckpt(self.model, self.model_path, 'model', force=True, strict=True)
        self.model.cuda()
        self.vocoder = NsfHifiGAN()

    def infer(self, in_path, key, acc, spk_id=0, use_crepe=True, singer=False):
        batch = self.pre(in_path, acc, spk_id, use_crepe)
        # f0 is kept on a log2 scale here, so adding key/12 shifts pitch by `key` semitones
        batch['f0'] = batch['f0'] + (key / 12)
        batch['f0'][batch['f0'] > np.log2(hparams['f0_max'])] = 0

        @timeit
        def diff_infer():
            spk_embed = batch.get('spk_embed') if not hparams['use_spk_id'] else batch.get('spk_ids')
            energy = batch.get('energy').cuda() if batch.get('energy') else None
            if spk_embed is None:
                spk_embed = torch.LongTensor([0])
            diff_outputs = self.model(
                hubert=batch['hubert'].cuda(), spk_embed_id=spk_embed.cuda(), mel2ph=batch['mel2ph'].cuda(),
                f0=batch['f0'].cuda(), energy=energy, ref_mels=batch["mels"].cuda(), infer=True)
            return diff_outputs

        outputs = diff_infer()
        batch['outputs'] = outputs['mel_out']
        batch['mel2ph_pred'] = outputs['mel2ph']
        batch['f0_gt'] = denorm_f0(batch['f0'], batch['uv'], hparams)
        batch['f0_pred'] = outputs.get('f0_denorm')
        return self.after_infer(batch, singer, in_path)

    @timeit
    def after_infer(self, prediction, singer, in_path):
        for k, v in prediction.items():
            if type(v) is torch.Tensor:
                prediction[k] = v.cpu().numpy()

        # remove paddings
        mel_gt = prediction["mels"]
        mel_gt_mask = np.abs(mel_gt).sum(-1) > 0

        mel_pred = prediction["outputs"]
        mel_pred_mask = np.abs(mel_pred).sum(-1) > 0
        mel_pred = mel_pred[mel_pred_mask]
        mel_pred = np.clip(mel_pred, hparams['mel_vmin'], hparams['mel_vmax'])

        f0_gt = prediction.get("f0_gt")
        f0_pred = prediction.get("f0_pred")
        if f0_pred is not None:
            f0_gt = f0_gt[mel_gt_mask]
            if len(f0_pred) > len(mel_pred_mask):
                f0_pred = f0_pred[:len(mel_pred_mask)]
            f0_pred = f0_pred[mel_pred_mask]
        torch.cuda.is_available() and torch.cuda.empty_cache()

        if singer:
            data_path = in_path.replace("batch", "singer_data")
            mel_path = data_path[:-4] + "_mel.npy"
            f0_path = data_path[:-4] + "_f0.npy"
            np.save(mel_path, mel_pred)
            np.save(f0_path, f0_pred)
        wav_pred = self.vocoder.spec2wav(mel_pred, f0=f0_pred)
        return f0_gt, f0_pred, wav_pred

    def pre(self, wav_fn, accelerate, spk_id=0, use_crepe=True):
        if isinstance(wav_fn, BytesIO):
            item_name = self.project_name
        else:
            song_info = wav_fn.split('/')
            item_name = song_info[-1].split('.')[-2]
        temp_dict = {'wav_fn': wav_fn, 'spk_id': spk_id, 'id': 0}

        temp_dict = File2Batch.temporary_dict2processed_input(item_name, temp_dict, self.hubert, infer=True,
                                                              use_crepe=use_crepe)
        hparams['pndm_speedup'] = accelerate
        batch = File2Batch.processed_input2batch([getitem(temp_dict)])
        return batch

    def evaluate_key(self, wav_path, key, auto_key):
        if "f0_static" in hparams.keys():
            f0_static = json.loads(hparams['f0_static'])
            wav, mel = self.vocoder.wav2spec(wav_path)
            input_f0 = get_pitch_parselmouth(wav, mel, hparams)[0]
            pitch_time_temp = static_f0_time(input_f0)
            eval_dict = {}
            for trans_key in range(-12, 12):
                eval_dict[trans_key] = compare_pitch(f0_static, pitch_time_temp, trans_key=trans_key)
            sort_key = sorted(eval_dict, key=eval_dict.get, reverse=True)[:5]
            print(f"recommended key shifts: {sort_key}")
            if auto_key:
                print(f"auto key is enabled; your input key is overridden with {sort_key[0]} (controlled by the auto_key parameter)")
                return sort_key[0]
        else:
            print("config is missing f0_static, so automatic key adaptation is unavailable; it can be added via infer_tools/f0_static")
        return key


def getitem(item):
    max_frames = hparams['max_frames']
    spec = torch.Tensor(item['mel'])[:max_frames]
    mel2ph = torch.LongTensor(item['mel2ph'])[:max_frames] if 'mel2ph' in item else None
    f0, uv = norm_interp_f0(item["f0"][:max_frames], hparams)
    hubert = torch.Tensor(item['hubert'][:hparams['max_input_tokens']])
    pitch = torch.LongTensor(item.get("pitch"))[:max_frames]
    sample = {
        "id": item['id'],
        "spk_id": item['spk_id'],
        "item_name": item['item_name'],
        "hubert": hubert,
        "mel": spec,
        "pitch": pitch,
        "f0": f0,
        "uv": uv,
        "mel2ph": mel2ph,
        "mel_nonpadding": spec.abs().sum(-1) > 0,
    }
    if hparams['use_energy_embed']:
        sample['energy'] = item['energy']
    return sample
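The `batch['f0'] = batch['f0'] + (key / 12)` line in `Svc.infer` works because `f0` is stored on a log2 scale (it is clipped against `np.log2(hparams['f0_max'])` on the very next line), so adding k/12 in that domain multiplies frequency by 2^(k/12), i.e. a shift of k semitones. A quick numeric check:

```python
# Sketch only: the semitone shift performed in the log2-f0 domain.
import numpy as np

f0_hz = 440.0
key = 2                                  # transpose up two semitones
print(2 ** (np.log2(f0_hz) + key / 12))  # ≈ 493.88 Hz (B4)
```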
infer_tools/infer_tool_beta.py
ADDED
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import time
|
4 |
+
from io import BytesIO
|
5 |
+
from pathlib import Path
|
6 |
+
|
7 |
+
import librosa
|
8 |
+
import numpy as np
|
9 |
+
import soundfile
|
10 |
+
import torch
|
11 |
+
|
12 |
+
import utils
|
13 |
+
from infer_tools.f0_static import compare_pitch, static_f0_time
|
14 |
+
from modules.diff.diffusion import GaussianDiffusion
|
15 |
+
from modules.diff.net import DiffNet
|
16 |
+
from modules.vocoders.nsf_hifigan import NsfHifiGAN
|
17 |
+
from preprocessing.hubertinfer import HubertEncoder
|
18 |
+
from preprocessing.process_pipeline import File2Batch, get_pitch_parselmouth
|
19 |
+
from utils.hparams import hparams, set_hparams
|
20 |
+
from utils.pitch_utils import denorm_f0, norm_interp_f0
|
21 |
+
|
22 |
+
|
23 |
+
def timeit(func):
|
24 |
+
def run(*args, **kwargs):
|
25 |
+
t = time.time()
|
26 |
+
res = func(*args, **kwargs)
|
27 |
+
print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
|
28 |
+
return res
|
29 |
+
|
30 |
+
return run
|
31 |
+
|
32 |
+
|
33 |
+
def format_wav(audio_path):
|
34 |
+
if Path(audio_path).suffix == '.wav':
|
35 |
+
return
|
36 |
+
raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)
|
37 |
+
soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate)
|
38 |
+
|
39 |
+
|
40 |
+
def fill_a_to_b(a, b):
|
41 |
+
if len(a) < len(b):
|
42 |
+
for _ in range(0, len(b) - len(a)):
|
43 |
+
a.append(a[0])
|
44 |
+
|
45 |
+
|
46 |
+
def get_end_file(dir_path, end):
|
47 |
+
file_lists = []
|
48 |
+
for root, dirs, files in os.walk(dir_path):
|
49 |
+
files = [f for f in files if f[0] != '.']
|
50 |
+
dirs[:] = [d for d in dirs if d[0] != '.']
|
51 |
+
for f_file in files:
|
52 |
+
if f_file.endswith(end):
|
53 |
+
file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
|
54 |
+
return file_lists
|
55 |
+
|
56 |
+
|
57 |
+
def mkdir(paths: list):
|
58 |
+
for path in paths:
|
59 |
+
if not os.path.exists(path):
|
60 |
+
os.mkdir(path)
|
61 |
+
|
62 |
+
|
63 |
+
class Svcb:
|
64 |
+
def __init__(self, project_name, config_name, hubert_gpu, model_path, onnx=False):
|
65 |
+
self.project_name = project_name
|
66 |
+
self.DIFF_DECODERS = {
|
67 |
+
'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']),
|
68 |
+
}
|
69 |
+
|
70 |
+
self.model_path = model_path
|
71 |
+
self.dev = torch.device("cuda")
|
72 |
+
|
73 |
+
self._ = set_hparams(config=config_name, exp_name=self.project_name, infer=True,
|
74 |
+
reset=True, hparams_str='', print_hparams=False)
|
75 |
+
|
76 |
+
self.mel_bins = hparams['audio_num_mel_bins']
|
77 |
+
hparams['hubert_gpu'] = hubert_gpu
|
78 |
+
self.hubert = HubertEncoder(hparams['hubert_path'], onnx=onnx)
|
79 |
+
self.model = GaussianDiffusion(
|
80 |
+
phone_encoder=self.hubert,
|
81 |
+
out_dims=self.mel_bins, denoise_fn=self.DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
|
82 |
+
timesteps=hparams['timesteps'],
|
83 |
+
K_step=hparams['K_step'],
|
84 |
+
loss_type=hparams['diff_loss_type'],
|
85 |
+
            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
        )
        utils.load_ckpt(self.model, self.model_path, 'model', force=True, strict=True)
        self.model.cuda()
        self.vocoder = NsfHifiGAN()

    # def process_batch_f0(batch_f0, hparams):
    #     pitch_num = collect_f0(batch_f0)
    #     pitch_time = {}
    #     sort_key = sorted(pitch_num.keys())
    #     for key in sort_key:
    #         pitch_time[key] = round(pitch_num[key] * hparams['hop_size'] / hparams['audio_sample_rate'], 2)
    #     return pitch_time

    def infer_autokey(self, in_path, key, acc, spk_id=0, use_crepe=False):
        batch, temp_dict = self.pre(in_path, acc, spk_id, use_crepe)
        input_f0 = temp_dict['f0']
        if "f0_static" in hparams.keys():
            f0_static = json.loads(hparams['f0_static'])
            pitch_time_temp = static_f0_time(input_f0)
            eval_dict = {}
            for trans_key in range(-12, 12):
                eval_dict[trans_key] = compare_pitch(f0_static, pitch_time_temp, trans_key=trans_key)
            sort_key = sorted(eval_dict, key=eval_dict.get, reverse=True)[:5]
            print(f"Recommended key shifts: {sort_key}")
            print(f"Auto key-shift is enabled; your input key is overridden with {sort_key[0]} (controlled by the auto_key parameter)")
            if sort_key[0] > 6:
                key = sort_key[0] + 6
            else:
                key = sort_key[0]
        return key, in_path, batch

    # def infer(self, in_path, key, acc, spk_id=0, use_crepe=True, singer=False):
    #     batch = self.pre(in_path, acc, spk_id, use_crepe)

    def infer(self, in_path, key, batch, singer=False):
        batch['f0'] = batch['f0'] + (key / 12)
        batch['f0'][batch['f0'] > np.log2(hparams['f0_max'])] = 0

        @timeit
        def diff_infer():
            spk_embed = batch.get('spk_embed') if not hparams['use_spk_id'] else batch.get('spk_ids')
            energy = batch.get('energy').cuda() if batch.get('energy') else None
            if spk_embed is None:
                spk_embed = torch.LongTensor([0])
            diff_outputs = self.model(
                hubert=batch['hubert'].cuda(), spk_embed_id=spk_embed.cuda(), mel2ph=batch['mel2ph'].cuda(),
                f0=batch['f0'].cuda(), energy=energy, ref_mels=batch["mels"].cuda(), infer=True)
            return diff_outputs

        outputs = diff_infer()
        batch['outputs'] = outputs['mel_out']
        batch['mel2ph_pred'] = outputs['mel2ph']
        batch['f0_gt'] = denorm_f0(batch['f0'], batch['uv'], hparams)
        batch['f0_pred'] = outputs.get('f0_denorm')
        return self.after_infer(batch, singer, in_path)

    @timeit
    def after_infer(self, prediction, singer, in_path):
        for k, v in prediction.items():
            if type(v) is torch.Tensor:
                prediction[k] = v.cpu().numpy()

        # remove paddings
        mel_gt = prediction["mels"]
        mel_gt_mask = np.abs(mel_gt).sum(-1) > 0

        mel_pred = prediction["outputs"]
        mel_pred_mask = np.abs(mel_pred).sum(-1) > 0
        mel_pred = mel_pred[mel_pred_mask]
        mel_pred = np.clip(mel_pred, hparams['mel_vmin'], hparams['mel_vmax'])

        f0_gt = prediction.get("f0_gt")
        f0_pred = prediction.get("f0_pred")
        if f0_pred is not None:
            f0_gt = f0_gt[mel_gt_mask]
            if len(f0_pred) > len(mel_pred_mask):
                f0_pred = f0_pred[:len(mel_pred_mask)]
            f0_pred = f0_pred[mel_pred_mask]
        torch.cuda.is_available() and torch.cuda.empty_cache()

        if singer:
            data_path = in_path.replace("batch", "singer_data")
            mel_path = data_path[:-4] + "_mel.npy"
            f0_path = data_path[:-4] + "_f0.npy"
            np.save(mel_path, mel_pred)
            np.save(f0_path, f0_pred)
        wav_pred = self.vocoder.spec2wav(mel_pred, f0=f0_pred)
        return f0_gt, f0_pred, wav_pred

    def pre(self, wav_fn, accelerate, spk_id=0, use_crepe=True):
        if isinstance(wav_fn, BytesIO):
            item_name = self.project_name
        else:
            song_info = wav_fn.split('/')
            item_name = song_info[-1].split('.')[-2]
        temp_dict = {'wav_fn': wav_fn, 'spk_id': spk_id, 'id': 0}

        temp_dict = File2Batch.temporary_dict2processed_input(item_name, temp_dict, self.hubert, infer=True,
                                                              use_crepe=use_crepe)
        hparams['pndm_speedup'] = accelerate
        batch = File2Batch.processed_input2batch([getitem(temp_dict)])
        return batch, temp_dict

    def evaluate_key(self, wav_path, key, auto_key):
        if "f0_static" in hparams.keys():
            f0_static = json.loads(hparams['f0_static'])
            wav, mel = self.vocoder.wav2spec(wav_path)
            input_f0 = get_pitch_parselmouth(wav, mel, hparams)[0]
            pitch_time_temp = static_f0_time(input_f0)
            eval_dict = {}
            for trans_key in range(-12, 12):
                eval_dict[trans_key] = compare_pitch(f0_static, pitch_time_temp, trans_key=trans_key)
            sort_key = sorted(eval_dict, key=eval_dict.get, reverse=True)[:5]
            print(f"Recommended key shifts: {sort_key}")
            if auto_key:
                print(f"Auto key-shift is enabled; your input key is overridden with {sort_key[0]} (controlled by the auto_key parameter)")
                return sort_key[0]
        else:
            print("The config lacks f0_static, so auto key-shift cannot be used; it can be added via infer_tools/data_static")
        return key


def getitem(item):
    max_frames = hparams['max_frames']
    spec = torch.Tensor(item['mel'])[:max_frames]
    mel2ph = torch.LongTensor(item['mel2ph'])[:max_frames] if 'mel2ph' in item else None
    f0, uv = norm_interp_f0(item["f0"][:max_frames], hparams)
    hubert = torch.Tensor(item['hubert'][:hparams['max_input_tokens']])
    pitch = torch.LongTensor(item.get("pitch"))[:max_frames]
    sample = {
        "id": item['id'],
        "spk_id": item['spk_id'],
        "item_name": item['item_name'],
        "hubert": hubert,
        "mel": spec,
        "pitch": pitch,
        "f0": f0,
        "uv": uv,
        "mel2ph": mel2ph,
        "mel_nonpadding": spec.abs().sum(-1) > 0,
    }
    if hparams['use_energy_embed']:
        sample['energy'] = item['energy']
    return sample
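A note on the transposition step in infer() above (added for clarity): batch['f0'] is kept on a log2 scale, so adding key / 12 shifts the pitch by key semitones, i.e. f0_new = 2 ** (key / 12) * f0; key = 12 doubles the frequency, one octave up. Frames whose shifted value exceeds log2(hparams['f0_max']) are reset to 0.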
infer_tools/slicer.py
ADDED
@@ -0,0 +1,142 @@
import librosa
import torch
import torchaudio


class Slicer:
    def __init__(self,
                 sr: int,
                 threshold: float = -40.,
                 min_length: int = 5000,
                 min_interval: int = 300,
                 hop_size: int = 20,
                 max_sil_kept: int = 5000):
        if not min_length >= min_interval >= hop_size:
            raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
        if not max_sil_kept >= hop_size:
            raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
        min_interval = sr * min_interval / 1000
        self.threshold = 10 ** (threshold / 20.)
        self.hop_size = round(sr * hop_size / 1000)
        self.win_size = min(round(min_interval), 4 * self.hop_size)
        self.min_length = round(sr * min_length / 1000 / self.hop_size)
        self.min_interval = round(min_interval / self.hop_size)
        self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)

    def _apply_slice(self, waveform, begin, end):
        if len(waveform.shape) > 1:
            return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
        else:
            return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]

    # @timeit
    def slice(self, waveform):
        if len(waveform.shape) > 1:
            samples = librosa.to_mono(waveform)
        else:
            samples = waveform
        if samples.shape[0] <= self.min_length:
            return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
        rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
        sil_tags = []
        silence_start = None
        clip_start = 0
        for i, rms in enumerate(rms_list):
            # Keep looping while frame is silent.
            if rms < self.threshold:
                # Record start of silent frames.
                if silence_start is None:
                    silence_start = i
                continue
            # Keep looping while frame is not silent and silence start has not been recorded.
            if silence_start is None:
                continue
            # Clear recorded silence start if interval is not enough or clip is too short
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
            need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue
            # Need slicing. Record the range of silent frames to be removed.
            if i - silence_start <= self.max_sil_kept:
                pos = rms_list[silence_start: i + 1].argmin() + silence_start
                if silence_start == 0:
                    sil_tags.append((0, pos))
                else:
                    sil_tags.append((pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
                pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
                pos += i - self.max_sil_kept
                pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r
                else:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
                pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                else:
                    sil_tags.append((pos_l, pos_r))
                clip_start = pos_r
            silence_start = None
        # Deal with trailing silence.
        total_frames = rms_list.shape[0]
        if silence_start is not None and total_frames - silence_start >= self.min_interval:
            silence_end = min(total_frames, silence_start + self.max_sil_kept)
            pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
            sil_tags.append((pos, total_frames + 1))
        # Apply and return slices.
        if len(sil_tags) == 0:
            return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
        else:
            chunks = []
            # The first silence does not start at the very beginning; add the leading voiced segment
            if sil_tags[0][0]:
                chunks.append(
                    {"slice": False, "split_time": f"0,{min(waveform.shape[0], sil_tags[0][0] * self.hop_size)}"})
            for i in range(0, len(sil_tags)):
                # Mark voiced segments (skip the first one)
                if i:
                    chunks.append({"slice": False,
                                   "split_time": f"{sil_tags[i - 1][1] * self.hop_size},{min(waveform.shape[0], sil_tags[i][0] * self.hop_size)}"})
                # Mark all silent segments
                chunks.append({"slice": True,
                               "split_time": f"{sil_tags[i][0] * self.hop_size},{min(waveform.shape[0], sil_tags[i][1] * self.hop_size)}"})
            # The last silence does not reach the end; add the trailing segment
            if sil_tags[-1][1] * self.hop_size < len(waveform):
                chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1] * self.hop_size},{len(waveform)}"})
            chunk_dict = {}
            for i in range(len(chunks)):
                chunk_dict[str(i)] = chunks[i]
            return chunk_dict


def cut(audio_path, db_thresh=-30, min_len=5000):
    audio, sr = librosa.load(audio_path, sr=None)
    slicer = Slicer(
        sr=sr,
        threshold=db_thresh,
        min_length=min_len
    )
    chunks = slicer.slice(audio)
    return chunks


def chunks2audio(audio_path, chunks):
    chunks = dict(chunks)
    audio, sr = torchaudio.load(audio_path)
    if len(audio.shape) == 2 and audio.shape[1] >= 2:
        audio = torch.mean(audio, dim=0).unsqueeze(0)
    audio = audio.cpu().numpy()[0]
    result = []
    for k, v in chunks.items():
        tag = v["split_time"].split(",")
        if tag[0] != tag[1]:
            result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
    return result, sr
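A minimal usage sketch for the cut/chunks2audio helpers above; the file name input.wav is illustrative, not part of the original code:

# Slice a recording on silence, then walk the resulting segments.
from infer_tools.slicer import cut, chunks2audio

chunks = cut("input.wav", db_thresh=-30, min_len=5000)  # {"0": {"slice": ..., "split_time": "start,end"}, ...}
segments, sr = chunks2audio("input.wav", chunks)        # [(is_silence, samples), ...]
for is_silence, samples in segments:
    print("silence" if is_silence else "voiced", round(len(samples) / sr, 2), "s")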
infer_tools/trans_key.py
ADDED
@@ -0,0 +1,67 @@
import os

head_list = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]


def trans_f0_seq(feature_pit, transform):
    feature_pit = feature_pit * 2 ** (transform / 12)
    return round(feature_pit, 1)


def move_key(raw_data, mv_key):
    head = raw_data[:-1]
    body = int(raw_data[-1])
    new_head_index = head_list.index(head) + mv_key
    while new_head_index < 0:
        body -= 1
        new_head_index += 12
    while new_head_index > 11:
        body += 1
        new_head_index -= 12
    result_data = head_list[new_head_index] + str(body)
    return result_data


def trans_key(raw_data, key):
    for i in raw_data:
        note_seq_list = i["note_seq"].split(" ")
        new_note_seq_list = []
        for note_seq in note_seq_list:
            if note_seq != "rest":
                new_note_seq = move_key(note_seq, key)
                new_note_seq_list.append(new_note_seq)
            else:
                new_note_seq_list.append(note_seq)
        i["note_seq"] = " ".join(new_note_seq_list)

        f0_seq_list = i["f0_seq"].split(" ")
        f0_seq_list = [float(x) for x in f0_seq_list]
        new_f0_seq_list = []
        for f0_seq in f0_seq_list:
            new_f0_seq = trans_f0_seq(f0_seq, key)
            new_f0_seq_list.append(str(new_f0_seq))
        i["f0_seq"] = " ".join(new_f0_seq_list)
    return raw_data


def trans_opencpop(raw_txt, res_txt, key):
    if os.path.exists(raw_txt):
        f_w = open(res_txt, "w", encoding='utf-8')
        with open(raw_txt, "r", encoding='utf-8') as f:
            raw_data = f.readlines()
            for raw in raw_data:
                raw_list = raw.split("|")
                new_note_seq_list = []
                for note_seq in raw_list[3].split(" "):
                    if note_seq != "rest":
                        note_seq = note_seq.split("/")[0] if "/" in note_seq else note_seq
                        new_note_seq = move_key(note_seq, key)
                        new_note_seq_list.append(new_note_seq)
                    else:
                        new_note_seq_list.append(note_seq)
                raw_list[3] = " ".join(new_note_seq_list)
                f_w.write("|".join(raw_list))
        f_w.close()
        print("opencpop annotation file converted")
    else:
        print("opencpop annotation file not found; please check the path")
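A worked example of the helpers above; the values are computed from the code, not taken from the original document:

# move_key shifts a note name by mv_key semitones, carrying the octave digit
# (note: only a single trailing octave digit is parsed via raw_data[-1]).
move_key("A4", 4)       # index("A") = 9, 9 + 4 = 13 > 11 -> octave +1, index 1 -> "C#5"
# trans_f0_seq scales a fundamental frequency by 2 ** (key / 12):
trans_f0_seq(440.0, 4)  # 440 * 2 ** (4 / 12) -> 554.4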
modules/__pycache__/encoder.cpython-310.pyc
ADDED
Binary file (7.19 kB).
modules/__pycache__/encoder.cpython-38.pyc
ADDED
Binary file (7.17 kB).
modules/commons/__pycache__/common_layers.cpython-310.pyc
ADDED
Binary file (18.6 kB).
modules/commons/__pycache__/common_layers.cpython-38.pyc
ADDED
Binary file (18.9 kB).
modules/commons/__pycache__/ssim.cpython-310.pyc
ADDED
Binary file (2.67 kB).
modules/commons/__pycache__/ssim.cpython-38.pyc
ADDED
Binary file (2.68 kB).
modules/commons/common_layers.py
ADDED
@@ -0,0 +1,675 @@
import math

import torch
import torch.nn.functional as F
import torch.onnx.operators
from torch import nn
from torch.nn import Parameter

import utils


class Reshape(nn.Module):
    def __init__(self, *args):
        super(Reshape, self).__init__()
        self.shape = args

    def forward(self, x):
        return x.view(self.shape)


class Permute(nn.Module):
    def __init__(self, *args):
        super(Permute, self).__init__()
        self.args = args

    def forward(self, x):
        return x.permute(self.args)


class LinearNorm(torch.nn.Module):
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(LinearNorm, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        torch.nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)


class ConvNorm(torch.nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 padding=None, dilation=1, bias=True, w_init_gain='linear'):
        super(ConvNorm, self).__init__()
        if padding is None:
            assert (kernel_size % 2 == 1)
            padding = int(dilation * (kernel_size - 1) / 2)

        self.conv = torch.nn.Conv1d(in_channels, out_channels,
                                    kernel_size=kernel_size, stride=stride,
                                    padding=padding, dilation=dilation,
                                    bias=bias)

        torch.nn.init.xavier_uniform_(
            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, signal):
        conv_signal = self.conv(signal)
        return conv_signal


def Embedding(num_embeddings, embedding_dim, padding_idx=None):
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
    if padding_idx is not None:
        nn.init.constant_(m.weight[padding_idx], 0)
    return m


def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):
    if not export and torch.cuda.is_available():
        try:
            from apex.normalization import FusedLayerNorm
            return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
        except ImportError:
            pass
    return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)


def Linear(in_features, out_features, bias=True):
    m = nn.Linear(in_features, out_features, bias)
    nn.init.xavier_uniform_(m.weight)
    if bias:
        nn.init.constant_(m.bias, 0.)
    return m


class SinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length.

    Padding symbols are ignored.
    """

    def __init__(self, embedding_dim, padding_idx, init_size=1024):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.weights = SinusoidalPositionalEmbedding.get_embedding(
            init_size,
            embedding_dim,
            padding_idx,
        )
        self.register_buffer('_float_tensor', torch.FloatTensor(1))

    @staticmethod
    def get_embedding(num_embeddings, embedding_dim, padding_idx=None):
        """Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly
        from the description in Section 3.5 of "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0
        return emb

    def forward(self, input, incremental_state=None, timestep=None, positions=None, **kwargs):
        """Input is expected to be of size [bsz x seqlen]."""
        bsz, seq_len = input.shape[:2]
        max_pos = self.padding_idx + 1 + seq_len
        if self.weights is None or max_pos > self.weights.size(0):
            # recompute/expand embeddings if needed
            self.weights = SinusoidalPositionalEmbedding.get_embedding(
                max_pos,
                self.embedding_dim,
                self.padding_idx,
            )
        self.weights = self.weights.to(self._float_tensor)

        if incremental_state is not None:
            # positions is the same for every token when decoding a single step
            pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len
            return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1)

        positions = utils.make_positions(input, self.padding_idx) if positions is None else positions
        return self.weights.index_select(0, positions.view(-1)).view(bsz, seq_len, -1).detach()

    def max_positions(self):
        """Maximum number of supported positions."""
        return int(1e5)  # an arbitrary large number
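# Clarifying note (added; not part of the original file): get_embedding() above
# stores sin(pos * w_i) in columns [0, half_dim) and cos(pos * w_i) in columns
# [half_dim, 2 * half_dim), with w_i = 10000 ** (-i / (half_dim - 1)) -- the
# tensor2tensor block layout rather than the interleaved layout described in
# "Attention Is All You Need".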
class ConvTBC(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, padding=0):
        super(ConvTBC, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.padding = padding

        self.weight = torch.nn.Parameter(torch.Tensor(
            self.kernel_size, in_channels, out_channels))
        self.bias = torch.nn.Parameter(torch.Tensor(out_channels))

    def forward(self, input):
        return torch.conv_tbc(input.contiguous(), self.weight, self.bias, self.padding)


class MultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias=True,
                 add_bias_kv=False, add_zero_attn=False, self_attention=False,
                 encoder_decoder_attention=False):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim ** -0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, 'Self-attention requires query, key and ' \
                                                             'value to be of the same size'

        if self.qkv_same_dim:
            self.in_proj_weight = Parameter(torch.Tensor(3 * embed_dim, embed_dim))
        else:
            self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
            self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
            self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))

        if bias:
            self.in_proj_bias = Parameter(torch.Tensor(3 * embed_dim))
        else:
            self.register_parameter('in_proj_bias', None)

        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.reset_parameters()

        self.enable_torch_version = False
        if hasattr(F, "multi_head_attention_forward"):
            self.enable_torch_version = True
        else:
            self.enable_torch_version = False
        self.last_attn_probs = None

    def reset_parameters(self):
        if self.qkv_same_dim:
            nn.init.xavier_uniform_(self.in_proj_weight)
        else:
            nn.init.xavier_uniform_(self.k_proj_weight)
            nn.init.xavier_uniform_(self.v_proj_weight)
            nn.init.xavier_uniform_(self.q_proj_weight)

        nn.init.xavier_uniform_(self.out_proj.weight)
        if self.in_proj_bias is not None:
            nn.init.constant_(self.in_proj_bias, 0.)
            nn.init.constant_(self.out_proj.bias, 0.)
        if self.bias_k is not None:
            nn.init.xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            nn.init.xavier_normal_(self.bias_v)

    def forward(
            self,
            query, key, value,
            key_padding_mask=None,
            incremental_state=None,
            need_weights=True,
            static_kv=False,
            attn_mask=None,
            before_softmax=False,
            need_head_weights=False,
            enc_dec_attn_constraint_mask=None,
            reset_attn_weight=None
    ):
        """Input shape: Time x Batch x Channel

        Args:
            key_padding_mask (ByteTensor, optional): mask to exclude
                keys that are pads, of shape `(batch, src_len)`, where
                padding elements are indicated by 1s.
            need_weights (bool, optional): return the attention weights,
                averaged over heads (default: False).
            attn_mask (ByteTensor, optional): typically used to
                implement causal attention, where the mask prevents the
                attention from looking forward in time (default: None).
            before_softmax (bool, optional): return the raw attention
                weights and values before the attention softmax.
            need_head_weights (bool, optional): return the attention
                weights for each head. Implies *need_weights*. Default:
                return the average attention weights over all heads.
        """
        if need_head_weights:
            need_weights = True

        tgt_len, bsz, embed_dim = query.size()
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]

        if self.enable_torch_version and incremental_state is None and not static_kv and reset_attn_weight is None:
            if self.qkv_same_dim:
                return F.multi_head_attention_forward(query, key, value,
                                                      self.embed_dim, self.num_heads,
                                                      self.in_proj_weight,
                                                      self.in_proj_bias, self.bias_k, self.bias_v,
                                                      self.add_zero_attn, self.dropout,
                                                      self.out_proj.weight, self.out_proj.bias,
                                                      self.training, key_padding_mask, need_weights,
                                                      attn_mask)
            else:
                return F.multi_head_attention_forward(query, key, value,
                                                      self.embed_dim, self.num_heads,
                                                      torch.empty([0]),
                                                      self.in_proj_bias, self.bias_k, self.bias_v,
                                                      self.add_zero_attn, self.dropout,
                                                      self.out_proj.weight, self.out_proj.bias,
                                                      self.training, key_padding_mask, need_weights,
                                                      attn_mask, use_separate_proj_weight=True,
                                                      q_proj_weight=self.q_proj_weight,
                                                      k_proj_weight=self.k_proj_weight,
                                                      v_proj_weight=self.v_proj_weight)

        if incremental_state is not None:
            print('Not implemented error.')
            exit()
        else:
            saved_state = None

        if self.self_attention:
            # self-attention
            q, k, v = self.in_proj_qkv(query)
        elif self.encoder_decoder_attention:
            # encoder-decoder attention
            q = self.in_proj_q(query)
            if key is None:
                assert value is None
                k = v = None
            else:
                k = self.in_proj_k(key)
                v = self.in_proj_v(key)

        else:
            q = self.in_proj_q(query)
            k = self.in_proj_k(key)
            v = self.in_proj_v(value)
        q *= self.scaling

        if self.bias_k is not None:
            assert self.bias_v is not None
            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
            if key_padding_mask is not None:
                key_padding_mask = torch.cat(
                    [key_padding_mask, key_padding_mask.new_zeros(key_padding_mask.size(0), 1)], dim=1)

        q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
        if k is not None:
            k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
        if v is not None:
            v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)

        if saved_state is not None:
            print('Not implemented error.')
            exit()

        src_len = k.size(1)

        # This is part of a workaround to get around fork/join parallelism
        # not supporting Optional types.
        if key_padding_mask is not None and key_padding_mask.shape == torch.Size([]):
            key_padding_mask = None

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        if self.add_zero_attn:
            src_len += 1
            k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
            v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
            if attn_mask is not None:
                attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
            if key_padding_mask is not None:
                key_padding_mask = torch.cat(
                    [key_padding_mask, torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask)], dim=1)

        attn_weights = torch.bmm(q, k.transpose(1, 2))
        attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)

        assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

        if attn_mask is not None:
            if len(attn_mask.shape) == 2:
                attn_mask = attn_mask.unsqueeze(0)
            elif len(attn_mask.shape) == 3:
                attn_mask = attn_mask[:, None].repeat([1, self.num_heads, 1, 1]).reshape(
                    bsz * self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights + attn_mask

        if enc_dec_attn_constraint_mask is not None:  # bs x head x L_kv
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.masked_fill(
                enc_dec_attn_constraint_mask.unsqueeze(2).bool(),
                -1e9,
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if key_padding_mask is not None:
            # don't attend to padding symbols
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                -1e9,
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_logits = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)

        if before_softmax:
            return attn_weights, v

        attn_weights_float = utils.softmax(attn_weights, dim=-1)
        attn_weights = attn_weights_float.type_as(attn_weights)
        attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training)

        if reset_attn_weight is not None:
            if reset_attn_weight:
                self.last_attn_probs = attn_probs.detach()
            else:
                assert self.last_attn_probs is not None
                attn_probs = self.last_attn_probs
        attn = torch.bmm(attn_probs, v)
        assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
        attn = self.out_proj(attn)

        if need_weights:
            attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0)
            if not need_head_weights:
                # average attention weights over heads
                attn_weights = attn_weights.mean(dim=0)
        else:
            attn_weights = None

        return attn, (attn_weights, attn_logits)

    def in_proj_qkv(self, query):
        return self._in_proj(query).chunk(3, dim=-1)

    def in_proj_q(self, query):
        if self.qkv_same_dim:
            return self._in_proj(query, end=self.embed_dim)
        else:
            bias = self.in_proj_bias
            if bias is not None:
                bias = bias[:self.embed_dim]
            return F.linear(query, self.q_proj_weight, bias)

    def in_proj_k(self, key):
        if self.qkv_same_dim:
            return self._in_proj(key, start=self.embed_dim, end=2 * self.embed_dim)
        else:
            weight = self.k_proj_weight
            bias = self.in_proj_bias
            if bias is not None:
                bias = bias[self.embed_dim:2 * self.embed_dim]
            return F.linear(key, weight, bias)

    def in_proj_v(self, value):
        if self.qkv_same_dim:
            return self._in_proj(value, start=2 * self.embed_dim)
        else:
            weight = self.v_proj_weight
            bias = self.in_proj_bias
            if bias is not None:
                bias = bias[2 * self.embed_dim:]
            return F.linear(value, weight, bias)

    def _in_proj(self, input, start=0, end=None):
        weight = self.in_proj_weight
        bias = self.in_proj_bias
        weight = weight[start:end, :]
        if bias is not None:
            bias = bias[start:end]
        return F.linear(input, weight, bias)

    def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz):
        return attn_weights


class Swish(torch.autograd.Function):
    @staticmethod
    def forward(ctx, i):
        result = i * torch.sigmoid(i)
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        i = ctx.saved_variables[0]
        sigmoid_i = torch.sigmoid(i)
        return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))


class CustomSwish(nn.Module):
    def forward(self, input_tensor):
        return Swish.apply(input_tensor)


class Mish(nn.Module):
    def forward(self, x):
        return x * torch.tanh(F.softplus(x))


class TransformerFFNLayer(nn.Module):
    def __init__(self, hidden_size, filter_size, padding="SAME", kernel_size=1, dropout=0., act='gelu'):
        super().__init__()
        self.kernel_size = kernel_size
        self.dropout = dropout
        self.act = act
        if padding == 'SAME':
            self.ffn_1 = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2)
        elif padding == 'LEFT':
            self.ffn_1 = nn.Sequential(
                nn.ConstantPad1d((kernel_size - 1, 0), 0.0),
                nn.Conv1d(hidden_size, filter_size, kernel_size)
            )
        self.ffn_2 = Linear(filter_size, hidden_size)
        if self.act == 'swish':
            self.swish_fn = CustomSwish()

    def forward(self, x, incremental_state=None):
        # x: T x B x C
        if incremental_state is not None:
            assert incremental_state is None, 'Nar-generation does not allow this.'
            exit(1)

        x = self.ffn_1(x.permute(1, 2, 0)).permute(2, 0, 1)
        x = x * self.kernel_size ** -0.5

        if incremental_state is not None:
            x = x[-1:]
        if self.act == 'gelu':
            x = F.gelu(x)
        if self.act == 'relu':
            x = F.relu(x)
        if self.act == 'swish':
            x = self.swish_fn(x)
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.ffn_2(x)
        return x


class BatchNorm1dTBC(nn.Module):
    def __init__(self, c):
        super(BatchNorm1dTBC, self).__init__()
        self.bn = nn.BatchNorm1d(c)

    def forward(self, x):
        """

        :param x: [T, B, C]
        :return: [T, B, C]
        """
        x = x.permute(1, 2, 0)  # [B, C, T]
        x = self.bn(x)  # [B, C, T]
        x = x.permute(2, 0, 1)  # [T, B, C]
        return x


class EncSALayer(nn.Module):
    def __init__(self, c, num_heads, dropout, attention_dropout=0.1,
                 relu_dropout=0.1, kernel_size=9, padding='SAME', norm='ln', act='gelu'):
        super().__init__()
        self.c = c
        self.dropout = dropout
        self.num_heads = num_heads
        if num_heads > 0:
            if norm == 'ln':
                self.layer_norm1 = LayerNorm(c)
            elif norm == 'bn':
                self.layer_norm1 = BatchNorm1dTBC(c)
            self.self_attn = MultiheadAttention(
                self.c, num_heads, self_attention=True, dropout=attention_dropout, bias=False,
            )
        if norm == 'ln':
            self.layer_norm2 = LayerNorm(c)
        elif norm == 'bn':
            self.layer_norm2 = BatchNorm1dTBC(c)
        self.ffn = TransformerFFNLayer(
            c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, padding=padding, act=act)

    def forward(self, x, encoder_padding_mask=None, **kwargs):
        layer_norm_training = kwargs.get('layer_norm_training', None)
        if layer_norm_training is not None:
            self.layer_norm1.training = layer_norm_training
            self.layer_norm2.training = layer_norm_training
        if self.num_heads > 0:
            residual = x
            x = self.layer_norm1(x)
            x, _ = self.self_attn(
                query=x,
                key=x,
                value=x,
                key_padding_mask=encoder_padding_mask
            )
            x = F.dropout(x, self.dropout, training=self.training)
            x = residual + x
            x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None]

        residual = x
        x = self.layer_norm2(x)
        x = self.ffn(x)
        x = F.dropout(x, self.dropout, training=self.training)
        x = residual + x
        x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None]
        return x


class DecSALayer(nn.Module):
    def __init__(self, c, num_heads, dropout, attention_dropout=0.1, relu_dropout=0.1, kernel_size=9, act='gelu'):
        super().__init__()
        self.c = c
        self.dropout = dropout
        self.layer_norm1 = LayerNorm(c)
        self.self_attn = MultiheadAttention(
            c, num_heads, self_attention=True, dropout=attention_dropout, bias=False
        )
        self.layer_norm2 = LayerNorm(c)
        self.encoder_attn = MultiheadAttention(
            c, num_heads, encoder_decoder_attention=True, dropout=attention_dropout, bias=False,
        )
        self.layer_norm3 = LayerNorm(c)
        self.ffn = TransformerFFNLayer(
            c, 4 * c, padding='LEFT', kernel_size=kernel_size, dropout=relu_dropout, act=act)

    def forward(
            self,
            x,
            encoder_out=None,
            encoder_padding_mask=None,
            incremental_state=None,
            self_attn_mask=None,
            self_attn_padding_mask=None,
            attn_out=None,
            reset_attn_weight=None,
            **kwargs,
    ):
        layer_norm_training = kwargs.get('layer_norm_training', None)
        if layer_norm_training is not None:
            self.layer_norm1.training = layer_norm_training
            self.layer_norm2.training = layer_norm_training
            self.layer_norm3.training = layer_norm_training
        residual = x
        x = self.layer_norm1(x)
        x, _ = self.self_attn(
            query=x,
            key=x,
            value=x,
            key_padding_mask=self_attn_padding_mask,
            incremental_state=incremental_state,
            attn_mask=self_attn_mask
        )
        x = F.dropout(x, self.dropout, training=self.training)
        x = residual + x

        residual = x
        x = self.layer_norm2(x)
        if encoder_out is not None:
            x, attn = self.encoder_attn(
                query=x,
                key=encoder_out,
                value=encoder_out,
                key_padding_mask=encoder_padding_mask,
                incremental_state=incremental_state,
                static_kv=True,
                enc_dec_attn_constraint_mask=None,
                # utils.get_incremental_state(self, incremental_state, 'enc_dec_attn_constraint_mask'),
                reset_attn_weight=reset_attn_weight
            )
            attn_logits = attn[1]
        else:
            assert attn_out is not None
            x = self.encoder_attn.in_proj_v(attn_out.transpose(0, 1))
            attn_logits = None
        x = F.dropout(x, self.dropout, training=self.training)
        x = residual + x

        residual = x
        x = self.layer_norm3(x)
        x = self.ffn(x, incremental_state=incremental_state)
        x = F.dropout(x, self.dropout, training=self.training)
        x = residual + x
        # if len(attn_logits.size()) > 3:
        #     indices = attn_logits.softmax(-1).max(-1).values.sum(-1).argmax(-1)
        #     attn_logits = attn_logits.gather(1,
        #                                      indices[:, None, None, None].repeat(1, 1, attn_logits.size(-2), attn_logits.size(-1))).squeeze(1)
        return x, attn_logits
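As a quick orientation to the layers above, a hedged shape-convention sketch for EncSALayer; it assumes the file is importable as modules.commons.common_layers with the project's utils package on the path:

import torch
from modules.commons.common_layers import EncSALayer

layer = EncSALayer(c=256, num_heads=2, dropout=0.1)  # pre-norm self-attention + conv FFN block
x = torch.randn(100, 4, 256)                         # [T, B, C], time-first like fairseq
pad_mask = torch.zeros(4, 100, dtype=torch.bool)     # [B, T]; True marks padded frames
y = layer(x, encoder_padding_mask=pad_mask)          # -> [T, B, C] with padded frames zeroed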
modules/commons/ssim.py
ADDED
@@ -0,0 +1,84 @@
"""
Adapted from https://github.com/Po-Hsun-Su/pytorch-ssim
"""

from math import exp

import torch
import torch.nn.functional as F
from torch.autograd import Variable


def gaussian(window_size, sigma):
    gauss = torch.Tensor([exp(-(x - window_size // 2) ** 2 / float(2 * sigma ** 2)) for x in range(window_size)])
    return gauss / gauss.sum()


def create_window(window_size, channel):
    _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
    _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
    window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous())
    return window


def _ssim(img1, img2, window, window_size, channel, size_average=True):
    mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
    mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
    sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
    sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2

    C1 = 0.01 ** 2
    C2 = 0.03 ** 2

    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))

    if size_average:
        return ssim_map.mean()
    else:
        return ssim_map.mean(1)


class SSIM(torch.nn.Module):
    def __init__(self, window_size=11, size_average=True):
        super(SSIM, self).__init__()
        self.window_size = window_size
        self.size_average = size_average
        self.channel = 1
        self.window = create_window(window_size, self.channel)

    def forward(self, img1, img2):
        (_, channel, _, _) = img1.size()

        if channel == self.channel and self.window.data.type() == img1.data.type():
            window = self.window
        else:
            window = create_window(self.window_size, channel)

            if img1.is_cuda:
                window = window.cuda(img1.get_device())
            window = window.type_as(img1)

            self.window = window
            self.channel = channel

        return _ssim(img1, img2, window, self.window_size, channel, self.size_average)


window = None


def ssim(img1, img2, window_size=11, size_average=True):
    (_, channel, _, _) = img1.size()
    global window
    if window is None:
        window = create_window(window_size, channel)
        if img1.is_cuda:
            window = window.cuda(img1.get_device())
        window = window.type_as(img1)
    return _ssim(img1, img2, window, window_size, channel, size_average)
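A minimal usage sketch for the functional ssim() above; shapes follow the [batch, channel, height, width] convention of the conv2d calls:

import torch
from modules.commons.ssim import ssim

a = torch.rand(1, 1, 64, 64)
print(float(ssim(a, a)))                   # identical images -> 1.0
print(float(ssim(a, torch.rand_like(a))))  # uncorrelated noise scores far below 1.0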
modules/diff/__pycache__/diffusion.cpython-310.pyc
ADDED
Binary file (11 kB).
modules/diff/__pycache__/diffusion.cpython-38.pyc
ADDED
Binary file (11 kB).
modules/diff/__pycache__/net.cpython-310.pyc
ADDED
Binary file (4.57 kB).
modules/diff/__pycache__/net.cpython-38.pyc
ADDED
Binary file (4.61 kB).
modules/diff/diffusion.py
ADDED
@@ -0,0 +1,312 @@
from collections import deque
from functools import partial
from inspect import isfunction

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from tqdm import tqdm

from modules.encoder import SvcEncoder
from training.train_pipeline import Batch2Loss
from utils.hparams import hparams


def exists(x):
    return x is not None


def default(val, d):
    if exists(val):
        return val
    return d() if isfunction(d) else d


# gaussian diffusion trainer class

def extract(a, t, x_shape):
    b, *_ = t.shape
    out = a.gather(-1, t)
    return out.reshape(b, *((1,) * (len(x_shape) - 1)))


def noise_like(shape, device, repeat=False):
    repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
    noise = lambda: torch.randn(shape, device=device)
    return repeat_noise() if repeat else noise()


def linear_beta_schedule(timesteps, max_beta=hparams.get('max_beta', 0.01)):
    """
    linear schedule
    """
    betas = np.linspace(1e-4, max_beta, timesteps)
    return betas


def cosine_beta_schedule(timesteps, s=0.008):
    """
    cosine schedule
    as proposed in https://openreview.net/forum?id=-NEXDKk8gZ
    """
    steps = timesteps + 1
    x = np.linspace(0, steps, steps)
    alphas_cumprod = np.cos(((x / steps) + s) / (1 + s) * np.pi * 0.5) ** 2
    alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
    betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
    return np.clip(betas, a_min=0, a_max=0.999)


beta_schedule = {
    "cosine": cosine_beta_schedule,
    "linear": linear_beta_schedule,
}
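# Reference note (added; not part of the original file): cosine_beta_schedule above
# follows the improved-DDPM schedule from the OpenReview link in its docstring:
#   alpha_bar(t) = f(t) / f(0),  f(t) = cos(((t / T + s) / (1 + s)) * pi / 2) ** 2
#   beta_t = 1 - alpha_bar(t) / alpha_bar(t - 1), clipped to at most 0.999.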
class GaussianDiffusion(nn.Module):
    def __init__(self, phone_encoder, out_dims, denoise_fn,
                 timesteps=1000, K_step=1000, loss_type=hparams.get('diff_loss_type', 'l1'), betas=None, spec_min=None,
                 spec_max=None):
        super().__init__()
        self.denoise_fn = denoise_fn
        self.fs2 = SvcEncoder(phone_encoder, out_dims)
        self.mel_bins = out_dims

        if exists(betas):
            betas = betas.detach().cpu().numpy() if isinstance(betas, torch.Tensor) else betas
        else:
            if 'schedule_type' in hparams.keys():
                betas = beta_schedule[hparams['schedule_type']](timesteps)
            else:
                betas = cosine_beta_schedule(timesteps)

        alphas = 1. - betas
        alphas_cumprod = np.cumprod(alphas, axis=0)
        alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])

        timesteps, = betas.shape
        self.num_timesteps = int(timesteps)
        self.K_step = K_step
        self.loss_type = loss_type

        self.noise_list = deque(maxlen=4)

        to_torch = partial(torch.tensor, dtype=torch.float32)

        self.register_buffer('betas', to_torch(betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
        self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))

        # calculations for posterior q(x_{t-1} | x_t, x_0)
        posterior_variance = betas * (1. - alphas_cumprod_prev) / (1. - alphas_cumprod)
        # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
        self.register_buffer('posterior_variance', to_torch(posterior_variance))
        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
        self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
        self.register_buffer('posterior_mean_coef1', to_torch(
            betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
        self.register_buffer('posterior_mean_coef2', to_torch(
            (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))

        self.register_buffer('spec_min', torch.FloatTensor(spec_min)[None, None, :hparams['keep_bins']])
        self.register_buffer('spec_max', torch.FloatTensor(spec_max)[None, None, :hparams['keep_bins']])

    def q_mean_variance(self, x_start, t):
        mean = extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
        variance = extract(1. - self.alphas_cumprod, t, x_start.shape)
        log_variance = extract(self.log_one_minus_alphas_cumprod, t, x_start.shape)
        return mean, variance, log_variance

    def predict_start_from_noise(self, x_t, t, noise):
        return (
                extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
                extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
        )

    def q_posterior(self, x_start, x_t, t):
        posterior_mean = (
                extract(self.posterior_mean_coef1, t, x_t.shape) * x_start +
                extract(self.posterior_mean_coef2, t, x_t.shape) * x_t
        )
        posterior_variance = extract(self.posterior_variance, t, x_t.shape)
        posterior_log_variance_clipped = extract(self.posterior_log_variance_clipped, t, x_t.shape)
        return posterior_mean, posterior_variance, posterior_log_variance_clipped

    def p_mean_variance(self, x, t, cond, clip_denoised: bool):
        noise_pred = self.denoise_fn(x, t, cond=cond)
        x_recon = self.predict_start_from_noise(x, t=t, noise=noise_pred)

        if clip_denoised:
            x_recon.clamp_(-1., 1.)

        model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
        return model_mean, posterior_variance, posterior_log_variance

    @torch.no_grad()
    def p_sample(self, x, t, cond, clip_denoised=True, repeat_noise=False):
        b, *_, device = *x.shape, x.device
        model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, cond=cond, clip_denoised=clip_denoised)
        noise = noise_like(x.shape, device, repeat_noise)
        # no noise when t == 0
        nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
        return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise

    @torch.no_grad()
    def p_sample_plms(self, x, t, interval, cond, clip_denoised=True, repeat_noise=False):
        """
        Use the PLMS method from [Pseudo Numerical Methods for Diffusion Models on Manifolds](https://arxiv.org/abs/2202.09778).
        """

        def get_x_pred(x, noise_t, t):
            a_t = extract(self.alphas_cumprod, t, x.shape)
            a_prev = extract(self.alphas_cumprod, torch.max(t - interval, torch.zeros_like(t)), x.shape)
            a_t_sq, a_prev_sq = a_t.sqrt(), a_prev.sqrt()

            x_delta = (a_prev - a_t) * ((1 / (a_t_sq * (a_t_sq + a_prev_sq))) * x - 1 / (
                    a_t_sq * (((1 - a_prev) * a_t).sqrt() + ((1 - a_t) * a_prev).sqrt())) * noise_t)
            x_pred = x + x_delta

            return x_pred

        noise_list = self.noise_list
        noise_pred = self.denoise_fn(x, t, cond=cond)

        if len(noise_list) == 0:
            x_pred = get_x_pred(x, noise_pred, t)
            noise_pred_prev = self.denoise_fn(x_pred, max(t - interval, 0), cond=cond)
            noise_pred_prime = (noise_pred + noise_pred_prev) / 2
        elif len(noise_list) == 1:
            noise_pred_prime = (3 * noise_pred - noise_list[-1]) / 2
        elif len(noise_list) == 2:
            noise_pred_prime = (23 * noise_pred - 16 * noise_list[-1] + 5 * noise_list[-2]) / 12
        elif len(noise_list) >= 3:
            noise_pred_prime = (55 * noise_pred - 59 * noise_list[-1] + 37 * noise_list[-2] - 9 * noise_list[-3]) / 24

        x_prev = get_x_pred(x, noise_pred_prime, t)
        noise_list.append(noise_pred)

        return x_prev
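    # Clarifying note (added; not part of the original file): the weighted noise
    # averages in p_sample_plms are the 1st- to 4th-order linear multistep
    # (Adams-Bashforth) coefficients used by PNDM; with four cached predictions:
    #   eps' = (55 * eps_t - 59 * eps_{t-1} + 37 * eps_{t-2} - 9 * eps_{t-3}) / 24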
    def q_sample(self, x_start, t, noise=None):
        noise = default(noise, lambda: torch.randn_like(x_start))
        return (
                extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
                extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
        )

    def p_losses(self, x_start, t, cond, noise=None, nonpadding=None):
        noise = default(noise, lambda: torch.randn_like(x_start))

        x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
        x_recon = self.denoise_fn(x_noisy, t, cond)

        if self.loss_type == 'l1':
            if nonpadding is not None:
                loss = ((noise - x_recon).abs() * nonpadding.unsqueeze(1)).mean()
            else:
                # print('are you sure w/o nonpadding?')
                loss = (noise - x_recon).abs().mean()

        elif self.loss_type == 'l2':
            loss = F.mse_loss(noise, x_recon)
        else:
            raise NotImplementedError()

        return loss

    def forward(self, hubert, mel2ph=None, spk_embed=None,
                ref_mels=None, f0=None, uv=None, energy=None, infer=False, **kwargs):
        '''
        conditioning diffusion, use fastspeech2 encoder output as the condition
        '''
        ret = self.fs2(hubert, mel2ph, spk_embed, None, f0, uv, energy,
                       skip_decoder=True, infer=infer, **kwargs)
        cond = ret['decoder_inp'].transpose(1, 2)
        b, *_, device = *hubert.shape, hubert.device

        if not infer:
            Batch2Loss.module4(
                self.p_losses,
                self.norm_spec(ref_mels), cond, ret, self.K_step, b, device
            )
        else:
            if 'use_gt_mel' in kwargs.keys() and kwargs['use_gt_mel']:
                t = kwargs['add_noise_step']
                print('===>using ground truth mel as start, please make sure parameter "key==0" !')
                fs2_mels = ref_mels
                fs2_mels = self.norm_spec(fs2_mels)
                fs2_mels = fs2_mels.transpose(1, 2)[:, None, :, :]
                x = self.q_sample(x_start=fs2_mels, t=torch.tensor([t - 1], device=device).long())
            else:
                t = self.K_step
                shape = (cond.shape[0], 1, self.mel_bins, cond.shape[2])
                x = torch.randn(shape, device=device)
            if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1:
                self.noise_list = deque(maxlen=4)
                iteration_interval = hparams['pndm_speedup']
                for i in tqdm(reversed(range(0, t, iteration_interval)), desc='sample time step',
                              total=t // iteration_interval):
                    x = self.p_sample_plms(x, torch.full((b,), i, device=device, dtype=torch.long), iteration_interval,
                                           cond)
            else:
                for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t):
                    x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond)
            x = x[:, 0].transpose(1, 2)
            if mel2ph is not None:  # for singing
                ret['mel_out'] = self.denorm_spec(x) * ((mel2ph > 0).float()[:, :, None])
            else:
                ret['mel_out'] = self.denorm_spec(x)
        return ret

    def norm_spec(self, x):
        return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1

    def denorm_spec(self, x):
        return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min

    def out2mel(self, x):
        return x


class OfflineGaussianDiffusion(GaussianDiffusion):
    def forward(self, txt_tokens, mel2ph=None, spk_embed=None,
                ref_mels=None, f0=None, uv=None, energy=None, infer=False, **kwargs):
        b, *_, device = *txt_tokens.shape, txt_tokens.device

        ret = self.fs2(txt_tokens, mel2ph, spk_embed, ref_mels, f0, uv, energy,
|
285 |
+
skip_decoder=True, infer=True, **kwargs)
|
286 |
+
cond = ret['decoder_inp'].transpose(1, 2)
|
287 |
+
fs2_mels = ref_mels[1]
|
288 |
+
ref_mels = ref_mels[0]
|
289 |
+
|
290 |
+
if not infer:
|
291 |
+
t = torch.randint(0, self.K_step, (b,), device=device).long()
|
292 |
+
x = ref_mels
|
293 |
+
x = self.norm_spec(x)
|
294 |
+
x = x.transpose(1, 2)[:, None, :, :] # [B, 1, M, T]
|
295 |
+
ret['diff_loss'] = self.p_losses(x, t, cond)
|
296 |
+
else:
|
297 |
+
t = self.K_step
|
298 |
+
fs2_mels = self.norm_spec(fs2_mels)
|
299 |
+
fs2_mels = fs2_mels.transpose(1, 2)[:, None, :, :]
|
300 |
+
|
301 |
+
x = self.q_sample(x_start=fs2_mels, t=torch.tensor([t - 1], device=device).long())
|
302 |
+
|
303 |
+
if hparams.get('gaussian_start') is not None and hparams['gaussian_start']:
|
304 |
+
print('===> gaussion start.')
|
305 |
+
shape = (cond.shape[0], 1, self.mel_bins, cond.shape[2])
|
306 |
+
x = torch.randn(shape, device=device)
|
307 |
+
for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t):
|
308 |
+
x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond)
|
309 |
+
x = x[:, 0].transpose(1, 2)
|
310 |
+
ret['mel_out'] = self.denorm_spec(x)
|
311 |
+
|
312 |
+
return ret
|
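For reference, the closed-form forward process that q_sample implements can be checked in isolation. The snippet below is an illustrative sketch, not part of the repository; the linear beta schedule and tensor sizes are arbitrary stand-ins for the model's real hparams.

import torch

# Sketch: x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps,
# which is exactly what q_sample computes via its precomputed buffers.
betas = torch.linspace(1e-4, 0.06, 1000)           # stand-in schedule
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

x0 = torch.randn(1, 1, 80, 100)                    # a normalized mel, [B, 1, M, T]
t = 500
eps = torch.randn_like(x0)
x_t = alphas_cumprod[t].sqrt() * x0 + (1 - alphas_cumprod[t]).sqrt() * eps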
modules/diff/net.py
ADDED
@@ -0,0 +1,135 @@
import math
from math import sqrt

import torch
import torch.nn as nn
import torch.nn.functional as F

from modules.commons.common_layers import Mish
from utils.hparams import hparams

Linear = nn.Linear
ConvTranspose2d = nn.ConvTranspose2d


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self

    def override(self, attrs):
        if isinstance(attrs, dict):
            self.__dict__.update(**attrs)
        elif isinstance(attrs, (list, tuple, set)):
            for attr in attrs:
                self.override(attr)
        elif attrs is not None:
            raise NotImplementedError
        return self


class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = x[:, None] * emb[None, :]
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb


def Conv1d(*args, **kwargs):
    layer = nn.Conv1d(*args, **kwargs)
    nn.init.kaiming_normal_(layer.weight)
    return layer


@torch.jit.script
def silu(x):
    return x * torch.sigmoid(x)


class ResidualBlock(nn.Module):
    def __init__(self, encoder_hidden, residual_channels, dilation):
        super().__init__()
        self.dilated_conv = Conv1d(residual_channels, 2 * residual_channels, 3, padding=dilation, dilation=dilation)
        self.diffusion_projection = Linear(residual_channels, residual_channels)
        self.conditioner_projection = Conv1d(encoder_hidden, 2 * residual_channels, 1)
        self.output_projection = Conv1d(residual_channels, 2 * residual_channels, 1)

    def forward(self, x, conditioner, diffusion_step):
        diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
        conditioner = self.conditioner_projection(conditioner)
        y = x + diffusion_step

        y = self.dilated_conv(y) + conditioner

        gate, filter = torch.chunk(y, 2, dim=1)
        # Using torch.split instead of torch.chunk to avoid using onnx::Slice
        # gate, filter = torch.split(y, torch.div(y.shape[1], 2), dim=1)

        y = torch.sigmoid(gate) * torch.tanh(filter)

        y = self.output_projection(y)
        residual, skip = torch.chunk(y, 2, dim=1)
        # Using torch.split instead of torch.chunk to avoid using onnx::Slice
        # residual, skip = torch.split(y, torch.div(y.shape[1], 2), dim=1)

        return (x + residual) / sqrt(2.0), skip


class DiffNet(nn.Module):
    def __init__(self, in_dims=80):
        super().__init__()
        self.params = params = AttrDict(
            # Model params
            encoder_hidden=hparams['hidden_size'],
            residual_layers=hparams['residual_layers'],
            residual_channels=hparams['residual_channels'],
            dilation_cycle_length=hparams['dilation_cycle_length'],
        )
        self.input_projection = Conv1d(in_dims, params.residual_channels, 1)
        self.diffusion_embedding = SinusoidalPosEmb(params.residual_channels)
        dim = params.residual_channels
        self.mlp = nn.Sequential(
            nn.Linear(dim, dim * 4),
            Mish(),
            nn.Linear(dim * 4, dim)
        )
        self.residual_layers = nn.ModuleList([
            ResidualBlock(params.encoder_hidden, params.residual_channels, 2 ** (i % params.dilation_cycle_length))
            for i in range(params.residual_layers)
        ])
        self.skip_projection = Conv1d(params.residual_channels, params.residual_channels, 1)
        self.output_projection = Conv1d(params.residual_channels, in_dims, 1)
        nn.init.zeros_(self.output_projection.weight)

    def forward(self, spec, diffusion_step, cond):
        """
        :param spec: [B, 1, M, T]
        :param diffusion_step: [B, 1]
        :param cond: [B, M, T]
        :return:
        """
        x = spec[:, 0]
        x = self.input_projection(x)  # x [B, residual_channel, T]

        x = F.relu(x)
        diffusion_step = self.diffusion_embedding(diffusion_step)
        diffusion_step = self.mlp(diffusion_step)
        skip = []
        for layer_id, layer in enumerate(self.residual_layers):
            x, skip_connection = layer(x, cond, diffusion_step)
            skip.append(skip_connection)

        x = torch.sum(torch.stack(skip), dim=0) / sqrt(len(self.residual_layers))
        x = self.skip_projection(x)
        x = F.relu(x)
        x = self.output_projection(x)  # [B, 80, T]
        return x[:, None, :, :]
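A minimal shape check for the ResidualBlock above (illustrative only; the channel sizes are arbitrary and it assumes the repository modules are importable):

import torch
from modules.diff.net import ResidualBlock

block = ResidualBlock(encoder_hidden=256, residual_channels=384, dilation=1)
x = torch.randn(2, 384, 100)        # [B, residual_channels, T]
cond = torch.randn(2, 256, 100)     # [B, encoder_hidden, T]
step_emb = torch.randn(2, 384)      # diffusion-step embedding after the MLP
residual, skip = block(x, cond, step_emb)
print(residual.shape, skip.shape)   # both torch.Size([2, 384, 100])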
modules/encoder.py
ADDED
@@ -0,0 +1,208 @@
import torch

from modules.commons.common_layers import *
from modules.commons.common_layers import Embedding
from modules.commons.common_layers import SinusoidalPositionalEmbedding
from utils.hparams import hparams
from utils.pitch_utils import f0_to_coarse, denorm_f0


class LayerNorm(torch.nn.LayerNorm):
    """Layer normalization module.
    :param int nout: output dim size
    :param int dim: dimension to be normalized
    """

    def __init__(self, nout, dim=-1):
        """Construct a LayerNorm object."""
        super(LayerNorm, self).__init__(nout, eps=1e-12)
        self.dim = dim

    def forward(self, x):
        """Apply layer normalization.
        :param torch.Tensor x: input tensor
        :return: layer normalized tensor
        :rtype torch.Tensor
        """
        if self.dim == -1:
            return super(LayerNorm, self).forward(x)
        return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1)


class PitchPredictor(torch.nn.Module):
    def __init__(self, idim, n_layers=5, n_chans=384, odim=2, kernel_size=5,
                 dropout_rate=0.1, padding='SAME'):
        """Initialize the pitch predictor module.
        Args:
            idim (int): Input dimension.
            n_layers (int, optional): Number of convolutional layers.
            n_chans (int, optional): Number of channels of convolutional layers.
            kernel_size (int, optional): Kernel size of convolutional layers.
            dropout_rate (float, optional): Dropout rate.
        """
        super(PitchPredictor, self).__init__()
        self.conv = torch.nn.ModuleList()
        self.kernel_size = kernel_size
        self.padding = padding
        for idx in range(n_layers):
            in_chans = idim if idx == 0 else n_chans
            self.conv += [torch.nn.Sequential(
                torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2)
                                       if padding == 'SAME'
                                       else (kernel_size - 1, 0), 0),
                torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0),
                torch.nn.ReLU(),
                LayerNorm(n_chans, dim=1),
                torch.nn.Dropout(dropout_rate)
            )]
        self.linear = torch.nn.Linear(n_chans, odim)
        self.embed_positions = SinusoidalPositionalEmbedding(idim, 0, init_size=4096)
        self.pos_embed_alpha = nn.Parameter(torch.Tensor([1]))

    def forward(self, xs):
        """
        :param xs: [B, T, H]
        :return: [B, T, H]
        """
        positions = self.pos_embed_alpha * self.embed_positions(xs[..., 0])
        xs = xs + positions
        xs = xs.transpose(1, -1)  # (B, idim, Tmax)
        for f in self.conv:
            xs = f(xs)  # (B, C, Tmax)
        # NOTE: calculate in log domain
        xs = self.linear(xs.transpose(1, -1))  # (B, Tmax, H)
        return xs


class SvcEncoder(nn.Module):
    def __init__(self, dictionary, out_dims=None):
        super().__init__()
        # self.dictionary = dictionary
        self.padding_idx = 0
        self.hidden_size = hparams['hidden_size']
        self.out_dims = out_dims
        if out_dims is None:
            self.out_dims = hparams['audio_num_mel_bins']
        self.mel_out = Linear(self.hidden_size, self.out_dims, bias=True)
        predictor_hidden = hparams['predictor_hidden'] if hparams['predictor_hidden'] > 0 else self.hidden_size
        if hparams['use_pitch_embed']:
            self.pitch_embed = Embedding(300, self.hidden_size, self.padding_idx)
            self.pitch_predictor = PitchPredictor(
                self.hidden_size,
                n_chans=predictor_hidden,
                n_layers=hparams['predictor_layers'],
                dropout_rate=hparams['predictor_dropout'],
                odim=2 if hparams['pitch_type'] == 'frame' else 1,
                padding=hparams['ffn_padding'], kernel_size=hparams['predictor_kernel'])
        if hparams['use_energy_embed']:
            self.energy_embed = Embedding(256, self.hidden_size, self.padding_idx)
        if hparams['use_spk_id']:
            self.spk_embed_proj = Embedding(hparams['num_spk'], self.hidden_size)
            if hparams['use_split_spk_id']:
                self.spk_embed_f0 = Embedding(hparams['num_spk'], self.hidden_size)
                self.spk_embed_dur = Embedding(hparams['num_spk'], self.hidden_size)
        elif hparams['use_spk_embed']:
            self.spk_embed_proj = Linear(256, self.hidden_size, bias=True)

    def forward(self, hubert, mel2ph=None, spk_embed=None,
                ref_mels=None, f0=None, uv=None, energy=None, skip_decoder=True,
                spk_embed_dur_id=None, spk_embed_f0_id=None, infer=False, **kwargs):
        ret = {}
        encoder_out = hubert
        src_nonpadding = (hubert != 0).any(-1)[:, :, None]

        # add ref style embed
        # Not implemented
        # variance encoder
        var_embed = 0

        # encoder_out_dur denotes encoder outputs for the duration predictor;
        # in speech adaptation, the duration predictor uses the old speaker embedding
        if hparams['use_spk_embed']:
            spk_embed_dur = spk_embed_f0 = spk_embed = self.spk_embed_proj(spk_embed)[:, None, :]
        elif hparams['use_spk_id']:
            spk_embed_id = spk_embed
            if spk_embed_dur_id is None:
                spk_embed_dur_id = spk_embed_id
            if spk_embed_f0_id is None:
                spk_embed_f0_id = spk_embed_id
            spk_embed_0 = self.spk_embed_proj(spk_embed_id.to(hubert.device))[:, None, :]
            spk_embed_1 = self.spk_embed_proj(torch.LongTensor([0]).to(hubert.device))[:, None, :]
            spk_embed_2 = self.spk_embed_proj(torch.LongTensor([0]).to(hubert.device))[:, None, :]
            spk_embed = 1 * spk_embed_0 + 0 * spk_embed_1 + 0 * spk_embed_2
            spk_embed_dur = spk_embed_f0 = spk_embed
            if hparams['use_split_spk_id']:
                spk_embed_dur = self.spk_embed_dur(spk_embed_dur_id)[:, None, :]
                spk_embed_f0 = self.spk_embed_f0(spk_embed_f0_id)[:, None, :]
        else:
            spk_embed_dur = spk_embed_f0 = spk_embed = 0

        ret['mel2ph'] = mel2ph

        decoder_inp = F.pad(encoder_out, [0, 0, 1, 0])

        mel2ph_ = mel2ph[..., None].repeat([1, 1, encoder_out.shape[-1]])
        decoder_inp_origin = decoder_inp = torch.gather(decoder_inp, 1, mel2ph_)  # [B, T, H]

        tgt_nonpadding = (mel2ph > 0).float()[:, :, None]

        # add pitch and energy embed
        pitch_inp = (decoder_inp_origin + var_embed + spk_embed_f0) * tgt_nonpadding
        if hparams['use_pitch_embed']:
            pitch_inp_ph = (encoder_out + var_embed + spk_embed_f0) * src_nonpadding
            decoder_inp = decoder_inp + self.add_pitch(pitch_inp, f0, uv, mel2ph, ret, encoder_out=pitch_inp_ph)
        if hparams['use_energy_embed']:
            decoder_inp = decoder_inp + self.add_energy(pitch_inp, energy, ret)

        ret['decoder_inp'] = decoder_inp = (decoder_inp + spk_embed) * tgt_nonpadding
        return ret

    def add_dur(self, dur_input, mel2ph, hubert, ret):
        src_padding = (hubert == 0).all(-1)
        dur_input = dur_input.detach() + hparams['predictor_grad'] * (dur_input - dur_input.detach())
        if mel2ph is None:
            dur, xs = self.dur_predictor.inference(dur_input, src_padding)
            ret['dur'] = xs
            ret['dur_choice'] = dur
            mel2ph = self.length_regulator(dur, src_padding).detach()
        else:
            ret['dur'] = self.dur_predictor(dur_input, src_padding)
        ret['mel2ph'] = mel2ph
        return mel2ph

    def run_decoder(self, decoder_inp, tgt_nonpadding, ret, infer, **kwargs):
        x = decoder_inp  # [B, T, H]
        x = self.mel_out(x)
        return x * tgt_nonpadding

    def out2mel(self, out):
        return out

    def add_pitch(self, decoder_inp, f0, uv, mel2ph, ret, encoder_out=None):
        decoder_inp = decoder_inp.detach() + hparams['predictor_grad'] * (decoder_inp - decoder_inp.detach())

        pitch_padding = (mel2ph == 0)
        ret['f0_denorm'] = f0_denorm = denorm_f0(f0, uv, hparams, pitch_padding=pitch_padding)
        if pitch_padding is not None:
            f0[pitch_padding] = 0

        pitch = f0_to_coarse(f0_denorm, hparams)  # start from 0
        ret['pitch_pred'] = pitch.unsqueeze(-1)
        pitch_embedding = self.pitch_embed(pitch)
        return pitch_embedding

    def add_energy(self, decoder_inp, energy, ret):
        decoder_inp = decoder_inp.detach() + hparams['predictor_grad'] * (decoder_inp - decoder_inp.detach())
        ret['energy_pred'] = energy  # energy_pred = self.energy_predictor(decoder_inp)[:, :, 0]
        energy = torch.clamp(energy * 256 // 4, max=255).long()  # energy_to_coarse
        energy_embedding = self.energy_embed(energy)
        return energy_embedding

    @staticmethod
    def mel_norm(x):
        return (x + 5.5) / (6.3 / 2) - 1

    @staticmethod
    def mel_denorm(x):
        return (x + 1) * (6.3 / 2) - 5.5
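Note the hard-coded mix spk_embed = 1 * spk_embed_0 + 0 * spk_embed_1 + 0 * spk_embed_2 in SvcEncoder.forward: the final speaker embedding is a weighted combination of lookups from the same table. A standalone sketch of the idea (the table size and weights below are hypothetical, chosen only for illustration):

import torch

emb = torch.nn.Embedding(4, 256)              # stands in for self.spk_embed_proj
e_a = emb(torch.LongTensor([1]))[:, None, :]  # [1, 1, 256]
e_b = emb(torch.LongTensor([2]))[:, None, :]
mixed = 0.7 * e_a + 0.3 * e_b                 # blend two speaker timbres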
modules/hubert/__pycache__/cn_hubert.cpython-38.pyc
ADDED
Binary file (1.32 kB)
modules/hubert/__pycache__/hubert_model.cpython-38.pyc
ADDED
Binary file (8.38 kB)
modules/hubert/__pycache__/hubert_onnx.cpython-38.pyc
ADDED
Binary file (735 Bytes)
modules/hubert/cn_hubert.py
ADDED
@@ -0,0 +1,40 @@
import librosa
import torch
import torch.nn as nn


def load_cn_model(ch_hubert_path):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    from fairseq import checkpoint_utils
    models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
        [ch_hubert_path],
        suffix="",
    )
    model = models[0]
    model = model.to(device)
    model.eval()
    return model


def get_cn_hubert_units(con_model, audio_path, dev):
    audio, sampling_rate = librosa.load(audio_path)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

    feats = torch.from_numpy(audio).float()
    if feats.dim() == 2:  # double channels
        feats = feats.mean(-1)
    assert feats.dim() == 1, feats.dim()
    feats = feats.view(1, -1)
    padding_mask = torch.BoolTensor(feats.shape).fill_(False)
    inputs = {
        "source": feats.to(dev),
        "padding_mask": padding_mask.to(dev),
        "output_layer": 9,  # layer 9
    }
    with torch.no_grad():
        logits = con_model.extract_features(**inputs)
        feats = con_model.final_proj(logits[0])
    return feats
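Hypothetical usage of the two helpers above (requires the fairseq package; the checkpoint and wav paths are placeholders, not files shipped in this Space):

import torch
from modules.hubert.cn_hubert import load_cn_model, get_cn_hubert_units

dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load_cn_model("path/to/chinese-hubert-base.pt")       # placeholder path
units = get_cn_hubert_units(model, "path/to/input.wav", dev)  # projected layer-9 features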
modules/hubert/hubert_model.py
ADDED
@@ -0,0 +1,243 @@
import copy
import random
from typing import Optional, Tuple

import librosa
import torch
import torch.nn as nn
import torch.nn.functional as t_func
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present


class Hubert(nn.Module):
    def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
        super().__init__()
        self._mask = mask
        self.feature_extractor = FeatureExtractor()
        self.feature_projection = FeatureProjection()
        self.positional_embedding = PositionalConvEmbedding()
        self.norm = nn.LayerNorm(768)
        self.dropout = nn.Dropout(0.1)
        self.encoder = TransformerEncoder(
            nn.TransformerEncoderLayer(
                768, 12, 3072, activation="gelu", batch_first=True
            ),
            12,
        )
        self.proj = nn.Linear(768, 256)

        self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
        self.label_embedding = nn.Embedding(num_label_embeddings, 256)

    def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        mask = None
        if self.training and self._mask:
            mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
            x[mask] = self.masked_spec_embed.to(x.dtype)
        return x, mask

    def encode(
            self, x: torch.Tensor, layer: Optional[int] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        x = self.feature_extractor(x)
        x = self.feature_projection(x.transpose(1, 2))
        x, mask = self.mask(x)
        x = x + self.positional_embedding(x)
        x = self.dropout(self.norm(x))
        x = self.encoder(x, output_layer=layer)
        return x, mask

    def logits(self, x: torch.Tensor) -> torch.Tensor:
        logits = torch.cosine_similarity(
            x.unsqueeze(2),
            self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
            dim=-1,
        )
        return logits / 0.1

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        x, mask = self.encode(x)
        x = self.proj(x)
        logits = self.logits(x)
        return logits, mask


class HubertSoft(Hubert):
    def __init__(self):
        super().__init__()

    # @torch.inference_mode()
    def units(self, wav: torch.Tensor) -> torch.Tensor:
        wav = torch.nn.functional.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
        x, _ = self.encode(wav)
        return self.proj(x)

    def forward(self, wav: torch.Tensor):
        return self.units(wav)


class FeatureExtractor(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
        self.norm0 = nn.GroupNorm(512, 512)
        self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
        self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = t_func.gelu(self.norm0(self.conv0(x)))
        x = t_func.gelu(self.conv1(x))
        x = t_func.gelu(self.conv2(x))
        x = t_func.gelu(self.conv3(x))
        x = t_func.gelu(self.conv4(x))
        x = t_func.gelu(self.conv5(x))
        x = t_func.gelu(self.conv6(x))
        return x


class FeatureProjection(nn.Module):
    def __init__(self):
        super().__init__()
        self.norm = nn.LayerNorm(512)
        self.projection = nn.Linear(512, 768)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.norm(x)
        x = self.projection(x)
        x = self.dropout(x)
        return x


class PositionalConvEmbedding(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv1d(
            768,
            768,
            kernel_size=128,
            padding=128 // 2,
            groups=16,
        )
        self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.conv(x.transpose(1, 2))
        x = t_func.gelu(x[:, :, :-1])
        return x.transpose(1, 2)


class TransformerEncoder(nn.Module):
    def __init__(
            self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
    ) -> None:
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList(
            [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
        )
        self.num_layers = num_layers

    def forward(
            self,
            src: torch.Tensor,
            mask: torch.Tensor = None,
            src_key_padding_mask: torch.Tensor = None,
            output_layer: Optional[int] = None,
    ) -> torch.Tensor:
        output = src
        for layer in self.layers[:output_layer]:
            output = layer(
                output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
            )
        return output


def _compute_mask(
        shape: Tuple[int, int],
        mask_prob: float,
        mask_length: int,
        device: torch.device,
        min_masks: int = 0,
) -> torch.Tensor:
    batch_size, sequence_length = shape

    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")

    if mask_length > sequence_length:
        raise ValueError(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}"
        )

    # compute number of masked spans in batch
    num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
    num_masked_spans = max(num_masked_spans, min_masks)

    # make sure num masked indices <= sequence_length
    if num_masked_spans * mask_length > sequence_length:
        num_masked_spans = sequence_length // mask_length

    # SpecAugment mask to fill
    mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)

    # uniform distribution to sample from, make sure that offset samples are < sequence_length
    uniform_dist = torch.ones(
        (batch_size, sequence_length - (mask_length - 1)), device=device
    )

    # get random indices to mask
    mask_indices = torch.multinomial(uniform_dist, num_masked_spans)

    # expand masked indices to masked spans
    mask_indices = (
        mask_indices.unsqueeze(dim=-1)
        .expand((batch_size, num_masked_spans, mask_length))
        .reshape(batch_size, num_masked_spans * mask_length)
    )
    offsets = (
        torch.arange(mask_length, device=device)[None, None, :]
        .expand((batch_size, num_masked_spans, mask_length))
        .reshape(batch_size, num_masked_spans * mask_length)
    )
    mask_idxs = mask_indices + offsets

    # scatter indices to mask
    mask = mask.scatter(1, mask_idxs, True)

    return mask


def hubert_soft(
        path: str
) -> HubertSoft:
    r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
    Args:
        path (str): path of a pretrained model
    """
    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    hubert = HubertSoft()
    checkpoint = torch.load(path)
    consume_prefix_in_state_dict_if_present(checkpoint, "module.")
    hubert.load_state_dict(checkpoint)
    hubert.eval().to(dev)
    return hubert


def get_units(hbt_soft, raw_wav_path, dev=torch.device('cuda')):
    wav, sr = librosa.load(raw_wav_path, sr=None)
    assert (sr >= 16000)
    if len(wav.shape) > 1:
        wav = librosa.to_mono(wav)
    if sr != 16000:
        # keyword arguments: the positional form was removed in librosa 0.10
        wav16 = librosa.resample(wav, orig_sr=sr, target_sr=16000)
    else:
        wav16 = wav
    dev = torch.device("cuda" if (dev == torch.device('cuda') and torch.cuda.is_available()) else "cpu")
    torch.cuda.is_available() and torch.cuda.empty_cache()
    with torch.inference_mode():
        units = hbt_soft.units(torch.FloatTensor(wav16.astype(float)).unsqueeze(0).unsqueeze(0).to(dev))
    return units
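Typical usage of hubert_soft and get_units, assuming the hubert_soft.pt checkpoint shipped under checkpoints/hubert/ (the wav path is a placeholder):

import torch
from modules.hubert.hubert_model import hubert_soft, get_units

hbt = hubert_soft("checkpoints/hubert/hubert_soft.pt")
units = get_units(hbt, "path/to/input.wav")   # soft units, shape [1, T, 256]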
modules/hubert/hubert_onnx.py
ADDED
@@ -0,0 +1,19 @@
import time

import torch
import torchaudio


def get_onnx_units(hbt_soft, raw_wav_path):
    source, sr = torchaudio.load(raw_wav_path)
    source = torchaudio.functional.resample(source, sr, 16000)
    if len(source.shape) == 2 and source.shape[1] >= 2:
        source = torch.mean(source, dim=0).unsqueeze(0)
    source = source.unsqueeze(0)
    # Run inference with ONNX Runtime
    start = time.time()
    units = hbt_soft.run(output_names=["units"],
                         input_feed={"wav": source.numpy()})[0]
    use_time = time.time() - start
    print("hubert_onnx_session.run time: {}".format(use_time))
    return units
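get_onnx_units expects an ONNX Runtime session rather than a torch module. A sketch of driving it (assumes the onnxruntime package is installed; the model path matches checkpoints/hubert/hubert.onnx in this commit):

import onnxruntime
from modules.hubert.hubert_onnx import get_onnx_units

session = onnxruntime.InferenceSession("checkpoints/hubert/hubert.onnx")
units = get_onnx_units(session, "path/to/input.wav")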
modules/nsf_hifigan/__pycache__/env.cpython-310.pyc
ADDED
Binary file (813 Bytes)
modules/nsf_hifigan/__pycache__/env.cpython-38.pyc
ADDED
Binary file (799 Bytes)
modules/nsf_hifigan/__pycache__/models.cpython-310.pyc
ADDED
Binary file (16.1 kB)
modules/nsf_hifigan/__pycache__/models.cpython-38.pyc
ADDED
Binary file (16.3 kB)
modules/nsf_hifigan/__pycache__/nvSTFT.cpython-310.pyc
ADDED
Binary file (3.78 kB)
modules/nsf_hifigan/__pycache__/nvSTFT.cpython-38.pyc
ADDED
Binary file (3.84 kB)
modules/nsf_hifigan/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (2.35 kB)