ChrisPreston committed
Commit · 93f4bab
1 Parent(s): ebe9a08
Upload 95 files

This view is limited to 50 files because it contains too many changes.
- .gitattributes +1 -0
- aqua/clean_model_ckpt_steps_100000.ckpt +3 -0
- aqua/config.yaml +457 -0
- checkpoints/0102_xiaoma_pe/config.yaml +172 -0
- checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt +3 -0
- checkpoints/hubert/hubert.onnx +3 -0
- checkpoints/hubert/hubert_soft.pt +3 -0
- checkpoints/nsf_hifigan/NOTICE.txt +74 -0
- checkpoints/nsf_hifigan/config.json +38 -0
- checkpoints/nsf_hifigan/model +3 -0
- infer.py +81 -0
- infer_tools/__pycache__/f0_static.cpython-38.pyc +0 -0
- infer_tools/__pycache__/infer_tool.cpython-38.pyc +0 -0
- infer_tools/__pycache__/infer_tool_beta.cpython-38.pyc +0 -0
- infer_tools/__pycache__/slicer.cpython-38.pyc +0 -0
- infer_tools/__pycache__/trans_key.cpython-38.pyc +0 -0
- infer_tools/f0_static.py +116 -0
- infer_tools/f0_temp.json +0 -0
- infer_tools/infer_tool.py +201 -0
- infer_tools/infer_tool_beta.py +229 -0
- infer_tools/slicer.py +142 -0
- infer_tools/trans_key.py +67 -0
- modules/__pycache__/encoder.cpython-310.pyc +0 -0
- modules/__pycache__/encoder.cpython-38.pyc +0 -0
- modules/commons/__pycache__/common_layers.cpython-310.pyc +0 -0
- modules/commons/__pycache__/common_layers.cpython-38.pyc +0 -0
- modules/commons/__pycache__/ssim.cpython-310.pyc +0 -0
- modules/commons/__pycache__/ssim.cpython-38.pyc +0 -0
- modules/commons/common_layers.py +675 -0
- modules/commons/ssim.py +84 -0
- modules/diff/__pycache__/diffusion.cpython-310.pyc +0 -0
- modules/diff/__pycache__/diffusion.cpython-38.pyc +0 -0
- modules/diff/__pycache__/net.cpython-310.pyc +0 -0
- modules/diff/__pycache__/net.cpython-38.pyc +0 -0
- modules/diff/diffusion.py +312 -0
- modules/diff/net.py +135 -0
- modules/encoder.py +208 -0
- modules/hubert/__pycache__/cn_hubert.cpython-38.pyc +0 -0
- modules/hubert/__pycache__/hubert_model.cpython-38.pyc +0 -0
- modules/hubert/__pycache__/hubert_onnx.cpython-38.pyc +0 -0
- modules/hubert/cn_hubert.py +40 -0
- modules/hubert/hubert_model.py +243 -0
- modules/hubert/hubert_onnx.py +19 -0
- modules/nsf_hifigan/__pycache__/env.cpython-310.pyc +0 -0
- modules/nsf_hifigan/__pycache__/env.cpython-38.pyc +0 -0
- modules/nsf_hifigan/__pycache__/models.cpython-310.pyc +0 -0
- modules/nsf_hifigan/__pycache__/models.cpython-38.pyc +0 -0
- modules/nsf_hifigan/__pycache__/nvSTFT.cpython-310.pyc +0 -0
- modules/nsf_hifigan/__pycache__/nvSTFT.cpython-38.pyc +0 -0
- modules/nsf_hifigan/__pycache__/utils.cpython-310.pyc +0 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoints/nsf_hifigan/model filter=lfs diff=lfs merge=lfs -text
aqua/clean_model_ckpt_steps_100000.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:14d1e9bf1dde30fcb397ebf91e61e77fc34cf22f6d1d6fd112eba57113a75795
size 227124201
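The three lines above are not the checkpoint itself but a Git LFS pointer: the real 227 MB file lives in LFS storage and is fetched with `git lfs pull` after cloning. As a minimal sketch (not part of this commit), such a pointer can be read as plain key-value pairs:

```python
# Sketch only: parse a git-lfs spec-v1 pointer file into a dict.
# Assumes the three-line "key value" layout shown above.
def read_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    # e.g. {"version": "https://...", "oid": "sha256:14d1...", "size": "227124201"}
    return fields
```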
aqua/config.yaml
ADDED
@@ -0,0 +1,457 @@
K_step: 1000
accumulate_grad_batches: 1
audio_num_mel_bins: 128
audio_sample_rate: 44100
binarization_args:
  shuffle: false
  with_align: true
  with_f0: true
  with_hubert: true
  with_spk_embed: false
  with_wav: false
binarizer_cls: preprocessing.SVCpre.SVCBinarizer
binary_data_dir: data/binary/aquapre
check_val_every_n_epoch: 10
choose_test_manually: false
clip_grad_norm: 1
config_path: F:\diff-svc-main\training\config_nsf.yaml
content_cond_steps: []
cwt_add_f0_loss: false
cwt_hidden_size: 128
cwt_layers: 2
cwt_loss: l1
cwt_std_scale: 0.8
datasets:
- opencpop
debug: false
dec_ffn_kernel_size: 9
dec_layers: 4
decay_steps: 20000
decoder_type: fft
dict_dir: ''
diff_decoder_type: wavenet
diff_loss_type: l2
dilation_cycle_length: 4
dropout: 0.1
ds_workers: 4
dur_enc_hidden_stride_kernel:
- 0,2,3
- 0,2,3
- 0,1,3
dur_loss: mse
dur_predictor_kernel: 3
dur_predictor_layers: 5
enc_ffn_kernel_size: 9
enc_layers: 4
encoder_K: 8
encoder_type: fft
endless_ds: false
f0_bin: 256
f0_max: 1100.0
f0_min: 40.0
f0_static: '{"28.0": 0.07, "29.0": 0.03, "31.0": 0.05, "32.0": 0.08, "33.0": 0.12,
  "34.0": 0.02, "35.0": 0.06, "36.0": 0.02, "37.0": 0.01, "38.0": 0.1, "39.0": 0.05,
  "40.0": 0.09, "41.0": 0.14, "42.0": 0.16, "43.0": 0.03, "44.0": 0.42, "45.0": 0.74,
  "46.0": 1.13, "47.0": 1.49, "48.0": 1.76, "49.0": 2.59, "50.0": 3.03, "51.0": 2.71,
  "52.0": 1.93, "53.0": 1.11, "54.0": 0.78, "55.0": 3.33, "56.0": 20.38, "57.0": 69.6,
  "58.0": 167.04, "59.0": 245.1, "60.0": 318.87, "61.0": 373.41, "62.0": 434.86, "63.0":
  415.63, "64.0": 448.97, "65.0": 452.99, "66.0": 474.88, "67.0": 471.54, "68.0":
  455.78, "69.0": 421.71, "70.0": 372.06, "71.0": 323.85, "72.0": 292.8, "73.0": 238.94,
  "74.0": 190.5, "75.0": 132.86, "76.0": 88.03, "77.0": 53.16, "78.0": 32.96, "79.0":
  23.66, "80.0": 14.74, "81.0": 8.54, "82.0": 5.0, "83.0": 3.32, "84.0": 2.29, "85.0":
  0.91, "total_time": 6576.43}'
ffn_act: gelu
ffn_padding: SAME
fft_size: 2048
fmax: 16000
fmin: 40
fs2_ckpt: ''
gaussian_start: true
gen_dir_name: ''
gen_tgt_spk_id: -1
hidden_size: 256
hop_size: 512
hubert_gpu: true
hubert_path: checkpoints/hubert/hubert_soft.pt
infer: false
keep_bins: 128
lambda_commit: 0.25
lambda_energy: 0.0
lambda_f0: 1.0
lambda_ph_dur: 0.3
lambda_sent_dur: 1.0
lambda_uv: 1.0
lambda_word_dur: 1.0
load_ckpt: ''
log_interval: 100
loud_norm: false
lr: 0.0008
max_beta: 0.02
max_epochs: 3000
max_eval_sentences: 1
max_eval_tokens: 60000
max_frames: 42000
max_input_tokens: 6000
max_sentences: 88
max_tokens: 128000
max_updates: 1000000
mel_loss: ssim:0.5|l1:0.5
mel_vmax: 1.5
mel_vmin: -6.0
min_level_db: -120
no_fs2: true
norm_type: gn
num_ckpt_keep: 10
num_heads: 2
num_sanity_val_steps: 1
num_spk: 1
num_test_samples: 0
num_valid_plots: 10
optimizer_adam_beta1: 0.9
optimizer_adam_beta2: 0.98
out_wav_norm: false
pe_ckpt: checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
pe_enable: false
perform_enhance: true
pitch_ar: false
pitch_enc_hidden_stride_kernel:
- 0,2,5
- 0,2,5
- 0,2,5
pitch_extractor: parselmouth
pitch_loss: l2
pitch_norm: log
pitch_type: frame
pndm_speedup: 10
pre_align_args:
  allow_no_txt: false
  denoise: false
  forced_align: mfa
  txt_processor: zh_g2pM
  use_sox: true
  use_tone: false
pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
predictor_dropout: 0.5
predictor_grad: 0.1
predictor_hidden: -1
predictor_kernel: 5
predictor_layers: 5
prenet_dropout: 0.5
prenet_hidden_size: 256
pretrain_fs_ckpt: ''
processed_data_dir: xxx
profile_infer: false
raw_data_dir: data/raw/aquapre
ref_norm_layer: bn
rel_pos: true
reset_phone_dict: true
residual_channels: 512
residual_layers: 20
save_best: false
save_ckpt: true
save_codes:
- configs
- modules
- src
- utils
save_f0: true
save_gt: false
schedule_type: linear
seed: 1234
sort_by_len: true
speaker_id: aqua
spec_max:
- 0.18377557396888733
- -0.33469653129577637
- -0.3073468506336212
- -0.21027648448944092
- 0.23178215324878693
- 0.5297451019287109
- 0.7021887898445129
- 0.7711099982261658
- 0.7912386059761047
- 0.6609739065170288
- 0.649876058101654
- 0.6327046751976013
- 0.6892049908638
- 0.6026111841201782
- 0.6834777593612671
- 0.7417489886283875
- 0.6040375828742981
- 0.5854794383049011
- 0.7123280167579651
- 0.5886657238006592
- 0.6135984063148499
- 0.5388530492782593
- 0.5932422280311584
- 0.535581111907959
- 0.57913738489151
- 0.6827316880226135
- 0.6265526413917542
- 0.6557696461677551
- 0.6586976647377014
- 0.5687282085418701
- 0.6218562722206116
- 0.6349128484725952
- 0.6176865100860596
- 0.6212958097457886
- 0.6277656555175781
- 0.5551338195800781
- 0.6126622557640076
- 0.5821346640586853
- 0.577056348323822
- 0.5649800300598145
- 0.5984634757041931
- 0.4873456656932831
- 0.47209471464157104
- 0.4387756586074829
- 0.4690910577774048
- 0.4616055190563202
- 0.3555675446987152
- 0.3898852467536926
- 0.3676068186759949
- 0.4632047414779663
- 0.37983986735343933
- 0.3877682685852051
- 0.3099276125431061
- 0.3261813223361969
- 0.34168118238449097
- 0.3004901111125946
- 0.3512653112411499
- 0.2647061347961426
- 0.2685043215751648
- 0.20390087366104126
- 0.1825377196073532
- 0.22067485749721527
- 0.20306138694286346
- 0.12710601091384888
- 0.10927848517894745
- 0.1117628887295723
- 0.14148156344890594
- 0.122605100274086
- 0.08032718300819397
- 0.12159623205661774
- -0.04923255369067192
- -0.07824847847223282
- 0.03441360592842102
- 0.07093964517116547
- -0.1269683688879013
- 0.0027632638812065125
- -0.045093610882759094
- -0.04115259647369385
- 0.029067598283290863
- -0.009453626349568367
- -0.0470033697783947
- -0.04894810542464256
- -0.06236470118165016
- -0.20086997747421265
- -0.2363593578338623
- -0.17289961874485016
- -0.219277486205101
- -0.2934815585613251
- -0.30551621317863464
- -0.2513120770454407
- -0.26792851090431213
- -0.33068278431892395
- -0.37532031536102295
- -0.365634560585022
- -0.3379015326499939
- -0.26979681849479675
- -0.20316314697265625
- -0.2109878957271576
- -0.16927000880241394
- -0.1698305308818817
- -0.2739156186580658
- -0.2700604200363159
- -0.32284122705459595
- -0.44529229402542114
- -0.4002469480037689
- -0.2441970407962799
- -0.19795942306518555
- -0.2462945580482483
- -0.0673084482550621
- -0.22117790579795837
- -0.21418607234954834
- -0.39467209577560425
- -0.4388139843940735
- -0.3227368891239166
- -0.30530503392219543
- -0.3201104998588562
- -0.39839836955070496
- -0.464596688747406
- -0.5399728417396545
- -0.5515261292457581
- -0.520453691482544
- -0.6714966893196106
- -0.6414765119552612
- -0.6108742356300354
- -0.6762520670890808
- -0.7067146301269531
- -0.7586700320243835
- -0.6640384793281555
spec_min:
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.999994277954102
- -4.989471912384033
- -4.999994277954102
spk_cond_steps: []
stop_token_weight: 5.0
task_cls: training.task.SVC_task.SVCTask
test_ids: []
test_input_dir: ''
test_num: 0
test_prefixes:
- test
test_set_name: test
timesteps: 1000
train_set_name: train
use_cn_hubert: false
use_crepe: true
use_denoise: false
use_energy_embed: false
use_gt_dur: false
use_gt_f0: false
use_midi: false
use_nsf: true
use_pitch_embed: true
use_pos_embed: true
use_spk_embed: false
use_spk_id: false
use_split_spk_id: false
use_uv: false
use_var_enc: false
use_vec: false
val_check_interval: 2000
valid_num: 0
valid_set_name: valid
vocoder: network.vocoders.nsf_hifigan.NsfHifiGAN
vocoder_ckpt: checkpoints/nsf_hifigan/model
warmup_updates: 2000
wav2spec_eps: 1e-6
weight_decay: 0
win_size: 2048
work_dir: checkpoints/aquapre
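One detail of this config worth noting: `f0_static` is not a YAML mapping but a JSON document stored as a single string, which is exactly how `infer_tool.Svc.evaluate_key` later consumes it via `json.loads`. A minimal sketch (not part of the commit) of reading it back:

```python
# Sketch only: decode the JSON-in-YAML f0_static histogram from aqua/config.yaml.
import json
import yaml

with open("aqua/config.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

f0_static = json.loads(cfg["f0_static"])  # keys are MIDI pitches, values are seconds
print(f0_static["total_time"])            # 6576.43, total voiced time in the dataset
```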
checkpoints/0102_xiaoma_pe/config.yaml
ADDED
@@ -0,0 +1,172 @@
accumulate_grad_batches: 1
audio_num_mel_bins: 80
audio_sample_rate: 24000
base_config:
- configs/tts/lj/fs2.yaml
binarization_args:
  shuffle: false
  with_align: true
  with_f0: true
  with_f0cwt: true
  with_spk_embed: true
  with_txt: true
  with_wav: false
binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
binary_data_dir: data/binary/xiaoma1022_24k_128hop
check_val_every_n_epoch: 10
clip_grad_norm: 1
cwt_add_f0_loss: false
cwt_hidden_size: 128
cwt_layers: 2
cwt_loss: l1
cwt_std_scale: 0.8
debug: false
dec_ffn_kernel_size: 9
dec_layers: 4
decoder_type: fft
dict_dir: ''
dropout: 0.1
ds_workers: 4
dur_enc_hidden_stride_kernel:
- 0,2,3
- 0,2,3
- 0,1,3
dur_loss: mse
dur_predictor_kernel: 3
dur_predictor_layers: 2
enc_ffn_kernel_size: 9
enc_layers: 4
encoder_K: 8
encoder_type: fft
endless_ds: true
ffn_act: gelu
ffn_padding: SAME
fft_size: 512
fmax: 12000
fmin: 30
gen_dir_name: ''
hidden_size: 256
hop_size: 128
infer: false
lambda_commit: 0.25
lambda_energy: 0.1
lambda_f0: 1.0
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_uv: 1.0
lambda_word_dur: 1.0
load_ckpt: ''
log_interval: 100
loud_norm: false
lr: 2.0
max_epochs: 1000
max_eval_sentences: 1
max_eval_tokens: 60000
max_frames: 5000
max_input_tokens: 1550
max_sentences: 100000
max_tokens: 20000
max_updates: 60000
mel_loss: l1
mel_vmax: 1.5
mel_vmin: -6
min_level_db: -120
norm_type: gn
num_ckpt_keep: 3
num_heads: 2
num_sanity_val_steps: 5
num_spk: 1
num_test_samples: 20
num_valid_plots: 10
optimizer_adam_beta1: 0.9
optimizer_adam_beta2: 0.98
out_wav_norm: false
pitch_ar: false
pitch_enc_hidden_stride_kernel:
- 0,2,5
- 0,2,5
- 0,2,5
pitch_extractor_conv_layers: 2
pitch_loss: l1
pitch_norm: log
pitch_type: frame
pre_align_args:
  allow_no_txt: false
  denoise: false
  forced_align: mfa
  txt_processor: en
  use_sox: false
  use_tone: true
pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
predictor_dropout: 0.5
predictor_grad: 0.1
predictor_hidden: -1
predictor_kernel: 5
predictor_layers: 2
prenet_dropout: 0.5
prenet_hidden_size: 256
pretrain_fs_ckpt: ''
processed_data_dir: data/processed/ljspeech
profile_infer: false
raw_data_dir: data/raw/LJSpeech-1.1
ref_norm_layer: bn
reset_phone_dict: true
save_best: false
save_ckpt: true
save_codes:
- configs
- modules
- tasks
- utils
- usr
save_f0: false
save_gt: false
seed: 1234
sort_by_len: true
stop_token_weight: 5.0
task_cls: tasks.tts.pe.PitchExtractionTask
test_ids:
- 68
- 70
- 74
- 87
- 110
- 172
- 190
- 215
- 231
- 294
- 316
- 324
- 402
- 422
- 485
- 500
- 505
- 508
- 509
- 519
test_input_dir: ''
test_num: 523
test_set_name: test
train_set_name: train
use_denoise: false
use_energy_embed: false
use_gt_dur: false
use_gt_f0: false
use_pitch_embed: true
use_pos_embed: true
use_spk_embed: false
use_spk_id: false
use_split_spk_id: false
use_uv: true
use_var_enc: false
val_check_interval: 2000
valid_num: 348
valid_set_name: valid
vocoder: pwg
vocoder_ckpt: ''
warmup_updates: 2000
weight_decay: 0
win_size: 512
work_dir: checkpoints/0102_xiaoma_pe
checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1863f12324e43783089ab933edeeb969106b851e30d71019ebbaa9b82099d82a
size 39141959
checkpoints/hubert/hubert.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c72bad89da99152077bf8157ff75beca7c6dc966ea01a6a0fb3777f99e77aa9b
size 378353321
checkpoints/hubert/hubert_soft.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e82e7d079df05fe3aa535f6f7d42d309bdae1d2a53324e2b2386c56721f4f649
size 378435957
checkpoints/nsf_hifigan/NOTICE.txt
ADDED
@@ -0,0 +1,74 @@
--- DiffSinger Community Vocoder ---

ARCHITECTURE: NSF-HiFiGAN
RELEASE DATE: 2022-12-11

HYPER PARAMETERS:
 - 44100 sample rate
 - 128 mel bins
 - 512 hop size
 - 2048 window size
 - fmin at 40Hz
 - fmax at 16000Hz


NOTICE:

All model weights in the [DiffSinger Community Vocoder Project](https://openvpi.github.io/vocoders/), including
model weights in this directory, are provided by the [OpenVPI Team](https://github.com/openvpi/), under the
[Attribution-NonCommercial-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-nc-sa/4.0/) license.


ACKNOWLEDGEMENTS:

Training data of this vocoder is provided and permitted by the following organizations, societies and individuals:

孙飒 https://www.qfssr.cn
赤松_Akamatsu https://www.zhibin.club
乐威 https://www.zhibin.club
伯添 https://space.bilibili.com/24087011
雲宇光 https://space.bilibili.com/660675050
橙子言 https://space.bilibili.com/318486464
人衣大人 https://space.bilibili.com/2270344
玖蝶 https://space.bilibili.com/676771003
Yuuko
白夜零BYL https://space.bilibili.com/1605040503
嗷天 https://space.bilibili.com/5675252
洛泠羽 https://space.bilibili.com/347373318
灰条纹的灰猫君 https://space.bilibili.com/2083633
幽寂 https://space.bilibili.com/478860
恶魔王女 https://space.bilibili.com/2475098
AlexYHX 芮晴
绮萱 https://y.qq.com/n/ryqq/singer/003HjD6H4aZn1K
诗芸 https://y.qq.com/n/ryqq/singer/0005NInj142zm0
汐蕾 https://y.qq.com/n/ryqq/singer/0023cWMH1Bq1PJ
1262917464
炜阳
叶卡yolka
幸の夏 https://space.bilibili.com/1017297686
暮色未量 https://space.bilibili.com/272904686
晓寞sama https://space.bilibili.com/3463394
没头绪的节操君
串串BunC https://space.bilibili.com/95817834
落雨 https://space.bilibili.com/1292427
长尾巴的翎艾 https://space.bilibili.com/1638666
声闻计划 https://space.bilibili.com/392812269
唐家大小姐 http://5sing.kugou.com/palmusic/default.html
不伊子

Training machines are provided by:

花儿不哭 https://space.bilibili.com/5760446


TERMS OF REDISTRIBUTIONS:

1. Do not sell this vocoder, or charge any fees from redistributing it, as prohibited by
   the license.
2. Include a copy of the CC BY-NC-SA 4.0 license, or a link referring to it.
3. Include a copy of this notice, or any other notices informing that this vocoder is
   provided by the OpenVPI Team, that this vocoder is licensed under CC BY-NC-SA 4.0, and
   with a complete acknowledgement list as shown above.
4. If you fine-tuned or modified the weights, leave a notice about what has been changed.
5. (Optional) Leave a link to the official release page of the vocoder, and tell users
   that other versions and future updates of this vocoder can be obtained from the website.
checkpoints/nsf_hifigan/config.json
ADDED
@@ -0,0 +1,38 @@
{
    "resblock": "1",
    "num_gpus": 4,
    "batch_size": 10,
    "learning_rate": 0.0002,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,

    "upsample_rates": [8, 8, 2, 2, 2],
    "upsample_kernel_sizes": [16, 16, 4, 4, 4],
    "upsample_initial_channel": 512,
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "discriminator_periods": [3, 5, 7, 11, 17, 23, 37],

    "segment_size": 16384,
    "num_mels": 128,
    "num_freq": 1025,
    "n_fft": 2048,
    "hop_size": 512,
    "win_size": 2048,

    "sampling_rate": 44100,

    "fmin": 40,
    "fmax": 16000,
    "fmax_for_loss": null,

    "num_workers": 16,

    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "world_size": 1
    }
}
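For orientation, the vocoder's timing parameters above fix the mel frame rate used throughout the repo. A quick arithmetic check (not code from this commit):

```python
# hop_size / sampling_rate is the duration of one mel frame; this is the same
# scaling f0_static.static_f0_time applies when converting frame counts to seconds.
sampling_rate = 44100
hop_size = 512
print(hop_size / sampling_rate)   # ≈ 0.0116 s per frame
print(sampling_rate / hop_size)   # ≈ 86.1 frames per second
```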
checkpoints/nsf_hifigan/model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2c576b63b7ed952161b70fad34e0562ace502ce689195520d8a2a6c051de29d6
size 56825430
infer.py
ADDED
@@ -0,0 +1,81 @@
import io
from pathlib import Path

import numpy as np
import soundfile

from infer_tools import infer_tool
from infer_tools import slicer
from infer_tools.infer_tool import Svc
from utils.hparams import hparams


def run_clip(raw_audio_path, svc_model, key, acc, use_crepe, spk_id=0, auto_key=False, out_path=None, slice_db=-40,
             **kwargs):
    print('code version:2023-01-22')

    clean_name = Path(raw_audio_path).name.split(".")[0]
    infer_tool.format_wav(raw_audio_path)
    wav_path = Path(raw_audio_path).with_suffix('.wav')
    key = svc_model.evaluate_key(wav_path, key, auto_key)
    chunks = slicer.cut(wav_path, db_thresh=slice_db)
    audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)

    count = 0
    f0_tst, f0_pred, audio = [], [], []
    for (slice_tag, data) in audio_data:
        print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
        length = int(np.ceil(len(data) / audio_sr * hparams['audio_sample_rate']))
        raw_path = io.BytesIO()
        soundfile.write(raw_path, data, audio_sr, format="wav")
        raw_path.seek(0)
        if slice_tag:
            # silent segment: skip inference and emit zeros of matching length
            print('jump empty segment')
            _f0_tst, _f0_pred, _audio = (
                np.zeros(int(np.ceil(length / hparams['hop_size']))),
                np.zeros(int(np.ceil(length / hparams['hop_size']))),
                np.zeros(length))
        else:
            _f0_tst, _f0_pred, _audio = svc_model.infer(raw_path, spk_id=spk_id, key=key, acc=acc, use_crepe=use_crepe)
        fix_audio = np.zeros(length)
        fix_audio[:] = np.mean(_audio)
        fix_audio[:len(_audio)] = _audio[0 if len(_audio) < len(fix_audio) else len(_audio) - len(fix_audio):]
        f0_tst.extend(_f0_tst)
        f0_pred.extend(_f0_pred)
        audio.extend(list(fix_audio))
        count += 1
    if out_path is None:
        # note: project_name, step and accelerate are module-level globals set in __main__
        out_path = f'./results/{clean_name}_{key}key_{project_name}_{hparams["residual_channels"]}_{hparams["residual_layers"]}_{int(step / 1000)}k_{accelerate}x.{kwargs["format"]}'
    soundfile.write(out_path, audio, hparams["audio_sample_rate"], 'PCM_16', format=out_path.split('.')[-1])
    return np.array(f0_tst), np.array(f0_pred), audio


if __name__ == '__main__':
    # project folder name (the one used for training)
    project_name = "open-aqua"
    model_path = f'./checkpoints/{project_name}/model_ckpt_steps_90000.ckpt'
    config_path = f'./checkpoints/{project_name}/config.yaml'

    # multiple wav/ogg files are supported; place them in the raw folder, with file extensions
    file_names = ["横竖撇点折-main-2key.wav"]
    spk_id = "single"
    # adaptive key shifting (single-speaker models only)
    auto_key = False
    trans = [0]  # pitch shift in semitones (positive or negative); one entry per file above,
    # short lists are padded with the first value
    # acceleration (speedup) factor
    accelerate = 1
    hubert_gpu = True
    wav_format = 'wav'
    step = int(model_path.split("_")[-1].split(".")[0])

    # no changes needed below this line
    infer_tool.mkdir(["./raw", "./results"])
    infer_tool.fill_a_to_b(trans, file_names)

    model = Svc(project_name, config_path, hubert_gpu, model_path, onnx=False)
    for f_name, tran in zip(file_names, trans):
        if "." not in f_name:
            f_name += ".wav"
        audio_path = f"./raw/{f_name}"
        run_clip(raw_audio_path=audio_path, svc_model=model, key=tran, acc=accelerate, use_crepe=False,
                 spk_id=spk_id, auto_key=auto_key, project_name=project_name, format=wav_format)
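Because the default `out_path` in `run_clip` interpolates the module-level globals `project_name`, `step`, and `accelerate` set in `__main__`, calling it from another script is easiest with an explicit `out_path`. A hedged usage sketch (paths and checkpoint names are placeholders, not files guaranteed by this commit):

```python
# Sketch only: call run_clip from outside this script, bypassing the
# global-dependent default output naming by passing out_path explicitly.
from infer import run_clip
from infer_tools.infer_tool import Svc

model = Svc("open-aqua", "./checkpoints/open-aqua/config.yaml",
            hubert_gpu=True, model_path="./checkpoints/open-aqua/model_ckpt_steps_90000.ckpt")
f0_gt, f0_pred, audio = run_clip("./raw/input.wav", model, key=0, acc=1,
                                 use_crepe=False, out_path="./results/output.wav")
```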
infer_tools/__pycache__/f0_static.cpython-38.pyc
ADDED
Binary file (5.12 kB)

infer_tools/__pycache__/infer_tool.cpython-38.pyc
ADDED
Binary file (7.26 kB)

infer_tools/__pycache__/infer_tool_beta.cpython-38.pyc
ADDED
Binary file (7.8 kB)

infer_tools/__pycache__/slicer.cpython-38.pyc
ADDED
Binary file (3.84 kB)

infer_tools/__pycache__/trans_key.cpython-38.pyc
ADDED
Binary file (2 kB)
infer_tools/f0_static.py
ADDED
@@ -0,0 +1,116 @@
import json
import os
import shutil
from functools import reduce
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import yaml
from pylab import xticks, np
from tqdm import tqdm

from modules.vocoders.nsf_hifigan import NsfHifiGAN
from preprocessing.process_pipeline import get_pitch_parselmouth, get_pitch_crepe
from utils.hparams import set_hparams, hparams

head_list = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]


def compare_pitch(f0_static_dict, pitch_time_temp, trans_key=0):
    return sum({k: v * f0_static_dict[str(k + trans_key)] for k, v in pitch_time_temp.items() if
                str(k + trans_key) in f0_static_dict}.values())


def f0_to_pitch(ff):
    f0_pitch = 69 + 12 * np.log2(ff / 440)
    return round(f0_pitch, 0)


def pitch_to_name(pitch):
    return f"{head_list[int(pitch % 12)]}{int(pitch / 12) - 1}"


def get_f0(audio_path, crepe=False):
    wav, mel = NsfHifiGAN.wav2spec(audio_path)
    if crepe:
        f0, pitch_coarse = get_pitch_crepe(wav, mel, hparams)
    else:
        f0, pitch_coarse = get_pitch_parselmouth(wav, mel, hparams)
    return f0


def merge_f0_dict(dict_list):
    def sum_dict(a, b):
        temp = dict()
        for key in a.keys() | b.keys():
            temp[key] = sum([d.get(key, 0) for d in (a, b)])
        return temp

    return reduce(sum_dict, dict_list)


def collect_f0(f0):
    pitch_num = {}
    pitch_list = [f0_to_pitch(x) for x in f0[f0 > 0]]
    for key in pitch_list:
        pitch_num[key] = pitch_num.get(key, 0) + 1
    return pitch_num


def static_f0_time(f0):
    if isinstance(f0, dict):
        pitch_num = merge_f0_dict({k: collect_f0(v) for k, v in f0.items()}.values())
    else:
        pitch_num = collect_f0(f0)
    static_pitch_time = {}
    sort_key = sorted(pitch_num.keys())
    for key in sort_key:
        static_pitch_time[key] = round(pitch_num[key] * hparams['hop_size'] / hparams['audio_sample_rate'], 2)
    return static_pitch_time


def get_end_file(dir_path, end):
    file_lists = []
    for root, dirs, files in os.walk(dir_path):
        files = [f for f in files if f[0] != '.']
        dirs[:] = [d for d in dirs if d[0] != '.']
        for f_file in files:
            if f_file.endswith(end):
                file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
    return file_lists


if __name__ == "__main__":
    # add an f0_static vocal-range statistic to the config file
    config_path = "F:/sovits/diff-svc-main/checkpoints/aquapre/config.yaml"
    hparams = set_hparams(config=config_path, exp_name='', infer=True, reset=True, hparams_str='', print_hparams=False)
    f0_dict = {}
    # collect all wav files under the batch folder
    wav_paths = get_end_file("F:/sovits/diff-svc-main/batch/aquapre", "wav")
    # extract f0 with parselmouth
    with tqdm(total=len(wav_paths)) as p_bar:
        p_bar.set_description('Processing')
        for wav_path in wav_paths:
            f0_dict[wav_path] = get_f0(wav_path, crepe=False)
            p_bar.update(1)
    pitch_time = static_f0_time(f0_dict)
    total_time = round(sum(pitch_time.values()), 2)
    pitch_time["total_time"] = total_time
    print(f"total time: {total_time}s")
    shutil.copy(config_path, f"{Path(config_path).parent}\\back_{Path(config_path).name}")
    with open(config_path, encoding='utf-8') as f:
        _hparams = yaml.safe_load(f)
    _hparams['f0_static'] = json.dumps(pitch_time)
    with open(config_path, 'w', encoding='utf-8') as f:
        yaml.safe_dump(_hparams, f)
    print("a backup of the original config has been created in the same directory: back_config.yaml")
    print("vocal-range statistics saved to the config file; this model can now use automatic key adaptation")
    matplotlib.use('TkAgg')
    plt.title("Dataset vocal-range statistics", fontproperties='SimHei')
    plt.xlabel("Pitch", fontproperties='SimHei')
    plt.ylabel("Duration (s)", fontproperties='SimHei')
    xticks_labels = [pitch_to_name(i) for i in range(36, 96)]
    xticks(np.linspace(36, 96, 60, endpoint=True), xticks_labels)
    plt.plot(pitch_time.keys(), pitch_time.values(), color='dodgerblue')
    plt.show()
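`f0_to_pitch` above is the standard Hz-to-MIDI conversion, 69 + 12·log2(f/440), so A4 = 440 Hz lands exactly on note 69, and `pitch_to_name` then renders the note name. A small worked example:

```python
# 220 Hz -> 57.0 (A3), 440 Hz -> 69.0 (A4), 880 Hz -> 81.0 (A5)
import numpy as np

for hz in (220.0, 440.0, 880.0):
    print(hz, 69 + 12 * np.log2(hz / 440))

head_list = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
print(f"{head_list[69 % 12]}{69 // 12 - 1}")  # "A4", matching pitch_to_name(69)
```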
infer_tools/f0_temp.json
ADDED
The diff for this file is too large to render.
infer_tools/infer_tool.py
ADDED
@@ -0,0 +1,201 @@
import json
import os
import time
from io import BytesIO
from pathlib import Path

import librosa
import numpy as np
import soundfile
import torch

import utils
from infer_tools.f0_static import compare_pitch, static_f0_time
from modules.diff.diffusion import GaussianDiffusion
from modules.diff.net import DiffNet
from modules.vocoders.nsf_hifigan import NsfHifiGAN
from preprocessing.hubertinfer import HubertEncoder
from preprocessing.process_pipeline import File2Batch, get_pitch_parselmouth
from utils.hparams import hparams, set_hparams
from utils.pitch_utils import denorm_f0, norm_interp_f0


def timeit(func):
    def run(*args, **kwargs):
        t = time.time()
        res = func(*args, **kwargs)
        print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
        return res

    return run


def format_wav(audio_path):
    if Path(audio_path).suffix == '.wav':
        return
    raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)
    soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate)


def fill_a_to_b(a, b):
    if len(a) < len(b):
        for _ in range(0, len(b) - len(a)):
            a.append(a[0])


def get_end_file(dir_path, end):
    file_lists = []
    for root, dirs, files in os.walk(dir_path):
        files = [f for f in files if f[0] != '.']
        dirs[:] = [d for d in dirs if d[0] != '.']
        for f_file in files:
            if f_file.endswith(end):
                file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
    return file_lists


def mkdir(paths: list):
    for path in paths:
        if not os.path.exists(path):
            os.mkdir(path)


class Svc:
    def __init__(self, project_name, config_name, hubert_gpu, model_path, onnx=False):
        self.project_name = project_name
        self.DIFF_DECODERS = {
            'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']),
        }

        self.model_path = model_path
        self.dev = torch.device("cuda")

        self._ = set_hparams(config=config_name, exp_name=self.project_name, infer=True,
                             reset=True, hparams_str='', print_hparams=False)

        hparams['hubert_gpu'] = hubert_gpu
        self.hubert = HubertEncoder(hparams['hubert_path'], onnx=onnx)
        self.model = GaussianDiffusion(
            phone_encoder=self.hubert,
            out_dims=hparams['audio_num_mel_bins'],
            denoise_fn=self.DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'],
            K_step=hparams['K_step'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
        )
        utils.load_ckpt(self.model, self.model_path, 'model', force=True, strict=True)
        self.model.cuda()
        self.vocoder = NsfHifiGAN()

    def infer(self, in_path, key, acc, spk_id=0, use_crepe=True, singer=False):
        batch = self.pre(in_path, acc, spk_id, use_crepe)
        # f0 is kept on a log2 scale here, so adding key/12 shifts pitch by `key` semitones
        batch['f0'] = batch['f0'] + (key / 12)
        batch['f0'][batch['f0'] > np.log2(hparams['f0_max'])] = 0

        @timeit
        def diff_infer():
            spk_embed = batch.get('spk_embed') if not hparams['use_spk_id'] else batch.get('spk_ids')
            energy = batch.get('energy').cuda() if batch.get('energy') else None
            if spk_embed is None:
                spk_embed = torch.LongTensor([0])
            diff_outputs = self.model(
                hubert=batch['hubert'].cuda(), spk_embed_id=spk_embed.cuda(), mel2ph=batch['mel2ph'].cuda(),
                f0=batch['f0'].cuda(), energy=energy, ref_mels=batch["mels"].cuda(), infer=True)
            return diff_outputs

        outputs = diff_infer()
        batch['outputs'] = outputs['mel_out']
        batch['mel2ph_pred'] = outputs['mel2ph']
        batch['f0_gt'] = denorm_f0(batch['f0'], batch['uv'], hparams)
        batch['f0_pred'] = outputs.get('f0_denorm')
        return self.after_infer(batch, singer, in_path)

    @timeit
    def after_infer(self, prediction, singer, in_path):
        for k, v in prediction.items():
            if type(v) is torch.Tensor:
                prediction[k] = v.cpu().numpy()

        # remove paddings
        mel_gt = prediction["mels"]
        mel_gt_mask = np.abs(mel_gt).sum(-1) > 0

        mel_pred = prediction["outputs"]
        mel_pred_mask = np.abs(mel_pred).sum(-1) > 0
        mel_pred = mel_pred[mel_pred_mask]
        mel_pred = np.clip(mel_pred, hparams['mel_vmin'], hparams['mel_vmax'])

        f0_gt = prediction.get("f0_gt")
        f0_pred = prediction.get("f0_pred")
        if f0_pred is not None:
            f0_gt = f0_gt[mel_gt_mask]
            if len(f0_pred) > len(mel_pred_mask):
                f0_pred = f0_pred[:len(mel_pred_mask)]
            f0_pred = f0_pred[mel_pred_mask]
        torch.cuda.is_available() and torch.cuda.empty_cache()

        if singer:
            data_path = in_path.replace("batch", "singer_data")
            mel_path = data_path[:-4] + "_mel.npy"
            f0_path = data_path[:-4] + "_f0.npy"
            np.save(mel_path, mel_pred)
            np.save(f0_path, f0_pred)
        wav_pred = self.vocoder.spec2wav(mel_pred, f0=f0_pred)
        return f0_gt, f0_pred, wav_pred

    def pre(self, wav_fn, accelerate, spk_id=0, use_crepe=True):
        if isinstance(wav_fn, BytesIO):
            item_name = self.project_name
        else:
            song_info = wav_fn.split('/')
            item_name = song_info[-1].split('.')[-2]
        temp_dict = {'wav_fn': wav_fn, 'spk_id': spk_id, 'id': 0}

        temp_dict = File2Batch.temporary_dict2processed_input(item_name, temp_dict, self.hubert, infer=True,
                                                              use_crepe=use_crepe)
        hparams['pndm_speedup'] = accelerate
        batch = File2Batch.processed_input2batch([getitem(temp_dict)])
        return batch

    def evaluate_key(self, wav_path, key, auto_key):
        if "f0_static" in hparams.keys():
            f0_static = json.loads(hparams['f0_static'])
            wav, mel = self.vocoder.wav2spec(wav_path)
            input_f0 = get_pitch_parselmouth(wav, mel, hparams)[0]
            pitch_time_temp = static_f0_time(input_f0)
            eval_dict = {}
            for trans_key in range(-12, 12):
                eval_dict[trans_key] = compare_pitch(f0_static, pitch_time_temp, trans_key=trans_key)
            sort_key = sorted(eval_dict, key=eval_dict.get, reverse=True)[:5]
            print(f"recommended key shifts: {sort_key}")
            if auto_key:
                print(f"auto key is enabled; your input key is overridden with {sort_key[0]} (controlled by the auto_key parameter)")
                return sort_key[0]
        else:
            print("config is missing f0_static, so automatic key adaptation is unavailable; it can be added via infer_tools/f0_static")
        return key


def getitem(item):
    max_frames = hparams['max_frames']
    spec = torch.Tensor(item['mel'])[:max_frames]
    mel2ph = torch.LongTensor(item['mel2ph'])[:max_frames] if 'mel2ph' in item else None
    f0, uv = norm_interp_f0(item["f0"][:max_frames], hparams)
    hubert = torch.Tensor(item['hubert'][:hparams['max_input_tokens']])
    pitch = torch.LongTensor(item.get("pitch"))[:max_frames]
    sample = {
        "id": item['id'],
        "spk_id": item['spk_id'],
        "item_name": item['item_name'],
        "hubert": hubert,
        "mel": spec,
        "pitch": pitch,
        "f0": f0,
        "uv": uv,
        "mel2ph": mel2ph,
        "mel_nonpadding": spec.abs().sum(-1) > 0,
    }
    if hparams['use_energy_embed']:
        sample['energy'] = item['energy']
    return sample
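The `batch['f0'] = batch['f0'] + (key / 12)` line in `Svc.infer` works because `f0` is stored on a log2 scale (it is clipped against `np.log2(hparams['f0_max'])` on the very next line), so adding k/12 in that domain multiplies frequency by 2^(k/12), i.e. a shift of k semitones. A quick numeric check:

```python
# Sketch only: the semitone shift performed in the log2-f0 domain.
import numpy as np

f0_hz = 440.0
key = 2                                  # transpose up two semitones
print(2 ** (np.log2(f0_hz) + key / 12))  # ≈ 493.88 Hz (B4)
```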
infer_tools/infer_tool_beta.py
ADDED
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import time
|
4 |
+
from io import BytesIO
|
5 |
+
from pathlib import Path
|
6 |
+
|
7 |
+
import librosa
|
8 |
+
import numpy as np
|
9 |
+
import soundfile
|
10 |
+
import torch
|
11 |
+
|
12 |
+
import utils
|
13 |
+
from infer_tools.f0_static import compare_pitch, static_f0_time
|
14 |
+
from modules.diff.diffusion import GaussianDiffusion
|
15 |
+
from modules.diff.net import DiffNet
|
16 |
+
from modules.vocoders.nsf_hifigan import NsfHifiGAN
|
17 |
+
from preprocessing.hubertinfer import HubertEncoder
|
18 |
+
from preprocessing.process_pipeline import File2Batch, get_pitch_parselmouth
|
19 |
+
from utils.hparams import hparams, set_hparams
|
20 |
+
from utils.pitch_utils import denorm_f0, norm_interp_f0
|
21 |
+
|
22 |
+
|
23 |
+
def timeit(func):
|
24 |
+
def run(*args, **kwargs):
|
25 |
+
t = time.time()
|
26 |
+
res = func(*args, **kwargs)
|
27 |
+
print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
|
28 |
+
return res
|
29 |
+
|
30 |
+
return run
|
31 |
+
|
32 |
+
|
33 |
+
def format_wav(audio_path):
|
34 |
+
if Path(audio_path).suffix == '.wav':
|
35 |
+
return
|
36 |
+
raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)
|
37 |
+
soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate)
|
38 |
+
|
39 |
+
|
40 |
+
def fill_a_to_b(a, b):
|
41 |
+
if len(a) < len(b):
|
42 |
+
for _ in range(0, len(b) - len(a)):
|
43 |
+
a.append(a[0])
|
44 |
+
|
45 |
+
|
46 |
+
def get_end_file(dir_path, end):
|
47 |
+
file_lists = []
|
48 |
+
for root, dirs, files in os.walk(dir_path):
|
49 |
+
files = [f for f in files if f[0] != '.']
|
50 |
+
dirs[:] = [d for d in dirs if d[0] != '.']
|
51 |
+
for f_file in files:
|
52 |
+
if f_file.endswith(end):
|
53 |
+
file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
|
54 |
+
return file_lists
|
55 |
+
|
56 |
+
|
57 |
+
def mkdir(paths: list):
|
58 |
+
for path in paths:
|
59 |
+
if not os.path.exists(path):
|
60 |
+
os.mkdir(path)
|
61 |
+
|
62 |
+
|
63 |
+
class Svcb:
|
64 |
+
def __init__(self, project_name, config_name, hubert_gpu, model_path, onnx=False):
|
65 |
+
self.project_name = project_name
|
66 |
+
self.DIFF_DECODERS = {
|
67 |
+
'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']),
|
68 |
+
}
|
69 |
+
|
70 |
+
self.model_path = model_path
|
71 |
+
self.dev = torch.device("cuda")
|
72 |
+
|
73 |
+
self._ = set_hparams(config=config_name, exp_name=self.project_name, infer=True,
|
74 |
+
reset=True, hparams_str='', print_hparams=False)
|
75 |
+
|
76 |
+
self.mel_bins = hparams['audio_num_mel_bins']
|
77 |
+
hparams['hubert_gpu'] = hubert_gpu
|
78 |
+
self.hubert = HubertEncoder(hparams['hubert_path'], onnx=onnx)
|
79 |
+
self.model = GaussianDiffusion(
|
80 |
+
phone_encoder=self.hubert,
|
81 |
+
out_dims=self.mel_bins, denoise_fn=self.DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
|
82 |
+
timesteps=hparams['timesteps'],
|
83 |
+
K_step=hparams['K_step'],
|
84 |
+
loss_type=hparams['diff_loss_type'],
|
85 |
+
            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
        )
        utils.load_ckpt(self.model, self.model_path, 'model', force=True, strict=True)
        self.model.cuda()
        self.vocoder = NsfHifiGAN()

    # def process_batch_f0(batch_f0, hparams):
    #     pitch_num = collect_f0(batch_f0)
    #     pitch_time = {}
    #     sort_key = sorted(pitch_num.keys())
    #     for key in sort_key:
    #         pitch_time[key] = round(pitch_num[key] * hparams['hop_size'] / hparams['audio_sample_rate'], 2)
    #     return pitch_time

    def infer_autokey(self, in_path, key, acc, spk_id=0, use_crepe=False):
        batch, temp_dict = self.pre(in_path, acc, spk_id, use_crepe)
        input_f0 = temp_dict['f0']
        if "f0_static" in hparams.keys():
            f0_static = json.loads(hparams['f0_static'])
            pitch_time_temp = static_f0_time(input_f0)
            eval_dict = {}
            for trans_key in range(-12, 12):
                eval_dict[trans_key] = compare_pitch(f0_static, pitch_time_temp, trans_key=trans_key)
            sort_key = sorted(eval_dict, key=eval_dict.get, reverse=True)[:5]
            print(f"Recommended key shifts: {sort_key}")
            print(f"Auto key-shift is enabled; your input key is overridden with {sort_key[0]} (controlled by the auto_key parameter)")
            if sort_key[0] > 6:
                key = sort_key[0] + 6
            else:
                key = sort_key[0]
        return key, in_path, batch

    # def infer(self, in_path, key, acc, spk_id=0, use_crepe=True, singer=False):
    #     batch = self.pre(in_path, acc, spk_id, use_crepe)

    def infer(self, in_path, key, batch, singer=False):
        batch['f0'] = batch['f0'] + (key / 12)
        batch['f0'][batch['f0'] > np.log2(hparams['f0_max'])] = 0

        @timeit
        def diff_infer():
            spk_embed = batch.get('spk_embed') if not hparams['use_spk_id'] else batch.get('spk_ids')
            energy = batch.get('energy').cuda() if batch.get('energy') else None
            if spk_embed is None:
                spk_embed = torch.LongTensor([0])
            diff_outputs = self.model(
                hubert=batch['hubert'].cuda(), spk_embed_id=spk_embed.cuda(), mel2ph=batch['mel2ph'].cuda(),
                f0=batch['f0'].cuda(), energy=energy, ref_mels=batch["mels"].cuda(), infer=True)
            return diff_outputs

        outputs = diff_infer()
        batch['outputs'] = outputs['mel_out']
        batch['mel2ph_pred'] = outputs['mel2ph']
        batch['f0_gt'] = denorm_f0(batch['f0'], batch['uv'], hparams)
        batch['f0_pred'] = outputs.get('f0_denorm')
        return self.after_infer(batch, singer, in_path)

    @timeit
    def after_infer(self, prediction, singer, in_path):
        for k, v in prediction.items():
            if type(v) is torch.Tensor:
                prediction[k] = v.cpu().numpy()

        # remove paddings
        mel_gt = prediction["mels"]
        mel_gt_mask = np.abs(mel_gt).sum(-1) > 0

        mel_pred = prediction["outputs"]
        mel_pred_mask = np.abs(mel_pred).sum(-1) > 0
        mel_pred = mel_pred[mel_pred_mask]
        mel_pred = np.clip(mel_pred, hparams['mel_vmin'], hparams['mel_vmax'])

        f0_gt = prediction.get("f0_gt")
        f0_pred = prediction.get("f0_pred")
        if f0_pred is not None:
            f0_gt = f0_gt[mel_gt_mask]
            if len(f0_pred) > len(mel_pred_mask):
                f0_pred = f0_pred[:len(mel_pred_mask)]
            f0_pred = f0_pred[mel_pred_mask]
        torch.cuda.is_available() and torch.cuda.empty_cache()

        if singer:
            data_path = in_path.replace("batch", "singer_data")
            mel_path = data_path[:-4] + "_mel.npy"
            f0_path = data_path[:-4] + "_f0.npy"
            np.save(mel_path, mel_pred)
            np.save(f0_path, f0_pred)
        wav_pred = self.vocoder.spec2wav(mel_pred, f0=f0_pred)
        return f0_gt, f0_pred, wav_pred

    def pre(self, wav_fn, accelerate, spk_id=0, use_crepe=True):
        if isinstance(wav_fn, BytesIO):
            item_name = self.project_name
        else:
            song_info = wav_fn.split('/')
            item_name = song_info[-1].split('.')[-2]
        temp_dict = {'wav_fn': wav_fn, 'spk_id': spk_id, 'id': 0}

        temp_dict = File2Batch.temporary_dict2processed_input(item_name, temp_dict, self.hubert, infer=True,
                                                              use_crepe=use_crepe)
        hparams['pndm_speedup'] = accelerate
        batch = File2Batch.processed_input2batch([getitem(temp_dict)])
        return batch, temp_dict

    def evaluate_key(self, wav_path, key, auto_key):
        if "f0_static" in hparams.keys():
            f0_static = json.loads(hparams['f0_static'])
            wav, mel = self.vocoder.wav2spec(wav_path)
            input_f0 = get_pitch_parselmouth(wav, mel, hparams)[0]
            pitch_time_temp = static_f0_time(input_f0)
            eval_dict = {}
            for trans_key in range(-12, 12):
                eval_dict[trans_key] = compare_pitch(f0_static, pitch_time_temp, trans_key=trans_key)
            sort_key = sorted(eval_dict, key=eval_dict.get, reverse=True)[:5]
            print(f"Recommended key shifts: {sort_key}")
            if auto_key:
                print(f"Auto key-shift is enabled; your input key is overridden with {sort_key[0]} (controlled by the auto_key parameter)")
                return sort_key[0]
        else:
            print("The config lacks f0_static, so auto key-shift cannot be used; it can be added via infer_tools/data_static")
        return key


def getitem(item):
    max_frames = hparams['max_frames']
    spec = torch.Tensor(item['mel'])[:max_frames]
    mel2ph = torch.LongTensor(item['mel2ph'])[:max_frames] if 'mel2ph' in item else None
    f0, uv = norm_interp_f0(item["f0"][:max_frames], hparams)
    hubert = torch.Tensor(item['hubert'][:hparams['max_input_tokens']])
    pitch = torch.LongTensor(item.get("pitch"))[:max_frames]
    sample = {
        "id": item['id'],
        "spk_id": item['spk_id'],
        "item_name": item['item_name'],
        "hubert": hubert,
        "mel": spec,
        "pitch": pitch,
        "f0": f0,
        "uv": uv,
        "mel2ph": mel2ph,
        "mel_nonpadding": spec.abs().sum(-1) > 0,
    }
    if hparams['use_energy_embed']:
        sample['energy'] = item['energy']
    return sample
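A note on the transposition step in infer() above (added for clarity): batch['f0'] is kept on a log2 scale, so adding key / 12 shifts the pitch by key semitones, i.e. f0_new = 2 ** (key / 12) * f0; key = 12 doubles the frequency, one octave up. Frames whose shifted value exceeds log2(hparams['f0_max']) are reset to 0.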
infer_tools/slicer.py
ADDED
@@ -0,0 +1,142 @@
import librosa
import torch
import torchaudio


class Slicer:
    def __init__(self,
                 sr: int,
                 threshold: float = -40.,
                 min_length: int = 5000,
                 min_interval: int = 300,
                 hop_size: int = 20,
                 max_sil_kept: int = 5000):
        if not min_length >= min_interval >= hop_size:
            raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
        if not max_sil_kept >= hop_size:
            raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
        min_interval = sr * min_interval / 1000
        self.threshold = 10 ** (threshold / 20.)
        self.hop_size = round(sr * hop_size / 1000)
        self.win_size = min(round(min_interval), 4 * self.hop_size)
        self.min_length = round(sr * min_length / 1000 / self.hop_size)
        self.min_interval = round(min_interval / self.hop_size)
        self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)

    def _apply_slice(self, waveform, begin, end):
        if len(waveform.shape) > 1:
            return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
        else:
            return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]

    # @timeit
    def slice(self, waveform):
        if len(waveform.shape) > 1:
            samples = librosa.to_mono(waveform)
        else:
            samples = waveform
        if samples.shape[0] <= self.min_length:
            return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
        rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
        sil_tags = []
        silence_start = None
        clip_start = 0
        for i, rms in enumerate(rms_list):
            # Keep looping while frame is silent.
            if rms < self.threshold:
                # Record start of silent frames.
                if silence_start is None:
                    silence_start = i
                continue
            # Keep looping while frame is not silent and silence start has not been recorded.
            if silence_start is None:
                continue
            # Clear recorded silence start if interval is not enough or clip is too short
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
            need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue
            # Need slicing. Record the range of silent frames to be removed.
            if i - silence_start <= self.max_sil_kept:
                pos = rms_list[silence_start: i + 1].argmin() + silence_start
                if silence_start == 0:
                    sil_tags.append((0, pos))
                else:
                    sil_tags.append((pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
                pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
                pos += i - self.max_sil_kept
                pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r
                else:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
                pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                else:
                    sil_tags.append((pos_l, pos_r))
                clip_start = pos_r
            silence_start = None
        # Deal with trailing silence.
        total_frames = rms_list.shape[0]
        if silence_start is not None and total_frames - silence_start >= self.min_interval:
            silence_end = min(total_frames, silence_start + self.max_sil_kept)
            pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
            sil_tags.append((pos, total_frames + 1))
        # Apply and return slices.
        if len(sil_tags) == 0:
            return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
        else:
            chunks = []
            # The first silence does not start at the very beginning; add the leading voiced segment
            if sil_tags[0][0]:
                chunks.append(
                    {"slice": False, "split_time": f"0,{min(waveform.shape[0], sil_tags[0][0] * self.hop_size)}"})
            for i in range(0, len(sil_tags)):
                # Mark voiced segments (skip the first one)
                if i:
                    chunks.append({"slice": False,
                                   "split_time": f"{sil_tags[i - 1][1] * self.hop_size},{min(waveform.shape[0], sil_tags[i][0] * self.hop_size)}"})
                # Mark all silent segments
                chunks.append({"slice": True,
                               "split_time": f"{sil_tags[i][0] * self.hop_size},{min(waveform.shape[0], sil_tags[i][1] * self.hop_size)}"})
            # The last silence does not reach the end; add the trailing segment
            if sil_tags[-1][1] * self.hop_size < len(waveform):
                chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1] * self.hop_size},{len(waveform)}"})
            chunk_dict = {}
            for i in range(len(chunks)):
                chunk_dict[str(i)] = chunks[i]
            return chunk_dict


def cut(audio_path, db_thresh=-30, min_len=5000):
    audio, sr = librosa.load(audio_path, sr=None)
    slicer = Slicer(
        sr=sr,
        threshold=db_thresh,
        min_length=min_len
    )
    chunks = slicer.slice(audio)
    return chunks


def chunks2audio(audio_path, chunks):
    chunks = dict(chunks)
    audio, sr = torchaudio.load(audio_path)
    if len(audio.shape) == 2 and audio.shape[1] >= 2:
        audio = torch.mean(audio, dim=0).unsqueeze(0)
    audio = audio.cpu().numpy()[0]
    result = []
    for k, v in chunks.items():
        tag = v["split_time"].split(",")
        if tag[0] != tag[1]:
            result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
    return result, sr
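A minimal usage sketch for the cut/chunks2audio helpers above; the file name input.wav is illustrative, not part of the original code:

# Slice a recording on silence, then walk the resulting segments.
from infer_tools.slicer import cut, chunks2audio

chunks = cut("input.wav", db_thresh=-30, min_len=5000)  # {"0": {"slice": ..., "split_time": "start,end"}, ...}
segments, sr = chunks2audio("input.wav", chunks)        # [(is_silence, samples), ...]
for is_silence, samples in segments:
    print("silence" if is_silence else "voiced", round(len(samples) / sr, 2), "s")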
infer_tools/trans_key.py
ADDED
@@ -0,0 +1,67 @@
import os

head_list = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]


def trans_f0_seq(feature_pit, transform):
    feature_pit = feature_pit * 2 ** (transform / 12)
    return round(feature_pit, 1)


def move_key(raw_data, mv_key):
    head = raw_data[:-1]
    body = int(raw_data[-1])
    new_head_index = head_list.index(head) + mv_key
    while new_head_index < 0:
        body -= 1
        new_head_index += 12
    while new_head_index > 11:
        body += 1
        new_head_index -= 12
    result_data = head_list[new_head_index] + str(body)
    return result_data


def trans_key(raw_data, key):
    for i in raw_data:
        note_seq_list = i["note_seq"].split(" ")
        new_note_seq_list = []
        for note_seq in note_seq_list:
            if note_seq != "rest":
                new_note_seq = move_key(note_seq, key)
                new_note_seq_list.append(new_note_seq)
            else:
                new_note_seq_list.append(note_seq)
        i["note_seq"] = " ".join(new_note_seq_list)

        f0_seq_list = i["f0_seq"].split(" ")
        f0_seq_list = [float(x) for x in f0_seq_list]
        new_f0_seq_list = []
        for f0_seq in f0_seq_list:
            new_f0_seq = trans_f0_seq(f0_seq, key)
            new_f0_seq_list.append(str(new_f0_seq))
        i["f0_seq"] = " ".join(new_f0_seq_list)
    return raw_data


def trans_opencpop(raw_txt, res_txt, key):
    if os.path.exists(raw_txt):
        f_w = open(res_txt, "w", encoding='utf-8')
        with open(raw_txt, "r", encoding='utf-8') as f:
            raw_data = f.readlines()
            for raw in raw_data:
                raw_list = raw.split("|")
                new_note_seq_list = []
                for note_seq in raw_list[3].split(" "):
                    if note_seq != "rest":
                        note_seq = note_seq.split("/")[0] if "/" in note_seq else note_seq
                        new_note_seq = move_key(note_seq, key)
                        new_note_seq_list.append(new_note_seq)
                    else:
                        new_note_seq_list.append(note_seq)
                raw_list[3] = " ".join(new_note_seq_list)
                f_w.write("|".join(raw_list))
        f_w.close()
        print("opencpop annotation file converted")
    else:
        print("opencpop annotation file not found; please check the path")
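A worked example of the helpers above; the values are computed from the code, not taken from the original document:

# move_key shifts a note name by mv_key semitones, carrying the octave digit
# (note: only a single trailing octave digit is parsed via raw_data[-1]).
move_key("A4", 4)       # index("A") = 9, 9 + 4 = 13 > 11 -> octave +1, index 1 -> "C#5"
# trans_f0_seq scales a fundamental frequency by 2 ** (key / 12):
trans_f0_seq(440.0, 4)  # 440 * 2 ** (4 / 12) -> 554.4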
modules/__pycache__/encoder.cpython-310.pyc
ADDED
Binary file (7.19 kB).
modules/__pycache__/encoder.cpython-38.pyc
ADDED
Binary file (7.17 kB).
modules/commons/__pycache__/common_layers.cpython-310.pyc
ADDED
Binary file (18.6 kB).
modules/commons/__pycache__/common_layers.cpython-38.pyc
ADDED
Binary file (18.9 kB).
modules/commons/__pycache__/ssim.cpython-310.pyc
ADDED
Binary file (2.67 kB).
modules/commons/__pycache__/ssim.cpython-38.pyc
ADDED
Binary file (2.68 kB).
modules/commons/common_layers.py
ADDED
@@ -0,0 +1,675 @@
import math

import torch
import torch.nn.functional as F
import torch.onnx.operators
from torch import nn
from torch.nn import Parameter

import utils


class Reshape(nn.Module):
    def __init__(self, *args):
        super(Reshape, self).__init__()
        self.shape = args

    def forward(self, x):
        return x.view(self.shape)


class Permute(nn.Module):
    def __init__(self, *args):
        super(Permute, self).__init__()
        self.args = args

    def forward(self, x):
        return x.permute(self.args)


class LinearNorm(torch.nn.Module):
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(LinearNorm, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        torch.nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)


class ConvNorm(torch.nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 padding=None, dilation=1, bias=True, w_init_gain='linear'):
        super(ConvNorm, self).__init__()
        if padding is None:
            assert (kernel_size % 2 == 1)
            padding = int(dilation * (kernel_size - 1) / 2)

        self.conv = torch.nn.Conv1d(in_channels, out_channels,
                                    kernel_size=kernel_size, stride=stride,
                                    padding=padding, dilation=dilation,
                                    bias=bias)

        torch.nn.init.xavier_uniform_(
            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, signal):
        conv_signal = self.conv(signal)
        return conv_signal


def Embedding(num_embeddings, embedding_dim, padding_idx=None):
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
    if padding_idx is not None:
        nn.init.constant_(m.weight[padding_idx], 0)
    return m


def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):
    if not export and torch.cuda.is_available():
        try:
            from apex.normalization import FusedLayerNorm
            return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
        except ImportError:
            pass
    return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)


def Linear(in_features, out_features, bias=True):
    m = nn.Linear(in_features, out_features, bias)
    nn.init.xavier_uniform_(m.weight)
    if bias:
        nn.init.constant_(m.bias, 0.)
    return m


class SinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length.

    Padding symbols are ignored.
    """

    def __init__(self, embedding_dim, padding_idx, init_size=1024):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.weights = SinusoidalPositionalEmbedding.get_embedding(
            init_size,
            embedding_dim,
            padding_idx,
        )
        self.register_buffer('_float_tensor', torch.FloatTensor(1))

    @staticmethod
    def get_embedding(num_embeddings, embedding_dim, padding_idx=None):
        """Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly
        from the description in Section 3.5 of "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0
        return emb

    def forward(self, input, incremental_state=None, timestep=None, positions=None, **kwargs):
        """Input is expected to be of size [bsz x seqlen]."""
        bsz, seq_len = input.shape[:2]
        max_pos = self.padding_idx + 1 + seq_len
        if self.weights is None or max_pos > self.weights.size(0):
            # recompute/expand embeddings if needed
            self.weights = SinusoidalPositionalEmbedding.get_embedding(
                max_pos,
                self.embedding_dim,
                self.padding_idx,
            )
        self.weights = self.weights.to(self._float_tensor)

        if incremental_state is not None:
            # positions is the same for every token when decoding a single step
            pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len
            return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1)

        positions = utils.make_positions(input, self.padding_idx) if positions is None else positions
        return self.weights.index_select(0, positions.view(-1)).view(bsz, seq_len, -1).detach()

    def max_positions(self):
        """Maximum number of supported positions."""
        return int(1e5)  # an arbitrary large number
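# Clarifying note (added; not part of the original file): get_embedding() above
# stores sin(pos * w_i) in columns [0, half_dim) and cos(pos * w_i) in columns
# [half_dim, 2 * half_dim), with w_i = 10000 ** (-i / (half_dim - 1)) -- the
# tensor2tensor block layout rather than the interleaved layout described in
# "Attention Is All You Need".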
class ConvTBC(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, padding=0):
        super(ConvTBC, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.padding = padding

        self.weight = torch.nn.Parameter(torch.Tensor(
            self.kernel_size, in_channels, out_channels))
        self.bias = torch.nn.Parameter(torch.Tensor(out_channels))

    def forward(self, input):
        return torch.conv_tbc(input.contiguous(), self.weight, self.bias, self.padding)


class MultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0., bias=True,
                 add_bias_kv=False, add_zero_attn=False, self_attention=False,
                 encoder_decoder_attention=False):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim ** -0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, 'Self-attention requires query, key and ' \
                                                             'value to be of the same size'

        if self.qkv_same_dim:
            self.in_proj_weight = Parameter(torch.Tensor(3 * embed_dim, embed_dim))
        else:
            self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
            self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
            self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))

        if bias:
            self.in_proj_bias = Parameter(torch.Tensor(3 * embed_dim))
        else:
            self.register_parameter('in_proj_bias', None)

        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.reset_parameters()

        self.enable_torch_version = False
        if hasattr(F, "multi_head_attention_forward"):
            self.enable_torch_version = True
        else:
            self.enable_torch_version = False
        self.last_attn_probs = None

    def reset_parameters(self):
        if self.qkv_same_dim:
            nn.init.xavier_uniform_(self.in_proj_weight)
        else:
            nn.init.xavier_uniform_(self.k_proj_weight)
            nn.init.xavier_uniform_(self.v_proj_weight)
            nn.init.xavier_uniform_(self.q_proj_weight)

        nn.init.xavier_uniform_(self.out_proj.weight)
        if self.in_proj_bias is not None:
            nn.init.constant_(self.in_proj_bias, 0.)
            nn.init.constant_(self.out_proj.bias, 0.)
        if self.bias_k is not None:
            nn.init.xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            nn.init.xavier_normal_(self.bias_v)

    def forward(
            self,
            query, key, value,
            key_padding_mask=None,
            incremental_state=None,
            need_weights=True,
            static_kv=False,
            attn_mask=None,
            before_softmax=False,
            need_head_weights=False,
            enc_dec_attn_constraint_mask=None,
            reset_attn_weight=None
    ):
        """Input shape: Time x Batch x Channel

        Args:
            key_padding_mask (ByteTensor, optional): mask to exclude
                keys that are pads, of shape `(batch, src_len)`, where
                padding elements are indicated by 1s.
            need_weights (bool, optional): return the attention weights,
                averaged over heads (default: False).
            attn_mask (ByteTensor, optional): typically used to
                implement causal attention, where the mask prevents the
                attention from looking forward in time (default: None).
            before_softmax (bool, optional): return the raw attention
                weights and values before the attention softmax.
            need_head_weights (bool, optional): return the attention
                weights for each head. Implies *need_weights*. Default:
                return the average attention weights over all heads.
        """
        if need_head_weights:
            need_weights = True

        tgt_len, bsz, embed_dim = query.size()
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]

        if self.enable_torch_version and incremental_state is None and not static_kv and reset_attn_weight is None:
            if self.qkv_same_dim:
                return F.multi_head_attention_forward(query, key, value,
                                                      self.embed_dim, self.num_heads,
                                                      self.in_proj_weight,
                                                      self.in_proj_bias, self.bias_k, self.bias_v,
                                                      self.add_zero_attn, self.dropout,
                                                      self.out_proj.weight, self.out_proj.bias,
                                                      self.training, key_padding_mask, need_weights,
                                                      attn_mask)
            else:
                return F.multi_head_attention_forward(query, key, value,
                                                      self.embed_dim, self.num_heads,
                                                      torch.empty([0]),
                                                      self.in_proj_bias, self.bias_k, self.bias_v,
                                                      self.add_zero_attn, self.dropout,
                                                      self.out_proj.weight, self.out_proj.bias,
                                                      self.training, key_padding_mask, need_weights,
                                                      attn_mask, use_separate_proj_weight=True,
                                                      q_proj_weight=self.q_proj_weight,
                                                      k_proj_weight=self.k_proj_weight,
                                                      v_proj_weight=self.v_proj_weight)

        if incremental_state is not None:
            print('Not implemented error.')
            exit()
        else:
            saved_state = None

        if self.self_attention:
            # self-attention
            q, k, v = self.in_proj_qkv(query)
        elif self.encoder_decoder_attention:
            # encoder-decoder attention
            q = self.in_proj_q(query)
            if key is None:
                assert value is None
                k = v = None
            else:
                k = self.in_proj_k(key)
                v = self.in_proj_v(key)

        else:
            q = self.in_proj_q(query)
            k = self.in_proj_k(key)
            v = self.in_proj_v(value)
        q *= self.scaling

        if self.bias_k is not None:
            assert self.bias_v is not None
            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
            if key_padding_mask is not None:
                key_padding_mask = torch.cat(
                    [key_padding_mask, key_padding_mask.new_zeros(key_padding_mask.size(0), 1)], dim=1)

        q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
        if k is not None:
            k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
        if v is not None:
            v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)

        if saved_state is not None:
            print('Not implemented error.')
            exit()

        src_len = k.size(1)

        # This is part of a workaround to get around fork/join parallelism
        # not supporting Optional types.
        if key_padding_mask is not None and key_padding_mask.shape == torch.Size([]):
            key_padding_mask = None

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        if self.add_zero_attn:
            src_len += 1
            k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
            v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
            if attn_mask is not None:
                attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
            if key_padding_mask is not None:
                key_padding_mask = torch.cat(
                    [key_padding_mask, torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask)], dim=1)

        attn_weights = torch.bmm(q, k.transpose(1, 2))
        attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)

        assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

        if attn_mask is not None:
            if len(attn_mask.shape) == 2:
                attn_mask = attn_mask.unsqueeze(0)
            elif len(attn_mask.shape) == 3:
                attn_mask = attn_mask[:, None].repeat([1, self.num_heads, 1, 1]).reshape(
                    bsz * self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights + attn_mask

        if enc_dec_attn_constraint_mask is not None:  # bs x head x L_kv
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.masked_fill(
                enc_dec_attn_constraint_mask.unsqueeze(2).bool(),
                -1e9,
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if key_padding_mask is not None:
            # don't attend to padding symbols
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                -1e9,
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_logits = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)

        if before_softmax:
            return attn_weights, v

        attn_weights_float = utils.softmax(attn_weights, dim=-1)
        attn_weights = attn_weights_float.type_as(attn_weights)
        attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training)

        if reset_attn_weight is not None:
            if reset_attn_weight:
                self.last_attn_probs = attn_probs.detach()
            else:
                assert self.last_attn_probs is not None
                attn_probs = self.last_attn_probs
        attn = torch.bmm(attn_probs, v)
        assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
        attn = self.out_proj(attn)

        if need_weights:
            attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0)
            if not need_head_weights:
                # average attention weights over heads
                attn_weights = attn_weights.mean(dim=0)
        else:
            attn_weights = None

        return attn, (attn_weights, attn_logits)

    def in_proj_qkv(self, query):
        return self._in_proj(query).chunk(3, dim=-1)

    def in_proj_q(self, query):
        if self.qkv_same_dim:
            return self._in_proj(query, end=self.embed_dim)
        else:
            bias = self.in_proj_bias
            if bias is not None:
                bias = bias[:self.embed_dim]
            return F.linear(query, self.q_proj_weight, bias)

    def in_proj_k(self, key):
        if self.qkv_same_dim:
            return self._in_proj(key, start=self.embed_dim, end=2 * self.embed_dim)
        else:
            weight = self.k_proj_weight
            bias = self.in_proj_bias
            if bias is not None:
                bias = bias[self.embed_dim:2 * self.embed_dim]
            return F.linear(key, weight, bias)

    def in_proj_v(self, value):
        if self.qkv_same_dim:
            return self._in_proj(value, start=2 * self.embed_dim)
        else:
            weight = self.v_proj_weight
            bias = self.in_proj_bias
            if bias is not None:
                bias = bias[2 * self.embed_dim:]
            return F.linear(value, weight, bias)

    def _in_proj(self, input, start=0, end=None):
        weight = self.in_proj_weight
        bias = self.in_proj_bias
        weight = weight[start:end, :]
        if bias is not None:
            bias = bias[start:end]
        return F.linear(input, weight, bias)

    def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz):
        return attn_weights


class Swish(torch.autograd.Function):
    @staticmethod
    def forward(ctx, i):
        result = i * torch.sigmoid(i)
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        i = ctx.saved_variables[0]
        sigmoid_i = torch.sigmoid(i)
        return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))


class CustomSwish(nn.Module):
    def forward(self, input_tensor):
        return Swish.apply(input_tensor)


class Mish(nn.Module):
    def forward(self, x):
        return x * torch.tanh(F.softplus(x))


class TransformerFFNLayer(nn.Module):
    def __init__(self, hidden_size, filter_size, padding="SAME", kernel_size=1, dropout=0., act='gelu'):
        super().__init__()
        self.kernel_size = kernel_size
        self.dropout = dropout
        self.act = act
        if padding == 'SAME':
            self.ffn_1 = nn.Conv1d(hidden_size, filter_size, kernel_size, padding=kernel_size // 2)
        elif padding == 'LEFT':
            self.ffn_1 = nn.Sequential(
                nn.ConstantPad1d((kernel_size - 1, 0), 0.0),
                nn.Conv1d(hidden_size, filter_size, kernel_size)
            )
        self.ffn_2 = Linear(filter_size, hidden_size)
        if self.act == 'swish':
            self.swish_fn = CustomSwish()

    def forward(self, x, incremental_state=None):
        # x: T x B x C
        if incremental_state is not None:
            assert incremental_state is None, 'Nar-generation does not allow this.'
            exit(1)

        x = self.ffn_1(x.permute(1, 2, 0)).permute(2, 0, 1)
        x = x * self.kernel_size ** -0.5

        if incremental_state is not None:
            x = x[-1:]
        if self.act == 'gelu':
            x = F.gelu(x)
        if self.act == 'relu':
            x = F.relu(x)
        if self.act == 'swish':
            x = self.swish_fn(x)
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.ffn_2(x)
        return x


class BatchNorm1dTBC(nn.Module):
    def __init__(self, c):
        super(BatchNorm1dTBC, self).__init__()
        self.bn = nn.BatchNorm1d(c)

    def forward(self, x):
        """

        :param x: [T, B, C]
        :return: [T, B, C]
        """
        x = x.permute(1, 2, 0)  # [B, C, T]
        x = self.bn(x)  # [B, C, T]
        x = x.permute(2, 0, 1)  # [T, B, C]
        return x


class EncSALayer(nn.Module):
    def __init__(self, c, num_heads, dropout, attention_dropout=0.1,
                 relu_dropout=0.1, kernel_size=9, padding='SAME', norm='ln', act='gelu'):
        super().__init__()
        self.c = c
        self.dropout = dropout
        self.num_heads = num_heads
        if num_heads > 0:
            if norm == 'ln':
                self.layer_norm1 = LayerNorm(c)
            elif norm == 'bn':
                self.layer_norm1 = BatchNorm1dTBC(c)
            self.self_attn = MultiheadAttention(
                self.c, num_heads, self_attention=True, dropout=attention_dropout, bias=False,
            )
        if norm == 'ln':
            self.layer_norm2 = LayerNorm(c)
        elif norm == 'bn':
            self.layer_norm2 = BatchNorm1dTBC(c)
        self.ffn = TransformerFFNLayer(
            c, 4 * c, kernel_size=kernel_size, dropout=relu_dropout, padding=padding, act=act)

    def forward(self, x, encoder_padding_mask=None, **kwargs):
        layer_norm_training = kwargs.get('layer_norm_training', None)
        if layer_norm_training is not None:
            self.layer_norm1.training = layer_norm_training
            self.layer_norm2.training = layer_norm_training
        if self.num_heads > 0:
            residual = x
            x = self.layer_norm1(x)
            x, _ = self.self_attn(
                query=x,
                key=x,
                value=x,
                key_padding_mask=encoder_padding_mask
            )
            x = F.dropout(x, self.dropout, training=self.training)
            x = residual + x
            x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None]

        residual = x
        x = self.layer_norm2(x)
        x = self.ffn(x)
        x = F.dropout(x, self.dropout, training=self.training)
        x = residual + x
        x = x * (1 - encoder_padding_mask.float()).transpose(0, 1)[..., None]
        return x


class DecSALayer(nn.Module):
    def __init__(self, c, num_heads, dropout, attention_dropout=0.1, relu_dropout=0.1, kernel_size=9, act='gelu'):
        super().__init__()
        self.c = c
        self.dropout = dropout
        self.layer_norm1 = LayerNorm(c)
        self.self_attn = MultiheadAttention(
            c, num_heads, self_attention=True, dropout=attention_dropout, bias=False
        )
        self.layer_norm2 = LayerNorm(c)
        self.encoder_attn = MultiheadAttention(
            c, num_heads, encoder_decoder_attention=True, dropout=attention_dropout, bias=False,
        )
        self.layer_norm3 = LayerNorm(c)
        self.ffn = TransformerFFNLayer(
            c, 4 * c, padding='LEFT', kernel_size=kernel_size, dropout=relu_dropout, act=act)

    def forward(
            self,
            x,
            encoder_out=None,
            encoder_padding_mask=None,
            incremental_state=None,
            self_attn_mask=None,
            self_attn_padding_mask=None,
            attn_out=None,
            reset_attn_weight=None,
            **kwargs,
    ):
        layer_norm_training = kwargs.get('layer_norm_training', None)
        if layer_norm_training is not None:
            self.layer_norm1.training = layer_norm_training
            self.layer_norm2.training = layer_norm_training
            self.layer_norm3.training = layer_norm_training
        residual = x
        x = self.layer_norm1(x)
        x, _ = self.self_attn(
            query=x,
            key=x,
            value=x,
            key_padding_mask=self_attn_padding_mask,
            incremental_state=incremental_state,
            attn_mask=self_attn_mask
        )
        x = F.dropout(x, self.dropout, training=self.training)
        x = residual + x

        residual = x
        x = self.layer_norm2(x)
        if encoder_out is not None:
            x, attn = self.encoder_attn(
                query=x,
                key=encoder_out,
                value=encoder_out,
                key_padding_mask=encoder_padding_mask,
                incremental_state=incremental_state,
                static_kv=True,
                enc_dec_attn_constraint_mask=None,
                # utils.get_incremental_state(self, incremental_state, 'enc_dec_attn_constraint_mask'),
                reset_attn_weight=reset_attn_weight
            )
            attn_logits = attn[1]
        else:
            assert attn_out is not None
            x = self.encoder_attn.in_proj_v(attn_out.transpose(0, 1))
            attn_logits = None
        x = F.dropout(x, self.dropout, training=self.training)
        x = residual + x

        residual = x
        x = self.layer_norm3(x)
        x = self.ffn(x, incremental_state=incremental_state)
        x = F.dropout(x, self.dropout, training=self.training)
        x = residual + x
        # if len(attn_logits.size()) > 3:
        #     indices = attn_logits.softmax(-1).max(-1).values.sum(-1).argmax(-1)
        #     attn_logits = attn_logits.gather(1,
        #                                      indices[:, None, None, None].repeat(1, 1, attn_logits.size(-2), attn_logits.size(-1))).squeeze(1)
        return x, attn_logits
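As a quick orientation to the layers above, a hedged shape-convention sketch for EncSALayer; it assumes the file is importable as modules.commons.common_layers with the project's utils package on the path:

import torch
from modules.commons.common_layers import EncSALayer

layer = EncSALayer(c=256, num_heads=2, dropout=0.1)  # pre-norm self-attention + conv FFN block
x = torch.randn(100, 4, 256)                         # [T, B, C], time-first like fairseq
pad_mask = torch.zeros(4, 100, dtype=torch.bool)     # [B, T]; True marks padded frames
y = layer(x, encoder_padding_mask=pad_mask)          # -> [T, B, C] with padded frames zeroed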
modules/commons/ssim.py
ADDED
@@ -0,0 +1,84 @@
"""
Adapted from https://github.com/Po-Hsun-Su/pytorch-ssim
"""

from math import exp

import torch
import torch.nn.functional as F
from torch.autograd import Variable


def gaussian(window_size, sigma):
    gauss = torch.Tensor([exp(-(x - window_size // 2) ** 2 / float(2 * sigma ** 2)) for x in range(window_size)])
    return gauss / gauss.sum()


def create_window(window_size, channel):
    _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
    _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
    window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous())
    return window


def _ssim(img1, img2, window, window_size, channel, size_average=True):
    mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
    mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
    sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
    sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2

    C1 = 0.01 ** 2
    C2 = 0.03 ** 2

    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))

    if size_average:
        return ssim_map.mean()
    else:
        return ssim_map.mean(1)


class SSIM(torch.nn.Module):
    def __init__(self, window_size=11, size_average=True):
        super(SSIM, self).__init__()
        self.window_size = window_size
        self.size_average = size_average
        self.channel = 1
        self.window = create_window(window_size, self.channel)

    def forward(self, img1, img2):
        (_, channel, _, _) = img1.size()

        if channel == self.channel and self.window.data.type() == img1.data.type():
            window = self.window
        else:
            window = create_window(self.window_size, channel)

            if img1.is_cuda:
                window = window.cuda(img1.get_device())
            window = window.type_as(img1)

            self.window = window
            self.channel = channel

        return _ssim(img1, img2, window, self.window_size, channel, self.size_average)


window = None


def ssim(img1, img2, window_size=11, size_average=True):
    (_, channel, _, _) = img1.size()
    global window
    if window is None:
        window = create_window(window_size, channel)
        if img1.is_cuda:
            window = window.cuda(img1.get_device())
        window = window.type_as(img1)
    return _ssim(img1, img2, window, window_size, channel, size_average)
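A minimal usage sketch for the functional ssim() above; shapes follow the [batch, channel, height, width] convention of the conv2d calls:

import torch
from modules.commons.ssim import ssim

a = torch.rand(1, 1, 64, 64)
print(float(ssim(a, a)))                   # identical images -> 1.0
print(float(ssim(a, torch.rand_like(a))))  # uncorrelated noise scores far below 1.0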
modules/diff/__pycache__/diffusion.cpython-310.pyc
ADDED
Binary file (11 kB).
modules/diff/__pycache__/diffusion.cpython-38.pyc
ADDED
Binary file (11 kB).
modules/diff/__pycache__/net.cpython-310.pyc
ADDED
Binary file (4.57 kB).
modules/diff/__pycache__/net.cpython-38.pyc
ADDED
Binary file (4.61 kB).
modules/diff/diffusion.py
ADDED
@@ -0,0 +1,312 @@
from collections import deque
from functools import partial
from inspect import isfunction

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from tqdm import tqdm

from modules.encoder import SvcEncoder
from training.train_pipeline import Batch2Loss
from utils.hparams import hparams


def exists(x):
    return x is not None


def default(val, d):
    if exists(val):
        return val
    return d() if isfunction(d) else d


# gaussian diffusion trainer class

def extract(a, t, x_shape):
    b, *_ = t.shape
    out = a.gather(-1, t)
    return out.reshape(b, *((1,) * (len(x_shape) - 1)))


def noise_like(shape, device, repeat=False):
    repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
    noise = lambda: torch.randn(shape, device=device)
    return repeat_noise() if repeat else noise()


def linear_beta_schedule(timesteps, max_beta=hparams.get('max_beta', 0.01)):
    """
    linear schedule
    """
    betas = np.linspace(1e-4, max_beta, timesteps)
    return betas


def cosine_beta_schedule(timesteps, s=0.008):
    """
    cosine schedule
    as proposed in https://openreview.net/forum?id=-NEXDKk8gZ
    """
    steps = timesteps + 1
    x = np.linspace(0, steps, steps)
    alphas_cumprod = np.cos(((x / steps) + s) / (1 + s) * np.pi * 0.5) ** 2
    alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
    betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
    return np.clip(betas, a_min=0, a_max=0.999)


beta_schedule = {
    "cosine": cosine_beta_schedule,
    "linear": linear_beta_schedule,
}
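# Reference note (added; not part of the original file): cosine_beta_schedule above
# follows the improved-DDPM schedule from the OpenReview link in its docstring:
#   alpha_bar(t) = f(t) / f(0),  f(t) = cos(((t / T + s) / (1 + s)) * pi / 2) ** 2
#   beta_t = 1 - alpha_bar(t) / alpha_bar(t - 1), clipped to at most 0.999.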
class GaussianDiffusion(nn.Module):
    def __init__(self, phone_encoder, out_dims, denoise_fn,
                 timesteps=1000, K_step=1000, loss_type=hparams.get('diff_loss_type', 'l1'), betas=None, spec_min=None,
                 spec_max=None):
        super().__init__()
        self.denoise_fn = denoise_fn
        self.fs2 = SvcEncoder(phone_encoder, out_dims)
        self.mel_bins = out_dims

        if exists(betas):
            betas = betas.detach().cpu().numpy() if isinstance(betas, torch.Tensor) else betas
        else:
            if 'schedule_type' in hparams.keys():
                betas = beta_schedule[hparams['schedule_type']](timesteps)
            else:
                betas = cosine_beta_schedule(timesteps)

        alphas = 1. - betas
        alphas_cumprod = np.cumprod(alphas, axis=0)
        alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])

        timesteps, = betas.shape
        self.num_timesteps = int(timesteps)
        self.K_step = K_step
        self.loss_type = loss_type

        self.noise_list = deque(maxlen=4)

        to_torch = partial(torch.tensor, dtype=torch.float32)

        self.register_buffer('betas', to_torch(betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
        self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))

        # calculations for posterior q(x_{t-1} | x_t, x_0)
        posterior_variance = betas * (1. - alphas_cumprod_prev) / (1. - alphas_cumprod)
        # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
        self.register_buffer('posterior_variance', to_torch(posterior_variance))
        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
        self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
        self.register_buffer('posterior_mean_coef1', to_torch(
            betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
        self.register_buffer('posterior_mean_coef2', to_torch(
            (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))

        self.register_buffer('spec_min', torch.FloatTensor(spec_min)[None, None, :hparams['keep_bins']])
        self.register_buffer('spec_max', torch.FloatTensor(spec_max)[None, None, :hparams['keep_bins']])

    def q_mean_variance(self, x_start, t):
        mean = extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
        variance = extract(1. - self.alphas_cumprod, t, x_start.shape)
        log_variance = extract(self.log_one_minus_alphas_cumprod, t, x_start.shape)
        return mean, variance, log_variance

    def predict_start_from_noise(self, x_t, t, noise):
        return (
                extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
                extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
        )

    def q_posterior(self, x_start, x_t, t):
        posterior_mean = (
                extract(self.posterior_mean_coef1, t, x_t.shape) * x_start +
                extract(self.posterior_mean_coef2, t, x_t.shape) * x_t
        )
        posterior_variance = extract(self.posterior_variance, t, x_t.shape)
        posterior_log_variance_clipped = extract(self.posterior_log_variance_clipped, t, x_t.shape)
        return posterior_mean, posterior_variance, posterior_log_variance_clipped

    def p_mean_variance(self, x, t, cond, clip_denoised: bool):
        noise_pred = self.denoise_fn(x, t, cond=cond)
        x_recon = self.predict_start_from_noise(x, t=t, noise=noise_pred)

        if clip_denoised:
            x_recon.clamp_(-1., 1.)

        model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
        return model_mean, posterior_variance, posterior_log_variance

    @torch.no_grad()
    def p_sample(self, x, t, cond, clip_denoised=True, repeat_noise=False):
        b, *_, device = *x.shape, x.device
        model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, cond=cond, clip_denoised=clip_denoised)
        noise = noise_like(x.shape, device, repeat_noise)
        # no noise when t == 0
        nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
        return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise

    @torch.no_grad()
    def p_sample_plms(self, x, t, interval, cond, clip_denoised=True, repeat_noise=False):
        """
        Use the PLMS method from [Pseudo Numerical Methods for Diffusion Models on Manifolds](https://arxiv.org/abs/2202.09778).
        """

        def get_x_pred(x, noise_t, t):
            a_t = extract(self.alphas_cumprod, t, x.shape)
            a_prev = extract(self.alphas_cumprod, torch.max(t - interval, torch.zeros_like(t)), x.shape)
            a_t_sq, a_prev_sq = a_t.sqrt(), a_prev.sqrt()

            x_delta = (a_prev - a_t) * ((1 / (a_t_sq * (a_t_sq + a_prev_sq))) * x - 1 / (
                    a_t_sq * (((1 - a_prev) * a_t).sqrt() + ((1 - a_t) * a_prev).sqrt())) * noise_t)
            x_pred = x + x_delta

            return x_pred

        noise_list = self.noise_list
        noise_pred = self.denoise_fn(x, t, cond=cond)

        if len(noise_list) == 0:
            x_pred = get_x_pred(x, noise_pred, t)
            noise_pred_prev = self.denoise_fn(x_pred, max(t - interval, 0), cond=cond)
            noise_pred_prime = (noise_pred + noise_pred_prev) / 2
        elif len(noise_list) == 1:
            noise_pred_prime = (3 * noise_pred - noise_list[-1]) / 2
        elif len(noise_list) == 2:
            noise_pred_prime = (23 * noise_pred - 16 * noise_list[-1] + 5 * noise_list[-2]) / 12
        elif len(noise_list) >= 3:
            noise_pred_prime = (55 * noise_pred - 59 * noise_list[-1] + 37 * noise_list[-2] - 9 * noise_list[-3]) / 24

        x_prev = get_x_pred(x, noise_pred_prime, t)
        noise_list.append(noise_pred)

        return x_prev
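    # Clarifying note (added; not part of the original file): the weighted noise
    # averages in p_sample_plms are the 1st- to 4th-order linear multistep
    # (Adams-Bashforth) coefficients used by PNDM; with four cached predictions:
    #   eps' = (55 * eps_t - 59 * eps_{t-1} + 37 * eps_{t-2} - 9 * eps_{t-3}) / 24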
    def q_sample(self, x_start, t, noise=None):
        noise = default(noise, lambda: torch.randn_like(x_start))
        return (
                extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
                extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
        )

    def p_losses(self, x_start, t, cond, noise=None, nonpadding=None):
        noise = default(noise, lambda: torch.randn_like(x_start))

        x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
        x_recon = self.denoise_fn(x_noisy, t, cond)

        if self.loss_type == 'l1':
            if nonpadding is not None:
                loss = ((noise - x_recon).abs() * nonpadding.unsqueeze(1)).mean()
            else:
                # print('are you sure w/o nonpadding?')
                loss = (noise - x_recon).abs().mean()

        elif self.loss_type == 'l2':
            loss = F.mse_loss(noise, x_recon)
        else:
            raise NotImplementedError()

        return loss

    def forward(self, hubert, mel2ph=None, spk_embed=None,
                ref_mels=None, f0=None, uv=None, energy=None, infer=False, **kwargs):
        '''
        conditioning diffusion, use fastspeech2 encoder output as the condition
        '''
        ret = self.fs2(hubert, mel2ph, spk_embed, None, f0, uv, energy,
                       skip_decoder=True, infer=infer, **kwargs)
        cond = ret['decoder_inp'].transpose(1, 2)
        b, *_, device = *hubert.shape, hubert.device

        if not infer:
            Batch2Loss.module4(
                self.p_losses,
                self.norm_spec(ref_mels), cond, ret, self.K_step, b, device
            )
        else:
            if 'use_gt_mel' in kwargs.keys() and kwargs['use_gt_mel']:
                t = kwargs['add_noise_step']
                print('===>using ground truth mel as start, please make sure parameter "key==0" !')
                fs2_mels = ref_mels
                fs2_mels = self.norm_spec(fs2_mels)
                fs2_mels = fs2_mels.transpose(1, 2)[:, None, :, :]
                x = self.q_sample(x_start=fs2_mels, t=torch.tensor([t - 1], device=device).long())
            else:
                t = self.K_step
                shape = (cond.shape[0], 1, self.mel_bins, cond.shape[2])
                x = torch.randn(shape, device=device)
            if hparams.get('pndm_speedup') and hparams['pndm_speedup'] > 1:
                self.noise_list = deque(maxlen=4)
                iteration_interval = hparams['pndm_speedup']
                for i in tqdm(reversed(range(0, t, iteration_interval)), desc='sample time step',
                              total=t // iteration_interval):
                    x = self.p_sample_plms(x, torch.full((b,), i, device=device, dtype=torch.long), iteration_interval,
                                           cond)
            else:
                for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t):
                    x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond)
            x = x[:, 0].transpose(1, 2)
            if mel2ph is not None:  # for singing
                ret['mel_out'] = self.denorm_spec(x) * ((mel2ph > 0).float()[:, :, None])
            else:
                ret['mel_out'] = self.denorm_spec(x)
        return ret

    def norm_spec(self, x):
        return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1

    def denorm_spec(self, x):
        return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min

    def out2mel(self, x):
        return x


class OfflineGaussianDiffusion(GaussianDiffusion):
    def forward(self, txt_tokens, mel2ph=None, spk_embed=None,
                ref_mels=None, f0=None, uv=None, energy=None, infer=False, **kwargs):
        b, *_, device = *txt_tokens.shape, txt_tokens.device

        ret = self.fs2(txt_tokens, mel2ph, spk_embed, ref_mels, f0, uv, energy,
|
285 |
+
skip_decoder=True, infer=True, **kwargs)
|
286 |
+
cond = ret['decoder_inp'].transpose(1, 2)
|
287 |
+
fs2_mels = ref_mels[1]
|
288 |
+
ref_mels = ref_mels[0]
|
289 |
+
|
290 |
+
if not infer:
|
291 |
+
t = torch.randint(0, self.K_step, (b,), device=device).long()
|
292 |
+
x = ref_mels
|
293 |
+
x = self.norm_spec(x)
|
294 |
+
x = x.transpose(1, 2)[:, None, :, :] # [B, 1, M, T]
|
295 |
+
ret['diff_loss'] = self.p_losses(x, t, cond)
|
296 |
+
else:
|
297 |
+
t = self.K_step
|
298 |
+
fs2_mels = self.norm_spec(fs2_mels)
|
299 |
+
fs2_mels = fs2_mels.transpose(1, 2)[:, None, :, :]
|
300 |
+
|
301 |
+
x = self.q_sample(x_start=fs2_mels, t=torch.tensor([t - 1], device=device).long())
|
302 |
+
|
303 |
+
if hparams.get('gaussian_start') is not None and hparams['gaussian_start']:
|
304 |
+
print('===> gaussion start.')
|
305 |
+
shape = (cond.shape[0], 1, self.mel_bins, cond.shape[2])
|
306 |
+
x = torch.randn(shape, device=device)
|
307 |
+
for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t):
|
308 |
+
x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond)
|
309 |
+
x = x[:, 0].transpose(1, 2)
|
310 |
+
ret['mel_out'] = self.denorm_spec(x)
|
311 |
+
|
312 |
+
return ret
|
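For reference, the closed-form forward process that q_sample implements can be checked in isolation. The snippet below is an illustrative sketch, not part of the repository; the linear beta schedule and tensor sizes are arbitrary stand-ins for the model's real hparams.

import torch

# Sketch: x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps,
# which is exactly what q_sample computes via its precomputed buffers.
betas = torch.linspace(1e-4, 0.06, 1000)           # stand-in schedule
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

x0 = torch.randn(1, 1, 80, 100)                    # a normalized mel, [B, 1, M, T]
t = 500
eps = torch.randn_like(x0)
x_t = alphas_cumprod[t].sqrt() * x0 + (1 - alphas_cumprod[t]).sqrt() * eps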
modules/diff/net.py
ADDED
@@ -0,0 +1,135 @@
import math
from math import sqrt

import torch
import torch.nn as nn
import torch.nn.functional as F

from modules.commons.common_layers import Mish
from utils.hparams import hparams

Linear = nn.Linear
ConvTranspose2d = nn.ConvTranspose2d


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self

    def override(self, attrs):
        if isinstance(attrs, dict):
            self.__dict__.update(**attrs)
        elif isinstance(attrs, (list, tuple, set)):
            for attr in attrs:
                self.override(attr)
        elif attrs is not None:
            raise NotImplementedError
        return self


class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = x[:, None] * emb[None, :]
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb


def Conv1d(*args, **kwargs):
    layer = nn.Conv1d(*args, **kwargs)
    nn.init.kaiming_normal_(layer.weight)
    return layer


@torch.jit.script
def silu(x):
    return x * torch.sigmoid(x)


class ResidualBlock(nn.Module):
    def __init__(self, encoder_hidden, residual_channels, dilation):
        super().__init__()
        self.dilated_conv = Conv1d(residual_channels, 2 * residual_channels, 3, padding=dilation, dilation=dilation)
        self.diffusion_projection = Linear(residual_channels, residual_channels)
        self.conditioner_projection = Conv1d(encoder_hidden, 2 * residual_channels, 1)
        self.output_projection = Conv1d(residual_channels, 2 * residual_channels, 1)

    def forward(self, x, conditioner, diffusion_step):
        diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
        conditioner = self.conditioner_projection(conditioner)
        y = x + diffusion_step

        y = self.dilated_conv(y) + conditioner

        gate, filter = torch.chunk(y, 2, dim=1)
        # Using torch.split instead of torch.chunk to avoid using onnx::Slice
        # gate, filter = torch.split(y, torch.div(y.shape[1], 2), dim=1)

        y = torch.sigmoid(gate) * torch.tanh(filter)

        y = self.output_projection(y)
        residual, skip = torch.chunk(y, 2, dim=1)
        # Using torch.split instead of torch.chunk to avoid using onnx::Slice
        # residual, skip = torch.split(y, torch.div(y.shape[1], 2), dim=1)

        return (x + residual) / sqrt(2.0), skip


class DiffNet(nn.Module):
    def __init__(self, in_dims=80):
        super().__init__()
        self.params = params = AttrDict(
            # Model params
            encoder_hidden=hparams['hidden_size'],
            residual_layers=hparams['residual_layers'],
            residual_channels=hparams['residual_channels'],
            dilation_cycle_length=hparams['dilation_cycle_length'],
        )
        self.input_projection = Conv1d(in_dims, params.residual_channels, 1)
        self.diffusion_embedding = SinusoidalPosEmb(params.residual_channels)
        dim = params.residual_channels
        self.mlp = nn.Sequential(
            nn.Linear(dim, dim * 4),
            Mish(),
            nn.Linear(dim * 4, dim)
        )
        self.residual_layers = nn.ModuleList([
            ResidualBlock(params.encoder_hidden, params.residual_channels, 2 ** (i % params.dilation_cycle_length))
            for i in range(params.residual_layers)
        ])
        self.skip_projection = Conv1d(params.residual_channels, params.residual_channels, 1)
        self.output_projection = Conv1d(params.residual_channels, in_dims, 1)
        nn.init.zeros_(self.output_projection.weight)

    def forward(self, spec, diffusion_step, cond):
        """
        :param spec: [B, 1, M, T]
        :param diffusion_step: [B, 1]
        :param cond: [B, M, T]
        :return:
        """
        x = spec[:, 0]
        x = self.input_projection(x)  # x [B, residual_channel, T]

        x = F.relu(x)
        diffusion_step = self.diffusion_embedding(diffusion_step)
        diffusion_step = self.mlp(diffusion_step)
        skip = []
        for layer_id, layer in enumerate(self.residual_layers):
            x, skip_connection = layer(x, cond, diffusion_step)
            skip.append(skip_connection)

        x = torch.sum(torch.stack(skip), dim=0) / sqrt(len(self.residual_layers))
        x = self.skip_projection(x)
        x = F.relu(x)
        x = self.output_projection(x)  # [B, 80, T]
        return x[:, None, :, :]
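A minimal shape check for the ResidualBlock above (illustrative only; the channel sizes are arbitrary and it assumes the repository modules are importable):

import torch
from modules.diff.net import ResidualBlock

block = ResidualBlock(encoder_hidden=256, residual_channels=384, dilation=1)
x = torch.randn(2, 384, 100)        # [B, residual_channels, T]
cond = torch.randn(2, 256, 100)     # [B, encoder_hidden, T]
step_emb = torch.randn(2, 384)      # diffusion-step embedding after the MLP
residual, skip = block(x, cond, step_emb)
print(residual.shape, skip.shape)   # both torch.Size([2, 384, 100])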
modules/encoder.py
ADDED
@@ -0,0 +1,208 @@
import torch

from modules.commons.common_layers import *
from modules.commons.common_layers import Embedding
from modules.commons.common_layers import SinusoidalPositionalEmbedding
from utils.hparams import hparams
from utils.pitch_utils import f0_to_coarse, denorm_f0


class LayerNorm(torch.nn.LayerNorm):
    """Layer normalization module.
    :param int nout: output dim size
    :param int dim: dimension to be normalized
    """

    def __init__(self, nout, dim=-1):
        """Construct a LayerNorm object."""
        super(LayerNorm, self).__init__(nout, eps=1e-12)
        self.dim = dim

    def forward(self, x):
        """Apply layer normalization.
        :param torch.Tensor x: input tensor
        :return: layer normalized tensor
        :rtype torch.Tensor
        """
        if self.dim == -1:
            return super(LayerNorm, self).forward(x)
        return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1)


class PitchPredictor(torch.nn.Module):
    def __init__(self, idim, n_layers=5, n_chans=384, odim=2, kernel_size=5,
                 dropout_rate=0.1, padding='SAME'):
        """Initialize the pitch predictor module.
        Args:
            idim (int): Input dimension.
            n_layers (int, optional): Number of convolutional layers.
            n_chans (int, optional): Number of channels of convolutional layers.
            kernel_size (int, optional): Kernel size of convolutional layers.
            dropout_rate (float, optional): Dropout rate.
        """
        super(PitchPredictor, self).__init__()
        self.conv = torch.nn.ModuleList()
        self.kernel_size = kernel_size
        self.padding = padding
        for idx in range(n_layers):
            in_chans = idim if idx == 0 else n_chans
            self.conv += [torch.nn.Sequential(
                torch.nn.ConstantPad1d(((kernel_size - 1) // 2, (kernel_size - 1) // 2)
                                       if padding == 'SAME'
                                       else (kernel_size - 1, 0), 0),
                torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=0),
                torch.nn.ReLU(),
                LayerNorm(n_chans, dim=1),
                torch.nn.Dropout(dropout_rate)
            )]
        self.linear = torch.nn.Linear(n_chans, odim)
        self.embed_positions = SinusoidalPositionalEmbedding(idim, 0, init_size=4096)
        self.pos_embed_alpha = nn.Parameter(torch.Tensor([1]))

    def forward(self, xs):
        """
        :param xs: [B, T, H]
        :return: [B, T, H]
        """
        positions = self.pos_embed_alpha * self.embed_positions(xs[..., 0])
        xs = xs + positions
        xs = xs.transpose(1, -1)  # (B, idim, Tmax)
        for f in self.conv:
            xs = f(xs)  # (B, C, Tmax)
        # NOTE: calculate in log domain
        xs = self.linear(xs.transpose(1, -1))  # (B, Tmax, H)
        return xs


class SvcEncoder(nn.Module):
    def __init__(self, dictionary, out_dims=None):
        super().__init__()
        # self.dictionary = dictionary
        self.padding_idx = 0
        self.hidden_size = hparams['hidden_size']
        self.out_dims = out_dims
        if out_dims is None:
            self.out_dims = hparams['audio_num_mel_bins']
        self.mel_out = Linear(self.hidden_size, self.out_dims, bias=True)
        predictor_hidden = hparams['predictor_hidden'] if hparams['predictor_hidden'] > 0 else self.hidden_size
        if hparams['use_pitch_embed']:
            self.pitch_embed = Embedding(300, self.hidden_size, self.padding_idx)
            self.pitch_predictor = PitchPredictor(
                self.hidden_size,
                n_chans=predictor_hidden,
                n_layers=hparams['predictor_layers'],
                dropout_rate=hparams['predictor_dropout'],
                odim=2 if hparams['pitch_type'] == 'frame' else 1,
                padding=hparams['ffn_padding'], kernel_size=hparams['predictor_kernel'])
        if hparams['use_energy_embed']:
            self.energy_embed = Embedding(256, self.hidden_size, self.padding_idx)
        if hparams['use_spk_id']:
            self.spk_embed_proj = Embedding(hparams['num_spk'], self.hidden_size)
            if hparams['use_split_spk_id']:
                self.spk_embed_f0 = Embedding(hparams['num_spk'], self.hidden_size)
                self.spk_embed_dur = Embedding(hparams['num_spk'], self.hidden_size)
        elif hparams['use_spk_embed']:
            self.spk_embed_proj = Linear(256, self.hidden_size, bias=True)

    def forward(self, hubert, mel2ph=None, spk_embed=None,
                ref_mels=None, f0=None, uv=None, energy=None, skip_decoder=True,
                spk_embed_dur_id=None, spk_embed_f0_id=None, infer=False, **kwargs):
        ret = {}
        encoder_out = hubert
        src_nonpadding = (hubert != 0).any(-1)[:, :, None]

        # add ref style embed
        # Not implemented
        # variance encoder
        var_embed = 0

        # encoder_out_dur denotes encoder outputs for the duration predictor;
        # in speech adaptation, the duration predictor uses the old speaker embedding
        if hparams['use_spk_embed']:
            spk_embed_dur = spk_embed_f0 = spk_embed = self.spk_embed_proj(spk_embed)[:, None, :]
        elif hparams['use_spk_id']:
            spk_embed_id = spk_embed
            if spk_embed_dur_id is None:
                spk_embed_dur_id = spk_embed_id
            if spk_embed_f0_id is None:
                spk_embed_f0_id = spk_embed_id
            spk_embed_0 = self.spk_embed_proj(spk_embed_id.to(hubert.device))[:, None, :]
            spk_embed_1 = self.spk_embed_proj(torch.LongTensor([0]).to(hubert.device))[:, None, :]
            spk_embed_2 = self.spk_embed_proj(torch.LongTensor([0]).to(hubert.device))[:, None, :]
            spk_embed = 1 * spk_embed_0 + 0 * spk_embed_1 + 0 * spk_embed_2
            spk_embed_dur = spk_embed_f0 = spk_embed
            if hparams['use_split_spk_id']:
                spk_embed_dur = self.spk_embed_dur(spk_embed_dur_id)[:, None, :]
                spk_embed_f0 = self.spk_embed_f0(spk_embed_f0_id)[:, None, :]
        else:
            spk_embed_dur = spk_embed_f0 = spk_embed = 0

        ret['mel2ph'] = mel2ph

        decoder_inp = F.pad(encoder_out, [0, 0, 1, 0])

        mel2ph_ = mel2ph[..., None].repeat([1, 1, encoder_out.shape[-1]])
        decoder_inp_origin = decoder_inp = torch.gather(decoder_inp, 1, mel2ph_)  # [B, T, H]

        tgt_nonpadding = (mel2ph > 0).float()[:, :, None]

        # add pitch and energy embed
        pitch_inp = (decoder_inp_origin + var_embed + spk_embed_f0) * tgt_nonpadding
        if hparams['use_pitch_embed']:
            pitch_inp_ph = (encoder_out + var_embed + spk_embed_f0) * src_nonpadding
            decoder_inp = decoder_inp + self.add_pitch(pitch_inp, f0, uv, mel2ph, ret, encoder_out=pitch_inp_ph)
        if hparams['use_energy_embed']:
            decoder_inp = decoder_inp + self.add_energy(pitch_inp, energy, ret)

        ret['decoder_inp'] = decoder_inp = (decoder_inp + spk_embed) * tgt_nonpadding
        return ret

    def add_dur(self, dur_input, mel2ph, hubert, ret):
        src_padding = (hubert == 0).all(-1)
        dur_input = dur_input.detach() + hparams['predictor_grad'] * (dur_input - dur_input.detach())
        if mel2ph is None:
            dur, xs = self.dur_predictor.inference(dur_input, src_padding)
            ret['dur'] = xs
            ret['dur_choice'] = dur
            mel2ph = self.length_regulator(dur, src_padding).detach()
        else:
            ret['dur'] = self.dur_predictor(dur_input, src_padding)
        ret['mel2ph'] = mel2ph
        return mel2ph

    def run_decoder(self, decoder_inp, tgt_nonpadding, ret, infer, **kwargs):
        x = decoder_inp  # [B, T, H]
        x = self.mel_out(x)
        return x * tgt_nonpadding

    def out2mel(self, out):
        return out

    def add_pitch(self, decoder_inp, f0, uv, mel2ph, ret, encoder_out=None):
        decoder_inp = decoder_inp.detach() + hparams['predictor_grad'] * (decoder_inp - decoder_inp.detach())

        pitch_padding = (mel2ph == 0)
        ret['f0_denorm'] = f0_denorm = denorm_f0(f0, uv, hparams, pitch_padding=pitch_padding)
        if pitch_padding is not None:
            f0[pitch_padding] = 0

        pitch = f0_to_coarse(f0_denorm, hparams)  # start from 0
        ret['pitch_pred'] = pitch.unsqueeze(-1)
        pitch_embedding = self.pitch_embed(pitch)
        return pitch_embedding

    def add_energy(self, decoder_inp, energy, ret):
        decoder_inp = decoder_inp.detach() + hparams['predictor_grad'] * (decoder_inp - decoder_inp.detach())
        ret['energy_pred'] = energy  # energy_pred = self.energy_predictor(decoder_inp)[:, :, 0]
        energy = torch.clamp(energy * 256 // 4, max=255).long()  # energy_to_coarse
        energy_embedding = self.energy_embed(energy)
        return energy_embedding

    @staticmethod
    def mel_norm(x):
        return (x + 5.5) / (6.3 / 2) - 1

    @staticmethod
    def mel_denorm(x):
        return (x + 1) * (6.3 / 2) - 5.5
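Note the hard-coded mix spk_embed = 1 * spk_embed_0 + 0 * spk_embed_1 + 0 * spk_embed_2 in SvcEncoder.forward: the final speaker embedding is a weighted combination of lookups from the same table. A standalone sketch of the idea (the table size and weights below are hypothetical, chosen only for illustration):

import torch

emb = torch.nn.Embedding(4, 256)              # stands in for self.spk_embed_proj
e_a = emb(torch.LongTensor([1]))[:, None, :]  # [1, 1, 256]
e_b = emb(torch.LongTensor([2]))[:, None, :]
mixed = 0.7 * e_a + 0.3 * e_b                 # blend two speaker timbres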
modules/hubert/__pycache__/cn_hubert.cpython-38.pyc
ADDED
Binary file (1.32 kB)
modules/hubert/__pycache__/hubert_model.cpython-38.pyc
ADDED
Binary file (8.38 kB)
modules/hubert/__pycache__/hubert_onnx.cpython-38.pyc
ADDED
Binary file (735 Bytes)
modules/hubert/cn_hubert.py
ADDED
@@ -0,0 +1,40 @@
import librosa
import torch
import torch.nn as nn


def load_cn_model(ch_hubert_path):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    from fairseq import checkpoint_utils
    models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
        [ch_hubert_path],
        suffix="",
    )
    model = models[0]
    model = model.to(device)
    model.eval()
    return model


def get_cn_hubert_units(con_model, audio_path, dev):
    audio, sampling_rate = librosa.load(audio_path)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

    feats = torch.from_numpy(audio).float()
    if feats.dim() == 2:  # double channels
        feats = feats.mean(-1)
    assert feats.dim() == 1, feats.dim()
    feats = feats.view(1, -1)
    padding_mask = torch.BoolTensor(feats.shape).fill_(False)
    inputs = {
        "source": feats.to(dev),
        "padding_mask": padding_mask.to(dev),
        "output_layer": 9,  # layer 9
    }
    with torch.no_grad():
        logits = con_model.extract_features(**inputs)
        feats = con_model.final_proj(logits[0])
    return feats
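Hypothetical usage of the two helpers above (requires the fairseq package; the checkpoint and wav paths are placeholders, not files shipped in this Space):

import torch
from modules.hubert.cn_hubert import load_cn_model, get_cn_hubert_units

dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load_cn_model("path/to/chinese-hubert-base.pt")       # placeholder path
units = get_cn_hubert_units(model, "path/to/input.wav", dev)  # projected layer-9 features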
modules/hubert/hubert_model.py
ADDED
@@ -0,0 +1,243 @@
import copy
import random
from typing import Optional, Tuple

import librosa
import torch
import torch.nn as nn
import torch.nn.functional as t_func
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present


class Hubert(nn.Module):
    def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
        super().__init__()
        self._mask = mask
        self.feature_extractor = FeatureExtractor()
        self.feature_projection = FeatureProjection()
        self.positional_embedding = PositionalConvEmbedding()
        self.norm = nn.LayerNorm(768)
        self.dropout = nn.Dropout(0.1)
        self.encoder = TransformerEncoder(
            nn.TransformerEncoderLayer(
                768, 12, 3072, activation="gelu", batch_first=True
            ),
            12,
        )
        self.proj = nn.Linear(768, 256)

        self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
        self.label_embedding = nn.Embedding(num_label_embeddings, 256)

    def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        mask = None
        if self.training and self._mask:
            mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
            x[mask] = self.masked_spec_embed.to(x.dtype)
        return x, mask

    def encode(
            self, x: torch.Tensor, layer: Optional[int] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        x = self.feature_extractor(x)
        x = self.feature_projection(x.transpose(1, 2))
        x, mask = self.mask(x)
        x = x + self.positional_embedding(x)
        x = self.dropout(self.norm(x))
        x = self.encoder(x, output_layer=layer)
        return x, mask

    def logits(self, x: torch.Tensor) -> torch.Tensor:
        logits = torch.cosine_similarity(
            x.unsqueeze(2),
            self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
            dim=-1,
        )
        return logits / 0.1

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        x, mask = self.encode(x)
        x = self.proj(x)
        logits = self.logits(x)
        return logits, mask


class HubertSoft(Hubert):
    def __init__(self):
        super().__init__()

    # @torch.inference_mode()
    def units(self, wav: torch.Tensor) -> torch.Tensor:
        wav = torch.nn.functional.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
        x, _ = self.encode(wav)
        return self.proj(x)

    def forward(self, wav: torch.Tensor):
        return self.units(wav)


class FeatureExtractor(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
        self.norm0 = nn.GroupNorm(512, 512)
        self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
        self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = t_func.gelu(self.norm0(self.conv0(x)))
        x = t_func.gelu(self.conv1(x))
        x = t_func.gelu(self.conv2(x))
        x = t_func.gelu(self.conv3(x))
        x = t_func.gelu(self.conv4(x))
        x = t_func.gelu(self.conv5(x))
        x = t_func.gelu(self.conv6(x))
        return x


class FeatureProjection(nn.Module):
    def __init__(self):
        super().__init__()
        self.norm = nn.LayerNorm(512)
        self.projection = nn.Linear(512, 768)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.norm(x)
        x = self.projection(x)
        x = self.dropout(x)
        return x


class PositionalConvEmbedding(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv1d(
            768,
            768,
            kernel_size=128,
            padding=128 // 2,
            groups=16,
        )
        self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.conv(x.transpose(1, 2))
        x = t_func.gelu(x[:, :, :-1])
        return x.transpose(1, 2)


class TransformerEncoder(nn.Module):
    def __init__(
            self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
    ) -> None:
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList(
            [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
        )
        self.num_layers = num_layers

    def forward(
            self,
            src: torch.Tensor,
            mask: torch.Tensor = None,
            src_key_padding_mask: torch.Tensor = None,
            output_layer: Optional[int] = None,
    ) -> torch.Tensor:
        output = src
        for layer in self.layers[:output_layer]:
            output = layer(
                output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
            )
        return output


def _compute_mask(
        shape: Tuple[int, int],
        mask_prob: float,
        mask_length: int,
        device: torch.device,
        min_masks: int = 0,
) -> torch.Tensor:
    batch_size, sequence_length = shape

    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")

    if mask_length > sequence_length:
        raise ValueError(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}"
        )

    # compute number of masked spans in batch
    num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
    num_masked_spans = max(num_masked_spans, min_masks)

    # make sure num masked indices <= sequence_length
    if num_masked_spans * mask_length > sequence_length:
        num_masked_spans = sequence_length // mask_length

    # SpecAugment mask to fill
    mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)

    # uniform distribution to sample from, make sure that offset samples are < sequence_length
    uniform_dist = torch.ones(
        (batch_size, sequence_length - (mask_length - 1)), device=device
    )

    # get random indices to mask
    mask_indices = torch.multinomial(uniform_dist, num_masked_spans)

    # expand masked indices to masked spans
    mask_indices = (
        mask_indices.unsqueeze(dim=-1)
        .expand((batch_size, num_masked_spans, mask_length))
        .reshape(batch_size, num_masked_spans * mask_length)
    )
    offsets = (
        torch.arange(mask_length, device=device)[None, None, :]
        .expand((batch_size, num_masked_spans, mask_length))
        .reshape(batch_size, num_masked_spans * mask_length)
    )
    mask_idxs = mask_indices + offsets

    # scatter indices to mask
    mask = mask.scatter(1, mask_idxs, True)

    return mask


def hubert_soft(
        path: str
) -> HubertSoft:
    r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
    Args:
        path (str): path of a pretrained model
    """
    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    hubert = HubertSoft()
    checkpoint = torch.load(path)
    consume_prefix_in_state_dict_if_present(checkpoint, "module.")
    hubert.load_state_dict(checkpoint)
    hubert.eval().to(dev)
    return hubert


def get_units(hbt_soft, raw_wav_path, dev=torch.device('cuda')):
    wav, sr = librosa.load(raw_wav_path, sr=None)
    assert (sr >= 16000)
    if len(wav.shape) > 1:
        wav = librosa.to_mono(wav)
    if sr != 16000:
        # keyword arguments: the positional form was removed in librosa 0.10
        wav16 = librosa.resample(wav, orig_sr=sr, target_sr=16000)
    else:
        wav16 = wav
    dev = torch.device("cuda" if (dev == torch.device('cuda') and torch.cuda.is_available()) else "cpu")
    torch.cuda.is_available() and torch.cuda.empty_cache()
    with torch.inference_mode():
        units = hbt_soft.units(torch.FloatTensor(wav16.astype(float)).unsqueeze(0).unsqueeze(0).to(dev))
    return units
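Typical usage of hubert_soft and get_units, assuming the hubert_soft.pt checkpoint shipped under checkpoints/hubert/ (the wav path is a placeholder):

import torch
from modules.hubert.hubert_model import hubert_soft, get_units

hbt = hubert_soft("checkpoints/hubert/hubert_soft.pt")
units = get_units(hbt, "path/to/input.wav")   # soft units, shape [1, T, 256]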
modules/hubert/hubert_onnx.py
ADDED
@@ -0,0 +1,19 @@
import time

import torch
import torchaudio


def get_onnx_units(hbt_soft, raw_wav_path):
    source, sr = torchaudio.load(raw_wav_path)
    source = torchaudio.functional.resample(source, sr, 16000)
    if len(source.shape) == 2 and source.shape[1] >= 2:
        source = torch.mean(source, dim=0).unsqueeze(0)
    source = source.unsqueeze(0)
    # Run inference with ONNX Runtime
    start = time.time()
    units = hbt_soft.run(output_names=["units"],
                         input_feed={"wav": source.numpy()})[0]
    use_time = time.time() - start
    print("hubert_onnx_session.run time: {}".format(use_time))
    return units
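get_onnx_units expects an ONNX Runtime session rather than a torch module. A sketch of driving it (assumes the onnxruntime package is installed; the model path matches checkpoints/hubert/hubert.onnx in this commit):

import onnxruntime
from modules.hubert.hubert_onnx import get_onnx_units

session = onnxruntime.InferenceSession("checkpoints/hubert/hubert.onnx")
units = get_onnx_units(session, "path/to/input.wav")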
modules/nsf_hifigan/__pycache__/env.cpython-310.pyc
ADDED
Binary file (813 Bytes)
modules/nsf_hifigan/__pycache__/env.cpython-38.pyc
ADDED
Binary file (799 Bytes)
modules/nsf_hifigan/__pycache__/models.cpython-310.pyc
ADDED
Binary file (16.1 kB)
modules/nsf_hifigan/__pycache__/models.cpython-38.pyc
ADDED
Binary file (16.3 kB)
modules/nsf_hifigan/__pycache__/nvSTFT.cpython-310.pyc
ADDED
Binary file (3.78 kB)
modules/nsf_hifigan/__pycache__/nvSTFT.cpython-38.pyc
ADDED
Binary file (3.84 kB)
modules/nsf_hifigan/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (2.35 kB)