kohei0209 committed on
Commit
c582091
·
verified ·
1 Parent(s): 032e1a4

tfgridnet model fine-tuned on urgent25 track1 data

Browse files
Files changed (41) hide show
  1. train_enh_tfgridnet_dm_raw/config.yaml +298 -0
  2. train_enh_tfgridnet_dm_raw/images/backward_time.png +0 -0
  3. train_enh_tfgridnet_dm_raw/images/clip.png +0 -0
  4. train_enh_tfgridnet_dm_raw/images/forward_time.png +0 -0
  5. train_enh_tfgridnet_dm_raw/images/gpu_max_cached_mem_GB.png +0 -0
  6. train_enh_tfgridnet_dm_raw/images/grad_norm.png +0 -0
  7. train_enh_tfgridnet_dm_raw/images/iter_time.png +0 -0
  8. train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_16000Hz.png +0 -0
  9. train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_16000Hz_reverb.png +0 -0
  10. train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_22050Hz.png +0 -0
  11. train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_22050Hz_reverb.png +0 -0
  12. train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_24000Hz.png +0 -0
  13. train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_24000Hz_reverb.png +0 -0
  14. train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_32000Hz.png +0 -0
  15. train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_32000Hz_reverb.png +0 -0
  16. train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_44100Hz.png +0 -0
  17. train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_44100Hz_reverb.png +0 -0
  18. train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_48000Hz.png +0 -0
  19. train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_48000Hz_reverb.png +0 -0
  20. train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_8000Hz.png +0 -0
  21. train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_8000Hz_reverb.png +0 -0
  22. train_enh_tfgridnet_dm_raw/images/loss.png +0 -0
  23. train_enh_tfgridnet_dm_raw/images/loss_scale.png +0 -0
  24. train_enh_tfgridnet_dm_raw/images/optim0_lr0.png +0 -0
  25. train_enh_tfgridnet_dm_raw/images/optim_step_time.png +0 -0
  26. train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_16000Hz.png +0 -0
  27. train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_16000Hz_reverb.png +0 -0
  28. train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_22050Hz.png +0 -0
  29. train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_22050Hz_reverb.png +0 -0
  30. train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_24000Hz.png +0 -0
  31. train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_24000Hz_reverb.png +0 -0
  32. train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_32000Hz.png +0 -0
  33. train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_32000Hz_reverb.png +0 -0
  34. train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_44100Hz.png +0 -0
  35. train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_44100Hz_reverb.png +0 -0
  36. train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_48000Hz.png +0 -0
  37. train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_48000Hz_reverb.png +0 -0
  38. train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_8000Hz.png +0 -0
  39. train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_8000Hz_reverb.png +0 -0
  40. train_enh_tfgridnet_dm_raw/images/train_time.png +0 -0
  41. train_enh_tfgridnet_dm_raw/valid.loss.ave_5best.pth +3 -0
train_enh_tfgridnet_dm_raw/config.yaml ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_enh_tfgridnet_dm.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: chunk
7
+ valid_iterator_type: null
8
+ output_dir: exp/enh_train_enh_tfgridnet_dm_raw
9
+ ngpu: 1
10
+ seed: 0
11
+ num_workers: 4
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: false
23
+ sharded_ddp: false
24
+ use_deepspeed: false
25
+ deepspeed_config: null
26
+ cudnn_enabled: true
27
+ cudnn_benchmark: false
28
+ cudnn_deterministic: true
29
+ use_tf32: false
30
+ collect_stats: false
31
+ write_collected_feats: false
32
+ max_epoch: 30
33
+ patience: 5
34
+ val_scheduler_criterion:
35
+ - valid
36
+ - loss
37
+ early_stopping_criterion:
38
+ - valid
39
+ - loss
40
+ - min
41
+ best_model_criterion:
42
+ - - valid
43
+ - loss
44
+ - min
45
+ keep_nbest_models: 5
46
+ nbest_averaging_interval: 0
47
+ grad_clip: 1.0
48
+ grad_clip_type: 2.0
49
+ grad_noise: false
50
+ accum_grad: 1
51
+ no_forward_run: false
52
+ resume: true
53
+ train_dtype: float32
54
+ use_amp: false
55
+ log_interval: null
56
+ use_matplotlib: true
57
+ use_tensorboard: true
58
+ create_graph_in_tensorboard: false
59
+ use_wandb: false
60
+ wandb_project: null
61
+ wandb_id: null
62
+ wandb_entity: null
63
+ wandb_name: null
64
+ wandb_model_log_interval: -1
65
+ detect_anomaly: false
66
+ use_adapter: false
67
+ adapter: lora
68
+ save_strategy: all
69
+ adapter_conf: {}
70
+ pretrain_path: null
71
+ init_param:
72
+ - exp/enh_train_enh_tfgridnet_raw_1stchallenge/21epoch.pth
73
+ ignore_init_mismatch: false
74
+ freeze_param: []
75
+ num_iters_per_epoch: 4000
76
+ batch_size: 2
77
+ valid_batch_size: 4
78
+ batch_bins: 1000000
79
+ valid_batch_bins: null
80
+ category_sample_size: 10
81
+ train_shape_file:
82
+ - exp/enh_stats_16k/train/speech_mix_shape
83
+ - exp/enh_stats_16k/train/speech_ref1_shape
84
+ valid_shape_file:
85
+ - exp/enh_stats_16k/valid/speech_mix_shape
86
+ - exp/enh_stats_16k/valid/speech_ref1_shape
87
+ batch_type: folded
88
+ valid_batch_type: null
89
+ fold_length:
90
+ - 80000
91
+ - 80000
92
+ sort_in_batch: descending
93
+ shuffle_within_batch: false
94
+ sort_batch: descending
95
+ multiple_iterator: false
96
+ chunk_length: 200
97
+ chunk_shift_ratio: 0.5
98
+ num_cache_chunks: 128
99
+ chunk_excluded_key_prefixes: []
100
+ chunk_default_fs: 50
101
+ chunk_max_abs_length: 144000
102
+ chunk_discard_short_samples: true
103
+ train_data_path_and_name_and_type:
104
+ - - dump/raw/speech_train_track1/wav.scp
105
+ - speech_mix
106
+ - sound
107
+ - - dump/raw/speech_train_track1/spk1.scp
108
+ - speech_ref1
109
+ - sound
110
+ - - dump/raw/speech_train_track1/utt2category
111
+ - category
112
+ - text
113
+ - - dump/raw/speech_train_track1/utt2fs
114
+ - fs
115
+ - text_int
116
+ valid_data_path_and_name_and_type:
117
+ - - dump/raw/validation/wav.scp
118
+ - speech_mix
119
+ - sound
120
+ - - dump/raw/validation/spk1.scp
121
+ - speech_ref1
122
+ - sound
123
+ - - dump/raw/validation/utt2category
124
+ - category
125
+ - text
126
+ - - dump/raw/validation/utt2fs
127
+ - fs
128
+ - text_int
129
+ multi_task_dataset: false
130
+ allow_variable_data_keys: false
131
+ max_cache_size: 0.0
132
+ max_cache_fd: 32
133
+ allow_multi_rates: true
134
+ valid_max_cache_size: null
135
+ exclude_weight_decay: false
136
+ exclude_weight_decay_conf: {}
137
+ optim: adam
138
+ optim_conf:
139
+ lr: 0.0001
140
+ eps: 1.0e-08
141
+ weight_decay: 1.0e-05
142
+ scheduler: warmupsteplr
143
+ scheduler_conf:
144
+ step_size: 1
145
+ gamma: 0.98
146
+ warmup_steps: 4000
147
+ init: null
148
+ model_conf:
149
+ normalize_variance_per_ch: true
150
+ categories:
151
+ - 1ch_8000Hz
152
+ - 1ch_16000Hz
153
+ - 1ch_22050Hz
154
+ - 1ch_24000Hz
155
+ - 1ch_32000Hz
156
+ - 1ch_44100Hz
157
+ - 1ch_48000Hz
158
+ - 1ch_8000Hz_reverb
159
+ - 1ch_16000Hz_reverb
160
+ - 1ch_22050Hz_reverb
161
+ - 1ch_24000Hz_reverb
162
+ - 1ch_32000Hz_reverb
163
+ - 1ch_44100Hz_reverb
164
+ - 1ch_48000Hz_reverb
165
+ criterions:
166
+ - name: mr_l1_tfd
167
+ conf:
168
+ window_sz:
169
+ - 256
170
+ - 512
171
+ - 768
172
+ - 1024
173
+ hop_sz: null
174
+ eps: 1.0e-08
175
+ time_domain_weight: 0.5
176
+ normalize_variance: true
177
+ wrapper: fixed_order
178
+ wrapper_conf:
179
+ weight: 1.0
180
+ - name: si_snr
181
+ conf:
182
+ eps: 1.0e-07
183
+ wrapper: fixed_order
184
+ wrapper_conf:
185
+ weight: 0.0
186
+ speech_volume_normalize: null
187
+ rir_scp: null
188
+ rir_apply_prob: 1.0
189
+ noise_scp: null
190
+ noise_apply_prob: 1.0
191
+ noise_db_range: '13_15'
192
+ short_noise_thres: 0.5
193
+ use_reverberant_ref: false
194
+ num_spk: 1
195
+ num_noise_type: 1
196
+ sample_rate: 8000
197
+ force_single_channel: false
198
+ channel_reordering: false
199
+ categories: []
200
+ speech_segment: null
201
+ avoid_allzero_segment: true
202
+ flexible_numspk: false
203
+ dynamic_mixing: false
204
+ utt2spk: null
205
+ dynamic_mixing_gain_db: 0.0
206
+ encoder: stft
207
+ encoder_conf:
208
+ n_fft: 256
209
+ hop_length: 128
210
+ use_builtin_complex: true
211
+ default_fs: 8000
212
+ separator: tfgridnetv3
213
+ separator_conf:
214
+ n_srcs: 1
215
+ n_imics: 1
216
+ n_layers: 6
217
+ lstm_hidden_units: 200
218
+ attn_n_head: 4
219
+ attn_qk_output_channel: 2
220
+ emb_dim: 48
221
+ emb_ks: 4
222
+ emb_hs: 1
223
+ activation: prelu
224
+ eps: 1.0e-05
225
+ decoder: stft
226
+ decoder_conf:
227
+ n_fft: 256
228
+ hop_length: 128
229
+ default_fs: 8000
230
+ mask_module: multi_mask
231
+ mask_module_conf: {}
232
+ preprocessor: enh
233
+ preprocessor_conf:
234
+ speech_volume_normalize: 0.5_1.0
235
+ rir_scp: dump/raw/rir_train.scp
236
+ rir_apply_prob: 0.5
237
+ noise_scp: dump/raw/noise_train.scp
238
+ noise_apply_prob: 1.0
239
+ noise_db_range: '-5_15'
240
+ force_single_channel: true
241
+ channel_reordering: true
242
+ categories:
243
+ - 1ch_8000Hz
244
+ - 1ch_16000Hz
245
+ - 1ch_22050Hz
246
+ - 1ch_24000Hz
247
+ - 1ch_32000Hz
248
+ - 1ch_44100Hz
249
+ - 1ch_48000Hz
250
+ - 1ch_8000Hz_reverb
251
+ - 1ch_16000Hz_reverb
252
+ - 1ch_22050Hz_reverb
253
+ - 1ch_24000Hz_reverb
254
+ - 1ch_32000Hz_reverb
255
+ - 1ch_44100Hz_reverb
256
+ - 1ch_48000Hz_reverb
257
+ data_aug_effects:
258
+ - - 1.0
259
+ - bandwidth_limitation
260
+ - res_type: random
261
+ - - 1.0
262
+ - clipping
263
+ - min_quantile: 0.1
264
+ max_quantile: 0.9
265
+ - - 1.0
266
+ - - - 0.5
267
+ - codec
268
+ - format: mp3
269
+ encoder: null
270
+ qscale:
271
+ - 1
272
+ - 10
273
+ - - 0.5
274
+ - codec
275
+ - format: ogg
276
+ encoder:
277
+ - vorbis
278
+ - opus
279
+ qscale:
280
+ - -1
281
+ - 10
282
+ - - 1.0
283
+ - packet_loss
284
+ - packet_duration_ms: 20
285
+ packet_loss_rate:
286
+ - 0.05
287
+ - 0.25
288
+ max_continuous_packet_loss: 10
289
+ data_aug_num:
290
+ - 1
291
+ - 3
292
+ data_aug_prob: 0.75
293
+ diffusion_model: null
294
+ diffusion_model_conf: {}
295
+ required:
296
+ - output_dir
297
+ version: '202409'
298
+ distributed: false
train_enh_tfgridnet_dm_raw/images/backward_time.png ADDED
train_enh_tfgridnet_dm_raw/images/clip.png ADDED
train_enh_tfgridnet_dm_raw/images/forward_time.png ADDED
train_enh_tfgridnet_dm_raw/images/gpu_max_cached_mem_GB.png ADDED
train_enh_tfgridnet_dm_raw/images/grad_norm.png ADDED
train_enh_tfgridnet_dm_raw/images/iter_time.png ADDED
train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_16000Hz.png ADDED
train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_16000Hz_reverb.png ADDED
train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_22050Hz.png ADDED
train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_22050Hz_reverb.png ADDED
train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_24000Hz.png ADDED
train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_24000Hz_reverb.png ADDED
train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_32000Hz.png ADDED
train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_32000Hz_reverb.png ADDED
train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_44100Hz.png ADDED
train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_44100Hz_reverb.png ADDED
train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_48000Hz.png ADDED
train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_48000Hz_reverb.png ADDED
train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_8000Hz.png ADDED
train_enh_tfgridnet_dm_raw/images/l1_timedomain+magspec_loss_1ch_8000Hz_reverb.png ADDED
train_enh_tfgridnet_dm_raw/images/loss.png ADDED
train_enh_tfgridnet_dm_raw/images/loss_scale.png ADDED
train_enh_tfgridnet_dm_raw/images/optim0_lr0.png ADDED
train_enh_tfgridnet_dm_raw/images/optim_step_time.png ADDED
train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_16000Hz.png ADDED
train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_16000Hz_reverb.png ADDED
train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_22050Hz.png ADDED
train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_22050Hz_reverb.png ADDED
train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_24000Hz.png ADDED
train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_24000Hz_reverb.png ADDED
train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_32000Hz.png ADDED
train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_32000Hz_reverb.png ADDED
train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_44100Hz.png ADDED
train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_44100Hz_reverb.png ADDED
train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_48000Hz.png ADDED
train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_48000Hz_reverb.png ADDED
train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_8000Hz.png ADDED
train_enh_tfgridnet_dm_raw/images/si_snr_loss_1ch_8000Hz_reverb.png ADDED
train_enh_tfgridnet_dm_raw/images/train_time.png ADDED
train_enh_tfgridnet_dm_raw/valid.loss.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8350b6f84bb5de01646b7cebe9d19d5b1fc4318cd85c31d242caccc2442d276e
3
+ size 34192512