fakufaku committed on
Commit
f0d6f7d
1 Parent(s): ad0342a

UNIVERSE original version

Files changed (3)
  1. README.md +106 -0
  2. config.yaml +178 -0
  3. weights.ckpt +3 -0
README.md ADDED
@@ -0,0 +1,106 @@
---
language:
- "en"
thumbnail:
tags:
- audio-to-audio
- Speech Enhancement
- Voicebank-DEMAND
- UNIVERSE
- UNIVERSE++
- Diffusion
- pytorch
- open-universe
license: "apache-2.0"
datasets:
- Voicebank-DEMAND
metrics:
- SI-SNR
- PESQ
- SIG
- BAK
- OVRL
model-index:
- name: open-universe
  results:
  - task:
      name: Speech Enhancement
      type: speech-enhancement
    dataset:
      name: Voicebank-DEMAND
      type: voicebank-demand
      split: test-set
      args:
        language: en
    metrics:
    - name: DNSMOS SIG
      type: sig
      value: '3.493'
    - name: DNSMOS BAK
      type: bak
      value: '4.042'
    - name: DNSMOS OVRL
      type: ovrl
      value: '3.205'
---
# Open-UNIVERSE: Generative Speech Enhancement with Score-based Diffusion and Adversarial Training

This repository contains the configurations and weights for the [UNIVERSE++](tba) and
[UNIVERSE](https://arxiv.org/abs/2206.03065) models implemented in [open-universe](https://github.com/line/open-universe).

The models were trained on the [Voicebank-DEMAND](https://datashare.ed.ac.uk/handle/10283/2791) dataset at 16 kHz.

The performance on the test split of Voicebank-DEMAND is given in the following table.

| model      | SI-SDR | PESQ-WB | STOI-ext | LSD   | LPS   | OVRL  | SIG   | BAK   |
|------------|--------|---------|----------|-------|-------|-------|-------|-------|
| UNIVERSE++ | 18.629 | 3.017   | 0.865    | 4.868 | 0.937 | 3.205 | 3.493 | 4.042 |
| UNIVERSE   | 17.594 | 2.834   | 0.845    | 6.318 | 0.920 | 3.156 | 3.455 | 4.013 |
## Usage

Start by installing `open-universe`.
We use conda to simplify the installation.
```sh
git clone https://github.com/line/open-universe.git
cd open-universe
conda env create -f environment.yaml
conda activate open-universe
python -m pip install .
```

Then the models can be used as follows.
```sh
# UNIVERSE++
python -m open_universe.bin.enhance <input/folder> <output/folder>

# UNIVERSE
python -m open_universe.bin.enhance <input/folder> <output/folder> \
    --model line-corporation/open-universe:original
```
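
The files in this repository (`config.yaml` and `weights.ckpt`) can also be fetched programmatically. The snippet below is a minimal sketch using `huggingface_hub`; the repository id and the `original` revision are assumptions inferred from the `--model` argument above, not documented values.

```python
# Minimal sketch: download this repository's config and weights with huggingface_hub.
# The repo id and the "original" revision are assumptions based on the --model flag above.
from huggingface_hub import hf_hub_download

repo_id = "line-corporation/open-universe"  # assumed repository id
revision = "original"                       # assumed branch for the original UNIVERSE model

config_path = hf_hub_download(repo_id=repo_id, filename="config.yaml", revision=revision)
ckpt_path = hf_hub_download(repo_id=repo_id, filename="weights.ckpt", revision=revision)
print(config_path, ckpt_path)
```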

## Referencing open-universe and UNIVERSE++

If you use these models in your work, please consider citing the following paper.

```latex
@inproceedings{universepp,
  author={Scheibler, Robin and Fujita, Yusuke and Shirahata, Yuma and Komatsu, Tatsuya},
  title={Universal Score-based Speech Enhancement with High Content Preservation},
  booktitle={Proc. Interspeech 2024},
  month=sep,
  year=2024
}
```

## Referencing UNIVERSE

```latex
@misc{universe,
  author={Serr\`a, Joan and Pascual, Santiago and Pons, Jordi and Araz, R. Oguz and Scaini, Davide},
  title={Universal Speech Enhancement with Score-based Diffusion},
  howpublished={arXiv:2206.03065},
  month=sep,
  year=2022
}
```
config.yaml ADDED
@@ -0,0 +1,178 @@
seed: 46762398
ckpt_path: null
train: true
test: false
path:
  exp_root: exp
  figures: figures
version_base: null
datamodule:
  _target_: open_universe.datasets.DataModule
  train:
    dataset: vb-train-16k
    dl_opts:
      pin_memory: true
      num_workers: 6
      shuffle: true
      batch_size: 10
  val:
    dataset: vb-val-16k
    dl_opts:
      pin_memory: true
      num_workers: 4
      shuffle: false
      batch_size: 1
  test:
    dataset: vb-test-16k
    dl_opts:
      pin_memory: true
      num_workers: 4
      shuffle: false
      batch_size: 1
  datasets:
    vb-train-16k:
      _target_: open_universe.datasets.NoisyDataset
      audio_path: data/voicebank_demand/16k
      fs: 16000
      split: train
      audio_len: 2.0
      augmentation: false
    vb-val-16k:
      _target_: open_universe.datasets.NoisyDataset
      audio_path: ${..vb-train-16k.audio_path}
      fs: ${..vb-train-16k.fs}
      split: val
      audio_len: null
      augmentation: false
    vb-test-16k:
      _target_: open_universe.datasets.NoisyDataset
      audio_path: ${..vb-train-16k.audio_path}
      fs: ${..vb-train-16k.fs}
      split: test
      audio_len: null
      augmentation: false
    vb-train-24k:
      _target_: open_universe.datasets.NoisyDataset
      audio_path: data/voicebank_demand/24k
      fs: 24000
      split: train
      audio_len: 2.0
      augmentation: false
    vb-val-24k:
      _target_: open_universe.datasets.NoisyDataset
      audio_path: ${..vb-train-24k.audio_path}
      fs: ${..vb-train-24k.fs}
      split: val
      audio_len: null
      augmentation: false
    vb-test-24k:
      _target_: open_universe.datasets.NoisyDataset
      audio_path: ${..vb-train-24k.audio_path}
      fs: ${..vb-train-24k.fs}
      split: test
      audio_len: null
      augmentation: false
model:
  _target_: open_universe.networks.universe.Universe
  fs: 16000
  normalization_norm: 2
  normalization_kwargs:
    ref: both
    level_db: -26.0
  score_model:
    _target_: open_universe.networks.universe.ScoreNetwork
    fb_kernel_size: 3
    rate_factors:
    - 2
    - 4
    - 4
    - 5
    n_channels: 32
    n_rff: 32
    noise_cond_dim: 512
    encoder_gru_conv_sandwich: false
    extra_conv_block: true
    decoder_act_type: prelu
    use_weight_norm: false
    seq_model: gru
    use_antialiasing: false
  condition_model:
    _target_: open_universe.networks.universe.ConditionerNetwork
    fb_kernel_size: ${model.score_model.fb_kernel_size}
    rate_factors: ${model.score_model.rate_factors}
    n_channels: ${model.score_model.n_channels}
    n_mels: 80
    n_mel_oversample: 4
    encoder_gru_residual: true
    extra_conv_block: ${model.score_model.extra_conv_block}
    decoder_act_type: prelu
    use_weight_norm: ${model.score_model.use_weight_norm}
    seq_model: ${model.score_model.seq_model}
    use_antialiasing: false
  diffusion:
    schedule: geometric
    sigma_min: 0.0005
    sigma_max: 5.0
    n_steps: 8
    epsilon: 1.3
  losses:
    weights:
      score: 1.0
      signal: 1.0
      latent: 1.0
    mdn_n_comp: 3
    mdn_alpha_per_sample: true
    score_loss:
      _target_: torch.nn.MSELoss
  training:
    audio_len: ${datamodule.datasets.vb-train-16k.audio_len}
    time_sampling: time_uniform
    dynamic_mixing: false
    ema_decay: 0.999
  validation:
    main_loss: val/pesq
    main_loss_mode: max
    n_bins: 5
    max_enh_batches: 4
    num_tb_samples: 0
    enh_losses:
      val/:
        _target_: open_universe.metrics.EvalMetrics
        audio_fs: ${model.fs}
  optimizer:
    _target_: torch.optim.AdamW
    lr: 0.0002
    weight_decay: 0.01
    weight_decay_exclude:
    - prelu
    - bias
    lr_warmup: null
    betas:
    - 0.8
    - 0.99
  scheduler:
    scheduler:
      _target_: open_universe.utils.schedulers.LinearWarmupCosineAnnealingLR
      T_warmup: 50000
      T_cosine: 50001
      eta_min: 1.6e-06
      T_max: ${trainer.max_steps}
    interval: step
    frequency: 1
  grad_clipper:
    _target_: open_universe.utils.FixedClipper
    max_norm: 1000.0
trainer:
  _target_: pytorch_lightning.Trainer
  accumulate_grad_batches: 1
  min_epochs: 1
  max_epochs: -1
  max_steps: 300000
  deterministic: warn
  accelerator: gpu
  devices: -1
  strategy: ddp_find_unused_parameters_true
  check_val_every_n_epoch: null
  val_check_interval: 5000
  default_root_dir: .
  profiler: false
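
For reference, the `diffusion` block specifies a geometric noise schedule between `sigma_max: 5.0` and `sigma_min: 0.0005` with `n_steps: 8`. The snippet below is a minimal sketch of the usual geometric (log-uniform) interpolation for these values; the exact indexing used inside open-universe may differ.

```python
# Minimal sketch of a geometric (log-uniform) noise schedule for the values
# in config.yaml; the exact indexing used inside open-universe may differ.
import numpy as np

sigma_min, sigma_max, n_steps = 0.0005, 5.0, 8
steps = np.arange(n_steps)
sigmas = sigma_max * (sigma_min / sigma_max) ** (steps / (n_steps - 1))
print(sigmas)  # decays log-uniformly from 5.0 down to 0.0005
```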
weights.ckpt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ee7b031d055bd65d1e849426ba7867bf1416b53adf46e32c4a69312768361222
size 901069356
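
The entry above is a Git LFS pointer; the actual checkpoint (about 900 MB) is downloaded when the repository is cloned with LFS enabled or fetched from the Hub. For use outside the `enhance` command line, the snippet below is a minimal sketch that builds the model from `config.yaml` and loads the weights. It assumes a standard PyTorch Lightning checkpoint layout with a `state_dict` key and that the `model` section instantiates directly with Hydra; both are assumptions, not documented open-universe behaviour.

```python
# Minimal sketch: build the model from config.yaml with Hydra/OmegaConf and load
# the Lightning checkpoint. Assumes a standard "state_dict" key; sub-module
# construction and state-dict key prefixes may need adjusting for open-universe.
import torch
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.load("config.yaml")
model = instantiate(cfg.model)  # open_universe.networks.universe.Universe

ckpt = torch.load("weights.ckpt", map_location="cpu")
model.load_state_dict(ckpt["state_dict"])
model.eval()
```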