line-corporation
/

open-universe

+---
+# Model-card metadata (YAML front matter) for the open-universe checkpoints.
+# All sequences use the flush (non-indented) style.
+language:
+- en
+# No thumbnail is set; written as an explicit null instead of a bare key.
+thumbnail: null
+tags:
+- audio-to-audio
+- Speech Enhancement
+- Voicebank-DEMAND
+- UNIVERSE
+- UNIVERSE++
+- Diffusion
+- pytorch
+- open-universe
+license: apache-2.0
+datasets:
+- Voicebank-DEMAND
+metrics:
+- SI-SNR
+- PESQ
+- SIG
+- BAK
+- OVRL
+model-index:
+- name: open-universe
+  results:
+  - task:
+      name: Speech Enhancement
+      type: speech-enhancement
+    dataset:
+      name: Voicebank-DEMAND
+      type: voicebank-demand
+      split: test-set
+      args:
+        language: en
+    # Same figures as the UNIVERSE++ row of the results table in this card.
+    # Values are kept as quoted strings, exactly as originally published.
+    metrics:
+    - name: DNSMOS SIG
+      type: sig
+      value: '3.493'
+    - name: DNSMOS BAK
+      type: bak
+      value: '4.042'
+    - name: DNSMOS OVRL
+      type: ovrl
+      value: '3.205'
+---
+# Open-UNIVERSE: Generative Speech Enhancement with Score-based Diffusion and Adversarial Training
+This repository contains the configurations and weights for the UNIVERSE++ (see the citation below) and
+[UNIVERSE](https://arxiv.org/abs/2206.03065) models implemented in [open-universe](https://github.com/line/open-universe).
+The models were trained on the [Voicebank-DEMAND](https://datashare.ed.ac.uk/handle/10283/2791) dataset at 16 kHz.
+The performance on the test split of Voicebank-DEMAND is given in the following table.
+| model      |   si-sdr |   pesq-wb |   stoi-ext |   lsd |   lps |   OVRL |   SIG |   BAK |
+|------------|----------|-----------|------------|-------|-------|--------|-------|-------|
+| UNIVERSE++ |   18.629 |     3.017 |      0.865 | 4.868 | 0.937 |  3.205 | 3.493 | 4.042 |
+| UNIVERSE   |   17.594 |     2.834 |      0.845 | 6.318 | 0.920 |  3.156 | 3.455 | 4.013 |
+## Usage
+Start by installing `open-universe`.
+We use conda to simplify the installation.
+```sh
+git clone https://github.com/line/open-universe.git
+cd open-universe
+conda env create -f environment.yaml
+conda activate open-universe
+python -m pip install .
+```
+Then the models can be used as follows.
+```sh
+# UNIVERSE++
+python -m open_universe.bin.enhance <input/folder> <output/folder>
+# UNIVERSE
+python -m open_universe.bin.enhance <input/folder> <output/folder> \
+  --model line-corporation/open-universe:original
+```
+## Referencing open-universe and UNIVERSE++
+If you use these models in your work, please consider citing the following paper.
+```latex
+@inproceedings{universepp,
+    author={Scheibler, Robin and Fujita, Yusuke and Shirahata, Yuma and Komatsu, Tatsuya},
+    title={Universal Score-based Speech Enhancement with High Content Preservation},
+    booktitle={Proc. Interspeech 2024},
+    month=sep,
+    year=2024
+}
+```
+## Referencing UNIVERSE
+```latex
+@misc{universe,
+    author={Serr{\`a}, Joan and Pascual, Santiago and Pons, Jordi and Araz, R. Oguz and Scaini, Davide},
+    title={Universal Speech Enhancement with Score-based Diffusion},
+    howpublished={arXiv:2206.03065},
+    month=sep,
+    year=2022
+}
+```

config.yaml ADDED Viewed

	@@ -0,0 +1,178 @@

+# Run-level settings of the experiment configuration.
+seed: 46762398
+version_base: null
+# null: no checkpoint path is set.
+ckpt_path: null
+# Presumably stage switches (training on, testing off) - confirm against
+# the training entry point.
+train: true
+test: false
+# Output locations, given as relative paths.
+path:
+  figures: figures
+  exp_root: exp
+# Data pipeline: which named dataset each split uses, the loader options per
+# split, and the registry of named datasets the splits refer to.
+datamodule:
+  _target_: open_universe.datasets.DataModule
+  # Training split: the only one that shuffles and uses batches larger than 1.
+  train:
+    dataset: vb-train-16k
+    dl_opts:
+      batch_size: 10
+      shuffle: true
+      num_workers: 6
+      pin_memory: true
+  # Validation and test splits share identical loader options.
+  val:
+    dataset: vb-val-16k
+    dl_opts:
+      batch_size: 1
+      shuffle: false
+      num_workers: 4
+      pin_memory: true
+  test:
+    dataset: vb-test-16k
+    dl_opts:
+      batch_size: 1
+      shuffle: false
+      num_workers: 4
+      pin_memory: true
+  # Named datasets. Within each family (16k / 24k) the val and test entries
+  # take audio_path and fs from the train entry via the relative
+  # interpolation ${..<name>.<key>}, which resolves against this `datasets`
+  # mapping. Only the train entries set a non-null audio_len
+  # (presumably seconds - confirm against NoisyDataset).
+  datasets:
+    vb-train-16k:
+      _target_: open_universe.datasets.NoisyDataset
+      split: train
+      audio_path: data/voicebank_demand/16k
+      fs: 16000
+      audio_len: 2.0
+      augmentation: false
+    vb-val-16k:
+      _target_: open_universe.datasets.NoisyDataset
+      split: val
+      audio_path: ${..vb-train-16k.audio_path}
+      fs: ${..vb-train-16k.fs}
+      audio_len: null
+      augmentation: false
+    vb-test-16k:
+      _target_: open_universe.datasets.NoisyDataset
+      split: test
+      audio_path: ${..vb-train-16k.audio_path}
+      fs: ${..vb-train-16k.fs}
+      audio_len: null
+      augmentation: false
+    # The 24k entries are defined but not selected by train/val/test above.
+    vb-train-24k:
+      _target_: open_universe.datasets.NoisyDataset
+      split: train
+      audio_path: data/voicebank_demand/24k
+      fs: 24000
+      audio_len: 2.0
+      augmentation: false
+    vb-val-24k:
+      _target_: open_universe.datasets.NoisyDataset
+      split: val
+      audio_path: ${..vb-train-24k.audio_path}
+      fs: ${..vb-train-24k.fs}
+      audio_len: null
+      augmentation: false
+    vb-test-24k:
+      _target_: open_universe.datasets.NoisyDataset
+      split: test
+      audio_path: ${..vb-train-24k.audio_path}
+      fs: ${..vb-train-24k.fs}
+      audio_len: null
+      augmentation: false
+# Model definition plus its losses, optimisation and validation settings.
+model:
+  _target_: open_universe.networks.universe.Universe
+  # Same value as the fs of the vb-*-16k dataset entries.
+  fs: 16000
+  normalization_norm: 2
+  normalization_kwargs:
+    ref: both
+    level_db: -26.0
+  # Score network. Several of its settings are reused by condition_model
+  # below through ${model.score_model.*} interpolations.
+  score_model:
+    _target_: open_universe.networks.universe.ScoreNetwork
+    fb_kernel_size: 3
+    rate_factors: [2, 4, 4, 5]
+    n_channels: 32
+    n_rff: 32
+    noise_cond_dim: 512
+    encoder_gru_conv_sandwich: false
+    extra_conv_block: true
+    decoder_act_type: prelu
+    use_weight_norm: false
+    seq_model: gru
+    use_antialiasing: false
+  # Conditioner network. Interpolated keys follow score_model; the literal
+  # values here (n_mels, n_mel_oversample, encoder_gru_residual,
+  # decoder_act_type, use_antialiasing) are set independently.
+  condition_model:
+    _target_: open_universe.networks.universe.ConditionerNetwork
+    fb_kernel_size: ${model.score_model.fb_kernel_size}
+    rate_factors: ${model.score_model.rate_factors}
+    n_channels: ${model.score_model.n_channels}
+    n_mels: 80
+    n_mel_oversample: 4
+    encoder_gru_residual: true
+    extra_conv_block: ${model.score_model.extra_conv_block}
+    decoder_act_type: prelu
+    use_weight_norm: ${model.score_model.use_weight_norm}
+    seq_model: ${model.score_model.seq_model}
+    use_antialiasing: false
+  diffusion:
+    schedule: geometric
+    sigma_min: 0.0005
+    sigma_max: 5.0
+    n_steps: 8
+    epsilon: 1.3
+  # The three loss terms are weighted equally.
+  losses:
+    weights:
+      score: 1.0
+      signal: 1.0
+      latent: 1.0
+    mdn_n_comp: 3
+    mdn_alpha_per_sample: true
+    score_loss:
+      _target_: torch.nn.MSELoss
+  training:
+    # Tied to the 16 kHz training dataset entry specifically, not to
+    # whichever dataset datamodule.train.dataset names.
+    audio_len: ${datamodule.datasets.vb-train-16k.audio_len}
+    time_sampling: time_uniform
+    dynamic_mixing: false
+    ema_decay: 0.999
+  validation:
+    # main_loss_mode is max: a higher val/pesq counts as better.
+    main_loss: val/pesq
+    main_loss_mode: max
+    n_bins: 5
+    max_enh_batches: 4
+    num_tb_samples: 0
+    enh_losses:
+      # The key doubles as the "val/" prefix seen in main_loss above.
+      val/:
+        _target_: open_universe.metrics.EvalMetrics
+        audio_fs: ${model.fs}
+  optimizer:
+    _target_: torch.optim.AdamW
+    lr: 0.0002
+    weight_decay: 0.01
+    weight_decay_exclude: [prelu, bias]
+    lr_warmup: null
+    betas: [0.8, 0.99]
+  scheduler:
+    scheduler:
+      _target_: open_universe.utils.schedulers.LinearWarmupCosineAnnealingLR
+      T_warmup: 50000
+      T_cosine: 50001
+      eta_min: 1.6e-06
+      # Follows the trainer's step budget.
+      T_max: ${trainer.max_steps}
+    # The scheduler is stepped once per optimisation step.
+    interval: step
+    frequency: 1
+  grad_clipper:
+    _target_: open_universe.utils.FixedClipper
+    max_norm: 1000.0
+# Keyword arguments for the PyTorch Lightning Trainer.
+trainer:
+  _target_: pytorch_lightning.Trainer
+  # Hardware / distribution.
+  accelerator: gpu
+  devices: -1
+  strategy: ddp_find_unused_parameters_true
+  # Training length: the step budget is the effective limit, since
+  # max_epochs is -1.
+  max_steps: 300000
+  max_epochs: -1
+  min_epochs: 1
+  accumulate_grad_batches: 1
+  # Validation cadence is given in steps; the per-epoch setting is null.
+  val_check_interval: 5000
+  check_val_every_n_epoch: null
+  # Miscellaneous.
+  deterministic: warn
+  default_root_dir: .
+  profiler: false

weights.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ee7b031d055bd65d1e849426ba7867bf1416b53adf46e32c4a69312768361222
+size 901069356