|
--- |
|
tags: |
|
- espnet |
|
- audio |
|
- universa |
|
language: multilingual |
|
datasets: |
|
- urgent24 |
|
license: cc-by-4.0 |
|
--- |
|
|
|
## ESPnet2 universa model |
|
|
|
### `espnet/universa-wavlm_base_urgent24_multi-metric_fullref` |
|
|
|
This model was trained by ftshijt using the urgent24 recipe in [ESPnet](https://github.com/espnet/espnet/). |
|
|
|
### Demo: How to use in ESPnet2 |
|
|
|
Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) |
|
if you haven't done that already. |
|
|
|
```bash |
|
cd espnet |
|
git checkout 5dbbd4cec6d4ccdd840484207951770027a8d4b8 |
|
pip install -e . |
|
cd egs2/urgent24/uni_versa1 |
|
./run.sh --skip_data_prep false --skip_train true --download_model espnet/universa-wavlm_base_urgent24_multi-metric_fullref |
|
``` |
|
|
|
|
|
|
|
## universa config |
|
|
|
<details><summary>expand</summary> |
|
|
|
``` |
|
config: conf/train_universa_wavlm_freeze.yaml |
|
print_config: false |
|
log_level: INFO |
|
drop_last_iter: false |
|
dry_run: false |
|
iterator_type: sequence |
|
valid_iterator_type: null |
|
output_dir: update_exp/universa_train_universa_wavlm_freeze_raw_fs16000 |
|
ngpu: 1 |
|
seed: 777 |
|
num_workers: 1 |
|
num_att_plot: 0 |
|
dist_backend: nccl |
|
dist_init_method: env:// |
|
dist_world_size: null |
|
dist_rank: null |
|
local_rank: 0 |
|
dist_master_addr: null |
|
dist_master_port: null |
|
dist_launcher: null |
|
multiprocessing_distributed: false |
|
unused_parameters: false |
|
sharded_ddp: false |
|
use_deepspeed: false |
|
deepspeed_config: null |
|
cudnn_enabled: true |
|
cudnn_benchmark: false |
|
cudnn_deterministic: false |
|
use_tf32: false |
|
collect_stats: false |
|
write_collected_feats: false |
|
max_epoch: 100 |
|
patience: null |
|
val_scheduler_criterion: |
|
- valid |
|
- loss |
|
early_stopping_criterion: |
|
- valid |
|
- loss |
|
- min |
|
best_model_criterion: |
|
- - train |
|
- loss |
|
- min |
|
- - valid |
|
- loss |
|
- min |
|
- - train |
|
- acc |
|
- max |
|
- - valid |
|
- acc |
|
- max |
|
keep_nbest_models: 5 |
|
nbest_averaging_interval: 0 |
|
grad_clip: -1 |
|
grad_clip_type: 2.0 |
|
grad_noise: false |
|
accum_grad: 1 |
|
no_forward_run: false |
|
resume: true |
|
train_dtype: float32 |
|
use_amp: false |
|
log_interval: 50 |
|
use_matplotlib: true |
|
use_tensorboard: true |
|
create_graph_in_tensorboard: false |
|
use_wandb: false |
|
wandb_project: null |
|
wandb_id: null |
|
wandb_entity: null |
|
wandb_name: null |
|
wandb_model_log_interval: -1 |
|
detect_anomaly: false |
|
use_adapter: false |
|
adapter: lora |
|
save_strategy: all |
|
adapter_conf: {} |
|
pretrain_path: null |
|
init_param: [] |
|
ignore_init_mismatch: false |
|
freeze_param: |
|
- frontend.upstream |
|
num_iters_per_epoch: null |
|
batch_size: 16 |
|
valid_batch_size: null |
|
batch_bins: 1000000 |
|
valid_batch_bins: null |
|
category_sample_size: 10 |
|
train_shape_file: |
|
- update_exp/universa_stats_raw/train/audio_shape |
|
- update_exp/universa_stats_raw/train/ref_audio_shape |
|
- update_exp/universa_stats_raw/train/ref_text_shape |
|
valid_shape_file: |
|
- update_exp/universa_stats_raw/valid/audio_shape |
|
- update_exp/universa_stats_raw/valid/ref_audio_shape |
|
- update_exp/universa_stats_raw/valid/ref_text_shape |
|
batch_type: sorted |
|
valid_batch_type: null |
|
fold_length: |
|
- 256000 |
|
sort_in_batch: descending |
|
shuffle_within_batch: false |
|
sort_batch: descending |
|
multiple_iterator: false |
|
chunk_length: 500 |
|
chunk_shift_ratio: 0.5 |
|
num_cache_chunks: 1024 |
|
chunk_excluded_key_prefixes: [] |
|
chunk_default_fs: null |
|
chunk_max_abs_length: null |
|
chunk_discard_short_samples: true |
|
train_data_path_and_name_and_type: |
|
- - dump_ark/raw/train_update/wav.scp |
|
- audio |
|
- kaldi_ark |
|
- - dump_ark/raw/train_update/metric.scp |
|
- metrics |
|
- metric |
|
- - dump_ark/raw/train_update/ref_wav.scp |
|
- ref_audio |
|
- kaldi_ark |
|
- - dump_ark/raw/train_update/text |
|
- ref_text |
|
- text |
|
valid_data_path_and_name_and_type: |
|
- - dump_ark/raw/dev_update/wav.scp |
|
- audio |
|
- kaldi_ark |
|
- - dump_ark/raw/dev_update/metric.scp |
|
- metrics |
|
- metric |
|
- - dump_ark/raw/dev_update/ref_wav.scp |
|
- ref_audio |
|
- kaldi_ark |
|
- - dump_ark/raw/dev_update/text |
|
- ref_text |
|
- text |
|
multi_task_dataset: false |
|
allow_variable_data_keys: false |
|
max_cache_size: 0.0 |
|
max_cache_fd: 32 |
|
allow_multi_rates: false |
|
valid_max_cache_size: null |
|
exclude_weight_decay: false |
|
exclude_weight_decay_conf: {} |
|
optim: adamw |
|
optim_conf: |
|
lr: 0.001 |
|
scheduler: warmuplr |
|
scheduler_conf: |
|
warmup_steps: 25000 |
|
metric2id: dump_ark/raw/train_update/metric2id |
|
metric2type: null |
|
metric_pad_value: -100 |
|
token_list: |
|
- <blank> |
|
- <unk> |
|
- s |
|
- ▁ |
|
- t |
|
- e |
|
- ▁the |
|
- i |
|
- a |
|
- o |
|
- ▁a |
|
- r |
|
- ▁to |
|
- d |
|
- ▁and |
|
- '''' |
|
- m |
|
- n |
|
- ing |
|
- u |
|
- y |
|
- p |
|
- c |
|
- ▁of |
|
- l |
|
- ed |
|
- ▁I |
|
- ▁in |
|
- er |
|
- re |
|
- ▁it |
|
- ▁you |
|
- ar |
|
- ▁f |
|
- ▁is |
|
- ▁that |
|
- ',' |
|
- . |
|
- in |
|
- al |
|
- g |
|
- 'on' |
|
- ▁b |
|
- b |
|
- or |
|
- ▁c |
|
- ▁s |
|
- f |
|
- h |
|
- ▁we |
|
- an |
|
- en |
|
- ▁for |
|
- le |
|
- ▁p |
|
- ly |
|
- es |
|
- w |
|
- ▁re |
|
- ▁on |
|
- ▁m |
|
- ▁be |
|
- ic |
|
- ll |
|
- th |
|
- ▁he |
|
- k |
|
- ur |
|
- ve |
|
- ▁with |
|
- ▁so |
|
- ▁from |
|
- ▁was |
|
- v |
|
- ch |
|
- st |
|
- ▁w |
|
- ▁i |
|
- ▁this |
|
- ▁de |
|
- ▁like |
|
- ▁do |
|
- ce |
|
- at |
|
- il |
|
- ck |
|
- ▁A |
|
- ▁have |
|
- ▁not |
|
- ad |
|
- ▁st |
|
- ow |
|
- ro |
|
- ne |
|
- ▁me |
|
- ▁my |
|
- ▁but |
|
- ation |
|
- ▁at |
|
- ▁or |
|
- '-' |
|
- ter |
|
- ent |
|
- ▁B |
|
- ▁n |
|
- ▁know |
|
- ▁t |
|
- out |
|
- ▁are |
|
- nd |
|
- ▁one |
|
- ▁li |
|
- ▁g |
|
- ▁The |
|
- ol |
|
- ion |
|
- te |
|
- ▁go |
|
- ut |
|
- ▁as |
|
- ▁just |
|
- as |
|
- ▁sh |
|
- ▁they |
|
- is |
|
- ▁C |
|
- et |
|
- ▁h |
|
- ▁an |
|
- ▁there |
|
- ▁up |
|
- ▁S |
|
- ▁M |
|
- ▁she |
|
- ▁by |
|
- ▁su |
|
- om |
|
- ▁can |
|
- us |
|
- ▁your |
|
- ng |
|
- ▁con |
|
- el |
|
- ▁us |
|
- ment |
|
- z |
|
- ▁see |
|
- ▁ab |
|
- ▁what |
|
- ▁out |
|
- ▁her |
|
- me |
|
- ate |
|
- ▁all |
|
- ▁th |
|
- ▁if |
|
- ▁right |
|
- ▁his |
|
- ▁ma |
|
- ▁lo |
|
- ▁which |
|
- ide |
|
- ▁P |
|
- ▁more |
|
- ▁then |
|
- ul |
|
- ast |
|
- x |
|
- ight |
|
- ill |
|
- ▁So |
|
- ▁sp |
|
- ▁going |
|
- ▁some |
|
- ure |
|
- ▁their |
|
- ig |
|
- ▁no |
|
- ▁ro |
|
- ▁think |
|
- ▁who |
|
- ▁pro |
|
- ver |
|
- ive |
|
- est |
|
- ▁co |
|
- ▁di |
|
- '0' |
|
- ist |
|
- ▁k |
|
- age |
|
- ▁d |
|
- ▁time |
|
- ▁L |
|
- ies |
|
- ▁will |
|
- ▁man |
|
- ▁when |
|
- ▁D |
|
- les |
|
- ▁F |
|
- ▁want |
|
- ff |
|
- ity |
|
- ▁un |
|
- '?' |
|
- ▁start |
|
- ▁G |
|
- ▁uh |
|
- ▁get |
|
- ok |
|
- ▁take |
|
- ▁po |
|
- li |
|
- ▁ho |
|
- ▁way |
|
- ▁don |
|
- ▁yeah |
|
- ▁really |
|
- ▁say |
|
- ▁look |
|
- ▁good |
|
- ▁ra |
|
- ▁pr |
|
- ▁had |
|
- ttle |
|
- ▁comp |
|
- ort |
|
- ish |
|
- ▁ex |
|
- ally |
|
- ▁sa |
|
- ▁how |
|
- end |
|
- ant |
|
- ▁O |
|
- ▁um |
|
- way |
|
- ance |
|
- ▁other |
|
- ▁two |
|
- ine |
|
- ever |
|
- able |
|
- ▁com |
|
- other |
|
- ▁first |
|
- ▁back |
|
- ▁al |
|
- ers |
|
- ions |
|
- ▁now |
|
- ▁off |
|
- ning |
|
- ▁down |
|
- ▁has |
|
- ▁than |
|
- ▁car |
|
- ▁Th |
|
- very |
|
- ice |
|
- ▁dr |
|
- ▁been |
|
- ▁him |
|
- ▁here |
|
- ated |
|
- '5' |
|
- ▁hand |
|
- ▁day |
|
- ▁hear |
|
- each |
|
- ▁would |
|
- ▁over |
|
- ▁oh |
|
- ▁cha |
|
- ood |
|
- ▁did |
|
- ugh |
|
- ▁per |
|
- ▁let |
|
- ▁str |
|
- ▁tra |
|
- ▁got |
|
- ext |
|
- '1' |
|
- ▁We |
|
- ▁Shields |
|
- ▁come |
|
- ▁should |
|
- ▁could |
|
- light |
|
- '2' |
|
- ▁people |
|
- ▁again |
|
- ▁year |
|
- ▁app |
|
- ▁into |
|
- ▁any |
|
- ▁N |
|
- ▁mean |
|
- ▁o |
|
- ▁mus |
|
- ▁lot |
|
- ▁said |
|
- ▁long |
|
- ▁these |
|
- ▁lea |
|
- sh |
|
- ▁vi |
|
- ▁part |
|
- ▁every |
|
- ▁our |
|
- ▁You |
|
- ious |
|
- ▁fight |
|
- ▁Ch |
|
- ark |
|
- ▁may |
|
- ▁Hammer |
|
- ▁because |
|
- ▁most |
|
- ▁came |
|
- ▁four |
|
- ful |
|
- ▁No |
|
- ize |
|
- ▁where |
|
- ▁okay |
|
- ▁much |
|
- ▁ask |
|
- ▁through |
|
- ▁before |
|
- ▁work |
|
- ▁even |
|
- ▁three |
|
- mber |
|
- ▁win |
|
- ▁flight |
|
- ake |
|
- K |
|
- ▁place |
|
- ▁play |
|
- ▁though |
|
- ▁pound |
|
- ▁bit |
|
- land |
|
- ▁va |
|
- ▁talk |
|
- ▁kind |
|
- ▁Line |
|
- ▁make |
|
- hap |
|
- ▁big |
|
- ▁leav |
|
- ▁something |
|
- ▁game |
|
- ▁under |
|
- ▁feel |
|
- self |
|
- ▁give |
|
- ▁includ |
|
- U |
|
- ▁twenty |
|
- ▁guard |
|
- ▁left |
|
- ▁round |
|
- ▁great |
|
- body |
|
- ▁gra |
|
- ress |
|
- lso |
|
- '3' |
|
- ▁everything |
|
- ▁those |
|
- ▁after |
|
- ▁tell |
|
- ▁need |
|
- ▁yes |
|
- qua |
|
- ham |
|
- ▁minutes |
|
- ▁question |
|
- ▁around |
|
- ▁punch |
|
- ▁course |
|
- ▁gonna |
|
- ▁person |
|
- ▁move |
|
- ▁plan |
|
- ▁ear |
|
- ept |
|
- ▁Airport |
|
- ▁Okay |
|
- ▁found |
|
- ▁seven |
|
- ▁help |
|
- que |
|
- ▁qui |
|
- ▁keep |
|
- ▁guys |
|
- ▁house |
|
- ▁run |
|
- ▁turn |
|
- ▁better |
|
- ▁stop |
|
- ward |
|
- ddle |
|
- ▁second |
|
- ground |
|
- ▁world |
|
- ▁high |
|
- ▁point |
|
- ▁hold |
|
- ▁call |
|
- '6' |
|
- ▁actually |
|
- ▁probably |
|
- ▁heaven |
|
- ▁speci |
|
- ▁everyone |
|
- ▁why |
|
- ▁presen |
|
- ▁thir |
|
- lright |
|
- ▁eye |
|
- eath |
|
- ▁Tak |
|
- '!' |
|
- '"' |
|
- '4' |
|
- ▁hundred |
|
- ▁answer |
|
- ▁small |
|
- ▁wait |
|
- ▁nothing |
|
- q |
|
- '8' |
|
- V |
|
- ▁countr |
|
- ▁problem |
|
- ▁continu |
|
- ▁close |
|
- ▁priva |
|
- ▁20 |
|
- ▁pleas |
|
- ▁walk |
|
- ▁open |
|
- ▁lay |
|
- ▁Station |
|
- ▁moment |
|
- ▁Yeah |
|
- ▁public |
|
- possibl |
|
- ▁happen |
|
- together |
|
- ▁while |
|
- asically |
|
- ▁money |
|
- ▁wrong |
|
- B |
|
- ▁puzzle |
|
- '7' |
|
- ▁journ |
|
- ▁rainbow |
|
- ▁thousand |
|
- I |
|
- '9' |
|
- S |
|
- P |
|
- '%' |
|
- A |
|
- D |
|
- L |
|
- F |
|
- ’ |
|
- O |
|
- G |
|
- N |
|
- á |
|
- C |
|
- $ |
|
- Z |
|
- Y |
|
- R |
|
- E |
|
- J |
|
- W |
|
- M |
|
- H |
|
- j |
|
- – |
|
- ; |
|
- Q |
|
- X |
|
- ']' |
|
- − |
|
- '&' |
|
- T |
|
- '[' |
|
- <sos/eos> |
|
init: xavier_uniform |
|
model_conf: {} |
|
use_ref_audio: true |
|
use_ref_text: true |
|
use_preprocessor: true |
|
token_type: bpe |
|
bpemodel: data/token_list/bpe_unigram500/bpe.model |
|
non_linguistic_symbols: null |
|
cleaner: null |
|
g2p: null |
|
frontend: s3prl |
|
frontend_conf: |
|
frontend_conf: |
|
upstream: wavlm_large |
|
download_dir: ./hub |
|
multilayer_feature: true |
|
universa: base |
|
universa_conf: |
|
embedding_dim: 256 |
|
audio_encoder_type: transformer |
|
audio_encoder_params: |
|
num_blocks: 4 |
|
attention_heads: 4 |
|
linear_units: 1024 |
|
dropout_rate: 0.1 |
|
positional_dropout_rate: 0.1 |
|
attention_dropout_rate: 0.1 |
|
input_layer: conv2d |
|
normalize_before: true |
|
concat_after: false |
|
positionwise_layer_type: linear |
|
positionwise_conv_kernel_size: 1 |
|
layer_drop_rate: 0.1 |
|
qk_norm: false |
|
use_flash_attn: false |
|
text_encoder_type: transformer |
|
text_encoder_params: |
|
num_blocks: 4 |
|
attention_heads: 4 |
|
linear_units: 1024 |
|
dropout_rate: 0.1 |
|
positional_dropout_rate: 0.1 |
|
attention_dropout_rate: 0.1 |
|
input_layer: linear |
|
normalize_before: true |
|
concat_after: false |
|
positionwise_layer_type: linear |
|
positionwise_conv_kernel_size: 1 |
|
layer_drop_rate: 0.1 |
|
qk_norm: false |
|
use_flash_attn: false |
|
cross_attention_type: multihead |
|
cross_attention_params: |
|
n_head: 4 |
|
dropout_rate: 0.1 |
|
pooling_type: mean |
|
projector_type: linear |
|
multi_branch: true |
|
required: |
|
- output_dir |
|
- metric2id |
|
version: '202412' |
|
distributed: false |
|
``` |
|
|
|
</details> |
|
|
|
|
|
|
|
### Citing ESPnet |
|
|
|
```bibtex |
|
@inproceedings{watanabe2018espnet, |
|
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
|
title={{ESPnet}: End-to-End Speech Processing Toolkit}, |
|
year={2018}, |
|
booktitle={Proceedings of Interspeech}, |
|
pages={2207--2211}, |
|
doi={10.21437/Interspeech.2018-1456}, |
|
url={http://dx.doi.org/10.21437/Interspeech.2018-1456} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
``` |
|
|
|
or arXiv: |
|
|
|
```bibtex |
|
@misc{watanabe2018espnet, |
|
title={ESPnet: End-to-End Speech Processing Toolkit}, |
|
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
|
year={2018}, |
|
eprint={1804.00015}, |
|
archivePrefix={arXiv}, |
|
primaryClass={cs.CL} |
|
} |
|
``` |
|
|