|
--- |
|
license: apache-2.0 |
|
language: |
|
- sl |
|
- hr |
|
- sr |
|
- mk |
|
- cs |
|
- bs |
|
- bg |
|
- pl |
|
- ru |
|
- uk |
|
- sk |
|
- sq |
|
pipeline_tag: token-classification |
|
model-index: |
|
- name: xlmr-ner-slavic |
|
results: |
|
- task: |
|
type: token-classification |
|
metrics: |
|
- name: Accuracy |
|
type: Accuracy |
|
value: 98.346 |
|
- name: F1-score |
|
type: F1-score |
|
value: 93.158 |
|
- name: Precision |
|
type: Precision |
|
value: 92.700 |
|
- name: Recall |
|
type: Recall |
|
value: 93.622 |
|
- name: LOC Precision |
|
type: LOC Precision |
|
value: 94.105 |
|
- name: LOC Recall |
|
type: LOC Recall |
|
value: 95.513 |
|
- name: LOC F1-score |
|
type: LOC F1-score |
|
value: 94.804 |
|
- name: MISC Precision |
|
type: MISC Precision |
|
value: 85.196 |
|
- name: MISC Recall |
|
type: MISC Recall |
|
value: 85.545 |
|
- name: MISC F1-score |
|
type: MISC F1-score |
|
value: 85.370 |
|
- name: ORG Precision |
|
type: ORG Precision |
|
value: 91.226 |
|
- name: ORG Recall |
|
type: ORG Recall |
|
value: 91.519 |
|
- name: ORG F1-score |
|
type: ORG F1-score |
|
value: 91.372 |
|
- name: PER Precision |
|
type: PER Precision |
|
value: 94.995 |
|
- name: PER Recall |
|
type: PER Recall |
|
value: 96.191 |
|
- name: PER F1-score |
|
type: PER F1-score |
|
value: 95.589 |
|
--- |
|
## XLM-RoBERTa-base NER model for Slavic languages |
|
|
|
The train / eval / test splits were concatenated from all languages, in the order specified on the command line: |
|
`sl, hr, sr, bs, mk, sq, cs, bg, pl, ru, sk, uk` |
|
|
|
We used the following hyper-parameters: |
|
|
|
* maximum sequence length of 256 for the tokenizer |
|
* PyTorch's AdamW optimizer with a learning rate of 2e-5 |
|
* batch size of 20 |
|
* 40 epochs (preliminary runs showed best F1-scores between epochs 15 and 35) |
|
* F1-score as the metric for selecting the best model and tracking training progress. |
|
|
|
<!--- |
|
``` |
|
{ |
|
"xlmrb-sl_hr_sr_bs_mk_sq_cs_bg_pl_ru_sk_uk": { |
|
"LOC": { |
|
"precision": 0.9410536270144608, |
|
"recall": 0.955128974205159, |
|
"f1": 0.9480390600190536, |
|
"number": 25005 |
|
}, |
|
"MISC": { |
|
"precision": 0.8519650655021834, |
|
"recall": 0.8554516223326513, |
|
"f1": 0.8537047841306884, |
|
"number": 6842 |
|
}, |
|
"ORG": { |
|
"precision": 0.9122568093385214, |
|
"recall": 0.915194691129111, |
|
"f1": 0.9137233887075559, |
|
"number": 20494 |
|
}, |
|
"PER": { |
|
"precision": 0.9499552728357022, |
|
"recall": 0.9619061996779388, |
|
"f1": 0.955893384007601, |
|
"number": 19872 |
|
}, |
|
"overall_precision": 0.9269994926711549, |
|
"overall_recall": 0.9362164707185687, |
|
"overall_f1": 0.931585184368627, |
|
"overall_accuracy": 0.9834613206674987 |
|
} |
|
} |
|
``` |
|
--> |
|
Based on |
|
[Analysis of Transfer Learning for Named Entity Recognition in South-Slavic Languages](https://aclanthology.org/2023.bsnlp-1.13) (Ivačič et al., BSNLP 2023) |
|
|
|
## Used NER Corpora |
|
|
|
We used the following NER corpora: |
|
|
|
- [Training corpus SUK 1.0](https://www.clarin.si/repository/xmlui/handle/11356/1747) |
|
|
|
``` |
|
@misc{11356/1747, |
|
title = {Training corpus {SUK} 1.0}, |
|
author = {Arhar Holdt, {\v S}pela and Krek, Simon and Dobrovoljc, Kaja and Erjavec, Toma{\v z} and Gantar, Polona and {\v C}ibej, Jaka and Pori, Eva and Ter{\v c}on, Luka and Munda, Tina and {\v Z}itnik, Slavko and Robida, Nejc and Blagus, Neli and Mo{\v z}e, Sara and Ledinek, Nina and Holz, Nanika and Zupan, Katja and Kuzman, Taja and Kav{\v c}i{\v c}, Teja and {\v S}krjanec, Iza and Marko, Dafne and Jezer{\v s}ek, Lucija and Zajc, Anja}, |
|
url = {http://hdl.handle.net/11356/1747}, |
|
note = {Slovenian language resource repository {CLARIN}.{SI}}, |
|
copyright = {Creative Commons - Attribution-{NonCommercial}-{ShareAlike} 4.0 International ({CC} {BY}-{NC}-{SA} 4.0)}, |
|
issn = {2820-4042}, |
|
year = {2022} |
|
} |
|
``` |
|
- [BSNLP: 3rd Shared Task on SlavNER](http://bsnlp.cs.helsinki.fi/shared-task.html) |
|
|
|
We merged the 2017 and 2021 training data with the 2021 test data and created custom train / dev / test splits. |
|
|
|
We also mapped the EVT (event) and PRO (product) tags to MISC to align this corpus with the other corpora. |
|
|
|
You can change the mappings by running a custom corpus-preparation step (see above). |
|
|
|
- [Training corpus hr500k 1.0](https://www.clarin.si/repository/xmlui/handle/11356/1183) |
|
|
|
``` |
|
@misc{11356/1183, |
|
title = {Training corpus hr500k 1.0}, |
|
author = {Ljube{\v s}i{\'c}, Nikola and Agi{\'c}, {\v Z}eljko and Klubi{\v c}ka, Filip and Batanovi{\'c}, Vuk and Erjavec, Toma{\v z}}, |
|
url = {http://hdl.handle.net/11356/1183}, |
|
note = {Slovenian language resource repository {CLARIN}.{SI}}, |
|
copyright = {Creative Commons - Attribution-{ShareAlike} 4.0 International ({CC} {BY}-{SA} 4.0)}, |
|
issn = {2820-4042}, |
|
year = {2018} |
|
} |
|
``` |
|
- [Training corpus SETimes.SR 1.0](https://www.clarin.si/repository/xmlui/handle/11356/1200) |
|
|
|
``` |
|
@misc{11356/1200, |
|
title = {Training corpus {SETimes}.{SR} 1.0}, |
|
author = {Batanovi{\'c}, Vuk and Ljube{\v s}i{\'c}, Nikola and Samard{\v z}i{\'c}, Tanja and Erjavec, Toma{\v z}}, |
|
url = {http://hdl.handle.net/11356/1200}, |
|
note = {Slovenian language resource repository {CLARIN}.{SI}}, |
|
copyright = {Creative Commons - Attribution-{ShareAlike} 4.0 International ({CC} {BY}-{SA} 4.0)}, |
|
issn = {2820-4042}, |
|
year = {2018} |
|
} |
|
``` |
|
|
|
- [Massively Multilingual Transfer for NER.](https://github.com/afshinrahimi/mmner), nicknamed WikiAnn |
|
``` |
|
@inproceedings{rahimi-etal-2019-massively, |
|
title = "Massively Multilingual Transfer for {NER}", |
|
author = "Rahimi, Afshin and |
|
Li, Yuan and |
|
Cohn, Trevor", |
|
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics", |
|
month = jul, |
|
year = "2019", |
|
address = "Florence, Italy", |
|
publisher = "Association for Computational Linguistics", |
|
url = "https://www.aclweb.org/anthology/P19-1015", |
|
pages = "151--164", |
|
} |
|
``` |
|
|
|
- [Neural Networks for Featureless Named Entity Recognition in Czech.](https://github.com/strakova/ner_tsd2016) |
|
|
|
``` |
|
@Inbook{Strakova2016, |
|
author="Strakov{\'a}, Jana and Straka, Milan and Haji{\v{c}}, Jan", |
|
editor="Sojka, Petr and Hor{\'a}k, Ale{\v{s}} and Kope{\v{c}}ek, Ivan and Pala, Karel", |
|
title="Neural Networks for Featureless Named Entity Recognition in Czech", |
|
bookTitle="Text, Speech, and Dialogue: 19th International Conference, TSD 2016, Brno, Czech Republic, September 12-16, 2016, Proceedings", |
|
year="2016", |
|
publisher="Springer International Publishing", |
|
address="Cham", |
|
pages="173--181", |
|
isbn="978-3-319-45510-5", |
|
doi="10.1007/978-3-319-45510-5_20", |
|
url="http://dx.doi.org/10.1007/978-3-319-45510-5_20" |
|
} |
|
``` |
|
|
|
### NER Evaluation |
|
|
|
For evaluation, we use [seqeval](https://huggingface.co/spaces/evaluate-metric/seqeval) |
|
``` |
|
@misc{seqeval, |
|
title={{seqeval}: A Python framework for sequence labeling evaluation}, |
|
url={https://github.com/chakki-works/seqeval}, |
|
note={Software available from https://github.com/chakki-works/seqeval}, |
|
author={Hiroki Nakayama}, |
|
year={2018}, |
|
} |
|
``` |
|
|
|
seqeval is based on: |
|
``` |
|
@inproceedings{ramshaw-marcus-1995-text, |
|
title = "Text Chunking using Transformation-Based Learning", |
|
author = "Ramshaw, Lance and |
|
Marcus, Mitch", |
|
booktitle = "Third Workshop on Very Large Corpora", |
|
year = "1995", |
|
url = "https://www.aclweb.org/anthology/W95-0107", |
|
} |
|
``` |