|
# Model description |
|
|
|
- Morphosyntactic analyzer: Stanza |
|
- Tagset: NKJP |
|
- Embedding vectors: Fasttext (wiki) |
|
- Dataset: NLPrePL-NKJP-fair-by-name (https://huggingface.co/datasets/ipipan/nlprepl) |
|
|
|
# How to use |
|
|
|
## Clone |
|
|
|
``` |
|
git clone git@hf.co:ipipan/nlpre_stanza_nkjp_fasttext_nkjp-by-name |
|
``` |
|
|
|
## Load model |
|
|
|
``` |
|
import os |
|
import stanza |
|
lang = 'pl' |
|
model_name = 'nlpre_stanza_nkjp_fasttext_nkjp-by-name' |
|
prefix = 'nkjpbyname_nkjp' |
|
config = \ |
|
{ |
|
# Comma-separated list of processors to use |
|
'processors': 'tokenize,mwt,pos,lemma', |
|
# Language code for the language to build the Pipeline in |
|
'lang': lang, |
|
# Processor-specific arguments are set with keys "{processor_name}_{argument_name}" |
|
# You only need model paths if you have a specific model outside of stanza_resources |
|
'tokenize_model_path': os.path.join(model_name, f'{lang}_{prefix}_tokenizer.pt'), |
|
'mwt_model_path': os.path.join(model_name, f'{lang}_{prefix}_mwt_expander.pt'), |
|
'pos_model_path': os.path.join(model_name, f'{lang}_{prefix}_tagger.pt'), |
|
'pos_pretrain_path': os.path.join(model_name, f'{lang}_{prefix}.pretrain.pt'), |
|
'lemma_model_path': os.path.join(model_name, f'{lang}_{prefix}_lemmatizer.pt'), |
|
# Use pretokenized text as input and disable tokenization |
|
'tokenize_pretokenized': True |
|
} |
|
model = stanza.Pipeline(**config) |
|
|