|
---
license: mit
language:
- ru
---
|
|
|
RUPunct_big — самая большая модель из семейства RUPunct. Подходит для большинства задач.
|
|
|
Код инференса: |
|
```py
|
from transformers import pipeline |
|
from transformers import AutoTokenizer |
|
|
|
# Checkpoint identifier; both the tokenizer and the model are loaded from it.
pt = "RUPunct/RUPunct_big"

# Tokenizer options are spelled out explicitly: accent stripping is disabled
# and a prefix space is requested.
tk = AutoTokenizer.from_pretrained(
    pt,
    strip_accents=False,
    add_prefix_space=True,
)

# Token-classification ("ner") pipeline built on the checkpoint above.
# NOTE(review): aggregation_strategy="first" is expected to yield one
# prediction per word (keys "word" / "entity_group") -- confirm against the
# installed transformers version.
classifier = pipeline(
    "ner",
    model=pt,
    tokenizer=tk,
    aggregation_strategy="first",
)
|
|
|
|
|
# Punctuation appended after a word, keyed by the last component of the label.
# "TIRE" (dash) carries a leading space so the dash is separated from the word
# on both sides once the caller joins words with spaces.
_PUNCTUATION_BY_SUFFIX = {
    "O": "",
    "PERIOD": ".",
    "COMMA": ",",
    "QUESTION": "?",
    "TIRE": " —",
    "DVOETOCHIE": ":",
    "VOSKL": "!",
    "PERIODCOMMA": ";",
    "DEFIS": "-",
    "MNOGOTOCHIE": "...",
    "QUESTIONVOSKL": "?!",
}

# Case transformation applied to the word, keyed by the label prefix.
_CASE_BY_PREFIX = {
    "LOWER": lambda text: text,   # token is passed through untouched
    "UPPER": str.capitalize,      # first letter upper-cased, the rest lower-cased
    "UPPER_TOTAL": str.upper,     # whole word upper-cased
}


def process_token(token: str, label: str) -> str:
    """Apply the casing and punctuation encoded in *label* to *token*.

    A label has the form ``<CASE>_<PUNCT>`` where ``<CASE>`` is ``LOWER``,
    ``UPPER`` or ``UPPER_TOTAL`` and ``<PUNCT>`` is one of the keys of
    ``_PUNCTUATION_BY_SUFFIX`` (``O`` means "no punctuation").

    Args:
        token: The word to restore.
        label: The predicted label for that word.

    Returns:
        The word with the case transformation applied and the punctuation
        mark appended. If the label is not recognised, *token* is returned
        unchanged so callers can always concatenate the result.
    """
    if not isinstance(label, str):
        return token

    # Punctuation names contain no underscore, so the last "_" separates the
    # case prefix ("UPPER_TOTAL" included) from the punctuation name.
    case_name, _, punct_name = label.rpartition("_")
    transform = _CASE_BY_PREFIX.get(case_name)
    punctuation = _PUNCTUATION_BY_SUFFIX.get(punct_name)
    if transform is None or punctuation is None:
        return token

    return transform(token) + punctuation
|
|
|
# Interactive loop: read a line, run the classifier on it, rebuild the text
# with restored case and punctuation, and print the result.
while True:
    try:
        input_text = input(":> ")
    except (EOFError, KeyboardInterrupt):
        # Leave the loop quietly on Ctrl-D / Ctrl-C instead of a traceback.
        print()
        break

    preds = classifier(input_text)

    pieces = []
    glue_next = False  # True right after a word that received a hyphen
    for item in preds:
        label = item["entity_group"]
        word = process_token(item["word"].strip(), label)
        # Words are separated by one space, except that a hyphenated word is
        # joined directly to the following one ("кто-то", not "кто- то").
        if pieces and not glue_next:
            pieces.append(" ")
        pieces.append(word)
        glue_next = label.endswith("_DEFIS")

    print(">>>", "".join(pieces))
|
```