preposições (prepositions)

Files changed:
- app.py (+81, -5)
- models/lusa_prepo/config.json (+54, -0)
- models/lusa_prepo/model.safetensors (+3, -0)
- models/lusa_prepo/special_tokens_map.json (+7, -0)
- models/lusa_prepo/tokenizer.json (+0, -0)
- models/lusa_prepo/tokenizer_config.json (+57, -0)
- models/lusa_prepo/training_args.bin (+3, -0)
- models/lusa_prepo/vocab.txt (+0, -0)
- requirements.txt (+3, -1)
app.py CHANGED

@@ -4,18 +4,84 @@ from annotated_text import annotated_text
 import torch
 from transformers import pipeline
 from transformers import AutoModelForTokenClassification, AutoTokenizer
-
+import spacy
 import json
 
 st.set_page_config(layout="wide")
 
-model = AutoModelForTokenClassification.from_pretrained("./models/
+model = AutoModelForTokenClassification.from_pretrained("./models/lusa_prepo", use_safetensors=True)
 
-tokenizer = AutoTokenizer.from_pretrained("./models/
+tokenizer = AutoTokenizer.from_pretrained("./models/lusa_prepo", model_max_length=512)
 tagger = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy='first') #aggregation_strategy='max'
 
 
 
+from spacy.matcher import PhraseMatcher
+nlp = spacy.load("en_core_web_sm")
+
+
+tokenization_contractions = {
+    "no": ["n", "o"],
+    "na": ["n", "a"],
+    "nos": ["n", "os"],
+    "nas": ["n", "as"],
+    "ao": ["a", "o"],
+    # "à": ["a", "a"],
+    "aos": ["a", "os"],
+    # "às": ["a", "as"],
+    "do": ["d", "o"],
+    "da": ["d", "a"],
+    "dos": ["d", "os"],
+    "das": ["d", "as"],
+    "pelo": ["pel", "o"],
+    "pela": ["pel", "a"],
+    "pelos": ["pel", "os"],
+    "pelas": ["pel", "as"],
+    "dum": ["d", "um"],
+    "duma": ["d", "uma"],
+    "duns": ["d", "uns"],
+    "dumas": ["d", "umas"],
+    "num": ["n", "um"],
+    "numa": ["n", "uma"],
+    "nuns": ["n", "uns"],
+    "numas": ["n", "umas"],
+    "dele": ["d", "ele"],
+    "dela": ["d", "ela"],
+    "deles": ["d", "eles"],
+    "delas": ["d", "elas"],
+    "deste": ["d", "este"],
+    "desta": ["d", "esta"],
+    "destes": ["d", "estes"],
+    "destas": ["d", "estas"],
+    "desse": ["d", "esse"],
+    "dessa": ["d", "essa"],
+    "desses": ["d", "esses"],
+    "dessas": ["d", "essas"],
+    "daquele": ["d", "aquele"],
+    "daquela": ["d", "aquela"],
+    "daqueles": ["d", "aqueles"],
+    "daquelas": ["d", "aquelas"],
+}
+
+
+def tokenize_contractions(doc, tokenization_contractions):
+    words = tokenization_contractions.keys()  # Example: words to be split
+    splits = tokenization_contractions
+    matcher = PhraseMatcher(nlp.vocab)
+    patterns = [nlp.make_doc(text) for text in words]
+    matcher.add("Terminology", None, *patterns)
+    matches = matcher(doc)
+
+    with doc.retokenize() as retokenizer:
+        for match_id, start, end in matches:
+            heads = [(doc[start],1), doc[start]]
+            attrs = {"POS": ["ADP", "DET"], "DEP": ["pobj", "compound"]}
+            orths = splits[doc[start:end].text]
+            retokenizer.split(doc[start], orths=orths, heads=heads, attrs=attrs)
+    return doc
+
+
+
 def aggregate_subwords(input_tokens, labels):
     new_inputs = []
     new_labels = []
@@ -40,7 +106,10 @@ def aggregate_subwords(input_tokens, labels):
 
 def annotateTriggers(line):
     line = line.strip()
-
+    doc = nlp(line)
+    doc = tokenize_contractions(doc, tokenization_contractions)
+    tokens = [token.text for token in doc]
+    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt")
     input_tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
 
     with torch.no_grad():
@@ -49,6 +118,13 @@
     predictions = torch.argmax(logits, dim=2)
     predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
     input_tokens, predicted_token_class = aggregate_subwords(input_tokens,predicted_token_class)
+
+
+    input_tokens = input_tokens[1:-1]
+    predicted_token_class = predicted_token_class[1:-1]
+    print(input_tokens)
+    print(predicted_token_class)
+    print(len(input_tokens), len(predicted_token_class))
     token_labels = []
     current_entity = ''
     for i, label in enumerate(predicted_token_class):
@@ -66,7 +142,7 @@
             token_labels[-1] = (token_labels[-1][0] + f" {token}", 'I', current_entity)
         else:
             raise ValueError(f"Invalid label: {label}")
-    return token_labels
+    return token_labels
 
 
 
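The core of this change is the contraction handling: before tagging, annotateTriggers now runs the input line through spaCy, splits Portuguese contractions such as "no" and "da" into preposition + article with the retokenizer, and passes the resulting word list to the tokenizer with is_split_into_words=True. Below is a minimal, self-contained sketch of just that splitting step. It is not part of the commit: it uses a blank English pipeline as a stand-in for en_core_web_sm (only tokenization matters here), the spaCy v3 matcher.add(key, patterns) signature, and an example sentence chosen for illustration.

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")  # stand-in for en_core_web_sm; avoids the model download
splits = {"no": ["n", "o"], "da": ["d", "a"]}  # small subset of tokenization_contractions

matcher = PhraseMatcher(nlp.vocab)
matcher.add("Contractions", [nlp.make_doc(w) for w in splits])

doc = nlp("O incêndio deflagrou no armazém da empresa")
with doc.retokenize() as retokenizer:
    for _, start, end in matcher(doc):
        token = doc[start]
        # split e.g. "no" -> "n" + "o" so tokens line up with the annotation scheme
        retokenizer.split(token, orths=splits[token.text], heads=[(token, 1), token])

print([t.text for t in doc])
# expected: ['O', 'incêndio', 'deflagrou', 'n', 'o', 'armazém', 'd', 'a', 'empresa']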
models/lusa_prepo/config.json ADDED

@@ -0,0 +1,54 @@
+{
+  "_name_or_path": "neuralmind/bert-base-portuguese-cased",
+  "architectures": [
+    "BertForTokenClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "I-Time",
+    "1": "B-Event",
+    "2": "B-Spatial_Relation",
+    "3": "I-Event",
+    "4": "I-Participant",
+    "5": "I-Spatial_Relation",
+    "6": "B-Participant",
+    "7": "B-Time",
+    "8": "O"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "B-Event": 1,
+    "B-Participant": 6,
+    "B-Spatial_Relation": 2,
+    "B-Time": 7,
+    "I-Event": 3,
+    "I-Participant": 4,
+    "I-Spatial_Relation": 5,
+    "I-Time": 0,
+    "O": 8
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 29794
+}
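The id2label block above defines the BIO tag set that app.py's model.config.id2label[...] lookup resolves to: four span types (Event, Participant, Spatial_Relation, Time) plus O. A minimal sketch of that lookup, assuming the app is run from the repo root so "./models/lusa_prepo" resolves:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("./models/lusa_prepo")
print(config.num_labels)                       # 9
print(config.id2label[6], config.id2label[8])  # B-Participant O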
models/lusa_prepo/model.safetensors ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d0c35670da238b01532422ca363069823ab20899081d20eb1a47b587b901c6e
+size 433381196
models/lusa_prepo/special_tokens_map.json ADDED

@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
models/lusa_prepo/tokenizer.json ADDED

The diff for this file is too large to render.
models/lusa_prepo/tokenizer_config.json ADDED

@@ -0,0 +1,57 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": false,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
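One detail worth noting: the model_max_length above is transformers' "unset" sentinel rather than a real limit, which is presumably why app.py passes model_max_length=512 (matching the model's max_position_embeddings) when loading the tokenizer. A quick check, assuming the model files are present locally:

from transformers import AutoTokenizer

tok_default = AutoTokenizer.from_pretrained("./models/lusa_prepo")
tok_capped = AutoTokenizer.from_pretrained("./models/lusa_prepo", model_max_length=512)
print(tok_default.model_max_length)  # the huge sentinel stored in tokenizer_config.json
print(tok_capped.model_max_length)   # 512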
models/lusa_prepo/training_args.bin ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:23db42c77af1fa375fde0979687c77ed40944a0934720cbb872e7c08194902eb
+size 5112
models/lusa_prepo/vocab.txt ADDED

The diff for this file is too large to render.
requirements.txt CHANGED

@@ -5,4 +5,6 @@ tqdm==4.64.0
 numpy==1.22.3
 pysbd==0.3.4
 altair==4.2.2
-streamlit==1.19.0
+streamlit==1.19.0
+spacy==3.4.4
+en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl
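The last two pins are new: spacy==3.4.4 plus the en_core_web_sm wheel installed straight from its release URL, which is what lets spacy.load("en_core_web_sm") in app.py succeed inside the Space. A small sanity check, assuming the requirements above are installed (spaCy may warn that the 3.2.0 model was built for an older minor version):

import spacy

nlp = spacy.load("en_core_web_sm")
print(spacy.__version__, nlp.meta["lang"], nlp.meta["name"], nlp.meta["version"])
# e.g. 3.4.4 en core_web_sm 3.2.0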