preposições (prepositions)

Files changed:
- app.py (+81, -5)
- models/lusa_prepo/config.json (+54, -0)
- models/lusa_prepo/model.safetensors (+3, -0)
- models/lusa_prepo/special_tokens_map.json (+7, -0)
- models/lusa_prepo/tokenizer.json (+0, -0)
- models/lusa_prepo/tokenizer_config.json (+57, -0)
- models/lusa_prepo/training_args.bin (+3, -0)
- models/lusa_prepo/vocab.txt (+0, -0)
- requirements.txt (+3, -1)
app.py CHANGED

@@ -4,18 +4,84 @@ from annotated_text import annotated_text
 import torch
 from transformers import pipeline
 from transformers import AutoModelForTokenClassification, AutoTokenizer
-
+import spacy
 import json
 
 st.set_page_config(layout="wide")
 
-model = AutoModelForTokenClassification.from_pretrained("./models/
+model = AutoModelForTokenClassification.from_pretrained("./models/lusa_prepo", use_safetensors=True)
 
-tokenizer = AutoTokenizer.from_pretrained("./models/
+tokenizer = AutoTokenizer.from_pretrained("./models/lusa_prepo", model_max_length=512)
 tagger = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy='first') #aggregation_strategy='max'
 
 
 
+from spacy.matcher import PhraseMatcher
+nlp = spacy.load("en_core_web_sm")
+
+
+tokenization_contractions = {
+    "no": ["n", "o"],
+    "na": ["n", "a"],
+    "nos": ["n", "os"],
+    "nas": ["n", "as"],
+    "ao": ["a", "o"],
+    # "à": ["a", "a"],
+    "aos": ["a", "os"],
+    # "às": ["a", "as"],
+    "do": ["d", "o"],
+    "da": ["d", "a"],
+    "dos": ["d", "os"],
+    "das": ["d", "as"],
+    "pelo": ["pel", "o"],
+    "pela": ["pel", "a"],
+    "pelos": ["pel", "os"],
+    "pelas": ["pel", "as"],
+    "dum": ["d", "um"],
+    "duma": ["d", "uma"],
+    "duns": ["d", "uns"],
+    "dumas": ["d", "umas"],
+    "num": ["n", "um"],
+    "numa": ["n", "uma"],
+    "nuns": ["n", "uns"],
+    "numas": ["n", "umas"],
+    "dele": ["d", "ele"],
+    "dela": ["d", "ela"],
+    "deles": ["d", "eles"],
+    "delas": ["d", "elas"],
+    "deste": ["d", "este"],
+    "desta": ["d", "esta"],
+    "destes": ["d", "estes"],
+    "destas": ["d", "estas"],
+    "desse": ["d", "esse"],
+    "dessa": ["d", "essa"],
+    "desses": ["d", "esses"],
+    "dessas": ["d", "essas"],
+    "daquele": ["d", "aquele"],
+    "daquela": ["d", "aquela"],
+    "daqueles": ["d", "aqueles"],
+    "daquelas": ["d", "aquelas"],
+}
+
+
+def tokenize_contractions(doc, tokenization_contractions):
+    words = tokenization_contractions.keys()  # Example: words to be split
+    splits = tokenization_contractions
+    matcher = PhraseMatcher(nlp.vocab)
+    patterns = [nlp.make_doc(text) for text in words]
+    matcher.add("Terminology", None, *patterns)
+    matches = matcher(doc)
+
+    with doc.retokenize() as retokenizer:
+        for match_id, start, end in matches:
+            heads = [(doc[start],1), doc[start]]
+            attrs = {"POS": ["ADP", "DET"], "DEP": ["pobj", "compound"]}
+            orths = splits[doc[start:end].text]
+            retokenizer.split(doc[start], orths=orths, heads=heads, attrs=attrs)
+    return doc
+
+
+
 def aggregate_subwords(input_tokens, labels):
     new_inputs = []
     new_labels = []
@@ -40,7 +106,10 @@ def aggregate_subwords(input_tokens, labels):
 
 def annotateTriggers(line):
     line = line.strip()
-
+    doc = nlp(line)
+    doc = tokenize_contractions(doc, tokenization_contractions)
+    tokens = [token.text for token in doc]
+    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt")
     input_tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
 
     with torch.no_grad():
@@ -49,6 +118,13 @@
     predictions = torch.argmax(logits, dim=2)
     predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
     input_tokens, predicted_token_class = aggregate_subwords(input_tokens,predicted_token_class)
+
+
+    input_tokens = input_tokens[1:-1]
+    predicted_token_class = predicted_token_class[1:-1]
+    print(input_tokens)
+    print(predicted_token_class)
+    print(len(input_tokens), len(predicted_token_class))
     token_labels = []
     current_entity = ''
     for i, label in enumerate(predicted_token_class):
@@ -66,7 +142,7 @@
             token_labels[-1] = (token_labels[-1][0] + f" {token}", 'I', current_entity)
         else:
             raise ValueError(f"Invalid label: {label}")
-    return token_labels
+    return token_labels
 
 
 
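The core of this change is the contraction handling: before tagging, annotateTriggers now runs the input line through spaCy, splits Portuguese contractions such as "no" and "da" into preposition + article with the retokenizer, and passes the resulting word list to the tokenizer with is_split_into_words=True. Below is a minimal, self-contained sketch of just that splitting step. It is not part of the commit: it uses a blank English pipeline as a stand-in for en_core_web_sm (only tokenization matters here), the spaCy v3 matcher.add(key, patterns) signature, and an example sentence chosen for illustration.

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")  # stand-in for en_core_web_sm; avoids the model download
splits = {"no": ["n", "o"], "da": ["d", "a"]}  # small subset of tokenization_contractions

matcher = PhraseMatcher(nlp.vocab)
matcher.add("Contractions", [nlp.make_doc(w) for w in splits])

doc = nlp("O incêndio deflagrou no armazém da empresa")
with doc.retokenize() as retokenizer:
    for _, start, end in matcher(doc):
        token = doc[start]
        # split e.g. "no" -> "n" + "o" so tokens line up with the annotation scheme
        retokenizer.split(token, orths=splits[token.text], heads=[(token, 1), token])

print([t.text for t in doc])
# expected: ['O', 'incêndio', 'deflagrou', 'n', 'o', 'armazém', 'd', 'a', 'empresa']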
models/lusa_prepo/config.json ADDED

@@ -0,0 +1,54 @@
+{
+  "_name_or_path": "neuralmind/bert-base-portuguese-cased",
+  "architectures": [
+    "BertForTokenClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "I-Time",
+    "1": "B-Event",
+    "2": "B-Spatial_Relation",
+    "3": "I-Event",
+    "4": "I-Participant",
+    "5": "I-Spatial_Relation",
+    "6": "B-Participant",
+    "7": "B-Time",
+    "8": "O"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "B-Event": 1,
+    "B-Participant": 6,
+    "B-Spatial_Relation": 2,
+    "B-Time": 7,
+    "I-Event": 3,
+    "I-Participant": 4,
+    "I-Spatial_Relation": 5,
+    "I-Time": 0,
+    "O": 8
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 29794
+}
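The id2label block above defines the BIO tag set that app.py's model.config.id2label[...] lookup resolves to: four span types (Event, Participant, Spatial_Relation, Time) plus O. A minimal sketch of that lookup, assuming the app is run from the repo root so "./models/lusa_prepo" resolves:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("./models/lusa_prepo")
print(config.num_labels)                       # 9
print(config.id2label[6], config.id2label[8])  # B-Participant O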
models/lusa_prepo/model.safetensors ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d0c35670da238b01532422ca363069823ab20899081d20eb1a47b587b901c6e
+size 433381196
models/lusa_prepo/special_tokens_map.json ADDED

@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
models/lusa_prepo/tokenizer.json ADDED

The diff for this file is too large to render.
models/lusa_prepo/tokenizer_config.json ADDED

@@ -0,0 +1,57 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": false,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
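One detail worth noting: the model_max_length above is transformers' "unset" sentinel rather than a real limit, which is presumably why app.py passes model_max_length=512 (matching the model's max_position_embeddings) when loading the tokenizer. A quick check, assuming the model files are present locally:

from transformers import AutoTokenizer

tok_default = AutoTokenizer.from_pretrained("./models/lusa_prepo")
tok_capped = AutoTokenizer.from_pretrained("./models/lusa_prepo", model_max_length=512)
print(tok_default.model_max_length)  # the huge sentinel stored in tokenizer_config.json
print(tok_capped.model_max_length)   # 512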
models/lusa_prepo/training_args.bin ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:23db42c77af1fa375fde0979687c77ed40944a0934720cbb872e7c08194902eb
+size 5112
models/lusa_prepo/vocab.txt ADDED

The diff for this file is too large to render.
requirements.txt CHANGED

@@ -5,4 +5,6 @@ tqdm==4.64.0
 numpy==1.22.3
 pysbd==0.3.4
 altair==4.2.2
-streamlit==1.19.0
+streamlit==1.19.0
+spacy==3.4.4
+en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl
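The last two pins are new: spacy==3.4.4 plus the en_core_web_sm wheel installed straight from its release URL, which is what lets spacy.load("en_core_web_sm") in app.py succeed inside the Space. A small sanity check, assuming the requirements above are installed (spaCy may warn that the 3.2.0 model was built for an older minor version):

import spacy

nlp = spacy.load("en_core_web_sm")
print(spacy.__version__, nlp.meta["lang"], nlp.meta["name"], nlp.meta["version"])
# e.g. 3.4.4 en core_web_sm 3.2.0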