arubenruben
committed on
Commit
•
b06c1b9
1
Parent(s):
f356772
Update deploy_pipeline.py
Browse files- deploy_pipeline.py +10 -12
deploy_pipeline.py
CHANGED
@@ -13,15 +13,14 @@ class TokenizeAndAlignLabelsStep():
|
|
13 |
|
14 |
# Adapted From : https://huggingface.co/docs/transformers/tasks/token_classification
|
15 |
def tokenize_and_align_labels(self, examples, tokenizer):
|
16 |
-
|
17 |
-
tokenized_inputs = tokenizer(examples, padding='max_length', truncation=True, max_length=128)
|
18 |
|
19 |
# Map tokens to their respective word.
|
20 |
word_ids = tokenized_inputs.word_ids()
|
21 |
|
22 |
previous_word_idx = None
|
23 |
-
|
24 |
-
|
25 |
labels_mask = []
|
26 |
|
27 |
for word_idx in word_ids: # Set the special tokens to -100.
|
@@ -35,28 +34,25 @@ class TokenizeAndAlignLabelsStep():
|
|
35 |
|
36 |
previous_word_idx = word_idx
|
37 |
|
38 |
-
tokenized_inputs["tokens"] = tokenizer.decode(tokenized_inputs["input_ids"], skip_special_tokens=True)
|
39 |
tokenized_inputs["labels_mask"] = labels_mask
|
40 |
|
41 |
return tokenized_inputs
|
42 |
|
43 |
|
44 |
|
45 |
-
|
46 |
class BERT_CRF_Pipeline(Pipeline):
|
47 |
|
48 |
def _sanitize_parameters(self, **kwargs):
|
49 |
return {}, {}, {}
|
50 |
|
51 |
-
def preprocess(self,
|
|
|
52 |
|
53 |
tokenizer = AutoTokenizer.from_pretrained(
|
54 |
"neuralmind/bert-base-portuguese-cased", do_lower_case=False)
|
|
|
|
|
55 |
|
56 |
-
TokenizeAndAlignLabelsStep().tokenize_and_align_labels(
|
57 |
-
examples=text, tokenizer=tokenizer)
|
58 |
-
|
59 |
-
return TokenizeAndAlignLabelsStep().tokenize_and_align_labels(examples=text, tokenizer=tokenizer)
|
60 |
|
61 |
def _forward(self, tokenizer_results):
|
62 |
|
@@ -79,13 +75,15 @@ class BERT_CRF_Pipeline(Pipeline):
|
|
79 |
return outputs
|
80 |
|
81 |
def postprocess(self, model_outputs):
|
|
|
82 |
# From Ner_tags to Ner_labels
|
83 |
for i, label in enumerate(model_outputs[0]):
|
84 |
model_outputs[0][i] = self.model.config.id2label[label]
|
85 |
-
|
86 |
return model_outputs[0]
|
87 |
|
88 |
|
|
|
89 |
def main():
|
90 |
|
91 |
PIPELINE_REGISTRY.register_pipeline("PT-BERT-Large-CRF-Conll2003-pipeline",
|
|
|
13 |
|
14 |
# Adapted From : https://huggingface.co/docs/transformers/tasks/token_classification
|
15 |
def tokenize_and_align_labels(self, examples, tokenizer):
|
16 |
+
|
17 |
+
tokenized_inputs = tokenizer(examples, padding='max_length', truncation=True, max_length=128, is_split_into_words=True)
|
18 |
|
19 |
# Map tokens to their respective word.
|
20 |
word_ids = tokenized_inputs.word_ids()
|
21 |
|
22 |
previous_word_idx = None
|
23 |
+
|
|
|
24 |
labels_mask = []
|
25 |
|
26 |
for word_idx in word_ids: # Set the special tokens to -100.
|
|
|
34 |
|
35 |
previous_word_idx = word_idx
|
36 |
|
|
|
37 |
tokenized_inputs["labels_mask"] = labels_mask
|
38 |
|
39 |
return tokenized_inputs
|
40 |
|
41 |
|
42 |
|
|
|
43 |
class BERT_CRF_Pipeline(Pipeline):
|
44 |
|
45 |
def _sanitize_parameters(self, **kwargs):
|
46 |
return {}, {}, {}
|
47 |
|
48 |
+
def preprocess(self, inputs):
|
49 |
+
tokens = inputs['tokens']
|
50 |
|
51 |
tokenizer = AutoTokenizer.from_pretrained(
|
52 |
"neuralmind/bert-base-portuguese-cased", do_lower_case=False)
|
53 |
+
|
54 |
+
return TokenizeAndAlignLabelsStep().tokenize_and_align_labels(examples=tokens, tokenizer=tokenizer)
|
55 |
|
|
|
|
|
|
|
|
|
56 |
|
57 |
def _forward(self, tokenizer_results):
|
58 |
|
|
|
75 |
return outputs
|
76 |
|
77 |
def postprocess(self, model_outputs):
|
78 |
+
|
79 |
# From Ner_tags to Ner_labels
|
80 |
for i, label in enumerate(model_outputs[0]):
|
81 |
model_outputs[0][i] = self.model.config.id2label[label]
|
82 |
+
|
83 |
return model_outputs[0]
|
84 |
|
85 |
|
86 |
+
|
87 |
def main():
|
88 |
|
89 |
PIPELINE_REGISTRY.register_pipeline("PT-BERT-Large-CRF-Conll2003-pipeline",
|