import spacy
import numpy as np
from transformers import Pipeline
class SRLPipeline(Pipeline):
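    """Semantic role labeling pipeline for Portuguese: spaCy detects the verbs in
    the input text and the underlying token-classification model is run once per
    verb, with its predictions mapped back to the original tokens."""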
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
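        # spaCy is only used for tokenization and verb detection; prefer the GPU
        # when available and download the Portuguese model on first use.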
spacy.prefer_gpu()
if not spacy.util.is_package("pt_core_news_sm"):
spacy.cli.download("pt_core_news_sm")
self.nlp = spacy.load("pt_core_news_sm")
def align_labels_with_tokens(self, tokenized_inputs, all_labels):
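        """Align word-level labels with the tokenizer's subword tokens: the second
        segment (the verb) is dropped, the first subword of each word keeps that
        word's label, and special tokens and continuation subwords get -100."""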
results = []
for i, labels in enumerate(all_labels):
word_ids = tokenized_inputs.word_ids(batch_index=i)
            # Tokens with type_id != 0 belong to the second segment (the verb plus its
            # trailing special token); drop them so labels only cover the text tokens.
            type_ids = tokenized_inputs[i].type_ids
            num_second_segment_tokens = len(
                [type_id for type_id in type_ids if type_id != 0])
            if num_second_segment_tokens > 0:
                word_ids = word_ids[:-num_second_segment_tokens]
new_labels = []
current_word = None
for word_id in word_ids:
if word_id != current_word:
# Start of a new word!
current_word = word_id
label = -100 if word_id is None else labels[word_id]
new_labels.append(label)
elif word_id is None:
# Special token
new_labels.append(-100)
                else:
                    # Same word as the previous token: only its first subword keeps the
                    # word's label, so continuation subwords are ignored with -100.
                    # (The alternative kept here for reference reuses the word label and
                    # turns B-XXX into I-XXX:
                    #     label = labels[word_id]
                    #     if label % 2 == 1:
                    #         label += 1
                    # )
                    new_labels.append(-100)
results.append(new_labels)
tokenized_inputs['labels'] = results
return tokenized_inputs
def _sanitize_parameters(self, **kwargs):
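        """Route an optional `verb` keyword argument to preprocess; no extra
        parameters are needed for the forward or postprocess steps."""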
preprocess_kwargs = {}
if "verb" in kwargs:
preprocess_kwargs["verb"] = kwargs["verb"]
return preprocess_kwargs, {}, {}
    def preprocess(self, text, verb=None):
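        """Run spaCy over the text and build one tokenized model input per verb,
        pairing the pretokenized words with that verb as the second sequence."""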
self.text = text
doc = self.nlp(text.strip())
self.label_names = self.model.config.id2label
        # Use the verb passed by the caller, if any; otherwise extract every verb in the text
        self.verbs = [verb] if verb is not None else [
            token.text for token in doc if token.pos_ == "VERB"]
        results = []
        tokenized_input = [token.text for token in doc]
        # Keep the spaCy tokens and one label list per verb for postprocessing
        self.tokens = tokenized_input
        self.labels = []
        # Placeholder word-level labels; only their subword alignment is used later
        raw_labels = [0] * len(tokenized_input)
for verb in self.verbs:
tokenized_results = self.tokenizer(
tokenized_input, [verb], truncation=True,
is_split_into_words=True,
return_tensors="pt", max_length=self.model.config.max_position_embeddings)
tokenized_results = self.align_labels_with_tokens(
tokenized_inputs=tokenized_results, all_labels=[raw_labels])
            self.labels.append(tokenized_results["labels"][0])
# Remove labels temporarily to avoid conflicts in the forward pass
tokenized_results.pop("labels")
results.append(tokenized_results)
return results
def _forward(self, batch_inputs):
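        """Run the model separately on the input built for each verb."""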
results = []
for entry in batch_inputs:
results.append(self.model(**entry))
return results
def postprocess(self, batch_outputs):
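        """Turn the logits of each per-verb pass into label names, keeping only the
        positions aligned with the first subword of each original token."""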
outputs = []
for i, entry in enumerate(batch_outputs):
            logits = entry.logits.detach().cpu().numpy()
            predictions = np.argmax(logits, axis=-1).squeeze().tolist()
true_predictions = []
            for prediction, label in zip(predictions, self.labels[i]):
if label != -100:
true_predictions.append(self.label_names[prediction])
            outputs.append({
                "tokens": self.tokens,
                "predictions": true_predictions,
                "verb": self.verbs[i]
            })
return outputs
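# Minimal usage sketch: it assumes a fine-tuned Portuguese SRL token-classification
# checkpoint; the task name "srl" and the model path below are placeholders.
#
#   from transformers import AutoModelForTokenClassification, pipeline
#   from transformers.pipelines import PIPELINE_REGISTRY
#
#   PIPELINE_REGISTRY.register_pipeline(
#       "srl", pipeline_class=SRLPipeline, pt_model=AutoModelForTokenClassification)
#   srl = pipeline("srl", model="path/to/srl-checkpoint")
#   print(srl("A menina comeu o bolo ontem."))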