Upload span_lableing_operators.py with huggingface_hub
Browse files- span_lableing_operators.py +80 -0
span_lableing_operators.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any, Dict, List, Optional
|
2 |
+
|
3 |
+
from .operator import StreamInstanceOperator
|
4 |
+
|
5 |
+
|
6 |
+
class IobExtractor(StreamInstanceOperator):
    """Extract labeled entity spans from IOB-tagged token sequences.

    The operator reads per-token tags that follow the Inside-Outside-Beginning
    (IOB) convention, locates every token inside the instance's raw text, and
    merges each "begin" token with the "inside" tokens that directly follow it
    into a single character-offset span.

    Attributes:
        labels (List[str]): Entity type names, e.g., ["Person", "Organization", "Location"].
            ``labels[i]`` is the type reported for a span opened by ``begin_labels[i]``.
        begin_labels (List[str]): Tags marking the first token of an entity, e.g., ["B-PER", "B-ORG", "B-LOC"].
        inside_labels (List[str]): Tags marking the continuation of an entity, e.g., ["I-PER", "I-ORG", "I-LOC"].
        outside_label (str): The tag for tokens outside of any entity, typically "O".

    Each produced span is a dict with the keys ``start`` and ``end`` (character
    offsets into the text, end exclusive), ``label`` (the entity type) and
    ``text`` (the covered substring).

    Example of instantiation and usage:
    ```python
    operator = IobExtractor(
        labels=["Person", "Organization", "Location"],
        begin_labels=["B-PER", "B-ORG", "B-LOC"],
        inside_labels=["I-PER", "I-ORG", "I-LOC"],
        outside_label="O",
    )

    instance = {
        "text": "John Doe works at OpenAI",
        "labels": ["B-PER", "I-PER", "O", "O", "B-ORG"],
        "tokens": ["John", "Doe", "works", "at", "OpenAI"],
    }
    processed_instance = operator.process(instance)
    print(processed_instance["spans"])
    # Output: [{'start': 0, 'label': 'Person', 'end': 8, 'text': 'John Doe'},
    #          {'start': 18, 'label': 'Organization', 'end': 24, 'text': 'OpenAI'}]
    ```

    For more details on the IOB tagging convention, see: https://en.wikipedia.org/wiki/Inside-outside-beginning_(tagging)

    """

    labels: List[str]
    begin_labels: List[str]
    inside_labels: List[str]
    outside_label: str

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Add a ``"spans"`` entry holding the entities found in ``instance``.

        Args:
            instance: Must contain ``"labels"`` (one IOB tag per token),
                ``"tokens"`` (the tokens, in order of appearance) and
                ``"text"`` (the raw text in which the tokens occur).
            stream_name: Unused by this operator.

        Returns:
            The same ``instance`` object, updated in place with a ``"spans"``
            list of ``{"start", "label", "end", "text"}`` dicts.

        Raises:
            ValueError: If a token cannot be found in the text at or after the
                end of the previous token.
        """
        labels = instance["labels"]
        tokens = instance["tokens"]
        text = instance["text"]

        spans = []
        current_pos = 0
        # True only while the previous token belonged to the last span, so an
        # inside tag that follows an outside token cannot stretch an already
        # finished entity across the tokens in between.
        span_open = False

        for label, token in zip(labels, tokens):
            # Search from the end of the previous token so repeated tokens
            # resolve to successive occurrences rather than the first one.
            token_pos = text.find(token, current_pos)
            if token_pos == -1:
                raise ValueError(
                    f"Token '{token}' not found in text '{text}' starting from position {current_pos}"
                )

            end_pos = token_pos + len(token)

            if label in self.begin_labels:
                span = {
                    "start": token_pos,
                    "label": self.labels[self.begin_labels.index(label)],
                    "end": end_pos,
                }
                spans.append(span)
                span_open = True
            elif label in self.inside_labels and span_open:
                spans[-1]["end"] = end_pos
            else:
                # Outside token, unknown tag, or an inside tag with no entity
                # to continue: whatever span was open ends here.
                span_open = False

            current_pos = end_pos

        for span in spans:
            span["text"] = text[span["start"] : span["end"]]

        instance["spans"] = spans
        return instance
|