File size: 479 Bytes
90bf6af |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
from spacy.tokens import Doc, Span
from spacy import Language
import re
# Compiled once at import time. The pattern is intentionally unanchored, so it
# matches any token whose text *contains* four consecutive digits.
_YEAR_RE = re.compile(r"\d{4}")


@Language.component("year_matcher")
def year_matcher(doc: Doc) -> Doc:
    """Label the last year-like token in *doc* as a ``YEAR`` entity.

    Tokens that already belong to an entity are skipped. Among the remaining
    tokens, the last one whose text contains four consecutive digits is
    wrapped in a single-token ``YEAR`` span and appended to the existing
    entities. If no token qualifies, the entities are re-set unchanged.

    Args:
        doc: The document being processed by the pipeline.

    Returns:
        The same ``doc`` object, with its entities updated in place.
    """
    entities = list(doc.ents)

    # Only the final match is ever used, so remember its index instead of
    # building a Span for every candidate token.
    last_year_index = None
    for token in doc:
        # ent_type == 0 means the token carries no entity label yet.
        if token.ent_type == 0 and _YEAR_RE.search(token.text):
            last_year_index = token.i

    if last_year_index is not None:
        entities.append(Span(doc, last_year_index, last_year_index + 1, "YEAR"))

    doc.set_ents(entities)
    return doc
|