# es_metaextract_umsa_v2 / year_matcher_component.py
# Uploaded by riturralde -- "Initial release" (commit 90bf6af, verified), 479 bytes.
from spacy.tokens import Doc, Span
from spacy import Language
import re
@Language.component("year_matcher")
def year_matcher(doc: Doc):
    """Tag the last unlabelled four-digit token in *doc* as a ``YEAR`` entity.

    Every token whose ``ent_type`` is 0 (no entity label assigned) and whose
    text contains four consecutive digits anywhere is a candidate. Only the
    final candidate in document order is added; the entities already on the
    document are kept.

    Note that the pattern is unanchored, so tokens such as ``"12345"`` or
    ``"(1999)"`` also qualify.

    Args:
        doc: The document being processed by the pipeline.

    Returns:
        The same ``doc``, with its entities re-set to the previous entities
        plus (at most) one single-token ``YEAR`` span.
    """
    # One single-token span per candidate, in document order.
    year_spans = [
        Span(doc, tok.i, tok.i + 1, 'YEAR')
        for tok in doc
        if tok.ent_type == 0 and re.search(r'\d{4}', tok.text)
    ]

    merged_ents = [*doc.ents]
    if year_spans:
        # Only the last candidate in the document is kept.
        merged_ents.append(year_spans[-1])

    doc.set_ents(merged_ents)
    return doc