import re

from spacy import Language
from spacy.tokens import Doc, Span


@Language.component("year_matcher")
def year_matcher(doc: Doc):
    """Tag the last year-like, not-yet-labelled token in *doc* as ``YEAR``.

    A token is a candidate when its ``ent_type`` is 0 and its text contains
    four consecutive digits anywhere (``re.search``, so longer digit runs
    also qualify). Only the final candidate in document order is turned into
    an entity; earlier candidates are ignored.

    Args:
        doc: The document being processed by the pipeline.

    Returns:
        The same ``doc``, with its entities reset to the previous entities
        plus, if a candidate was found, one single-token ``YEAR`` span.
    """
    # Indices of tokens that carry no entity type yet and look like a year.
    candidate_indices = [
        tok.i
        for tok in doc
        if tok.ent_type == 0 and re.search(r'\d{4}', tok.text)
    ]

    spans = list(doc.ents)
    if candidate_indices:
        # Only the last match in the document is kept.
        last = candidate_indices[-1]
        spans.append(Span(doc, last, last + 1, 'YEAR'))

    doc.set_ents(spans)
    return doc