es_metaextract_umsa_v2 / year_matcher_component.py

Initial release

90bf6af verified about 1 year ago

479 Bytes

	from spacy.tokens import Doc, Span
	from spacy import Language
	import re


	@Language.component("year_matcher")
	def year_matcher(doc: Doc) -> Doc:
	    """Tag the last unlabelled token containing four digits as a ``YEAR`` entity.

	    Tokens that already carry an entity type (``token.ent_type != 0``) are
	    skipped. Among the remaining tokens, the last one whose text contains
	    four consecutive digits anywhere (so longer digit runs also qualify)
	    is wrapped in a one-token ``YEAR`` span and appended to the document's
	    existing entities. If no token qualifies, the existing entities are
	    written back unchanged.

	    Args:
	        doc: Document to annotate; its entities are updated via
	            ``doc.set_ents``.

	    Returns:
	        The same ``doc`` object that was passed in.
	    """
	    four_digits = re.compile(r'\d{4}')
	    last_year_index = None
	    for token in doc:
	        # An ent_type of 0 means no entity label is set on this token.
	        if token.ent_type == 0 and four_digits.search(token.text):
	            # Only the final match is ever used, so remember its position
	            # rather than building a Span for every hit.
	            last_year_index = token.i
	    entities = list(doc.ents)
	    if last_year_index is not None:
	        entities.append(Span(doc, last_year_index, last_year_index + 1, 'YEAR'))
	    doc.set_ents(entities)
	    return doc