|
import spacy |
|
import re |
|
from collections import Counter |
|
|
|
|
|
# Load the spaCy pipeline once at import time; the anonymizer below reads this
# module-level object on every call instead of reloading the model.
nlp = spacy.load("en_core_web_sm")
|
|
|
# Default regex-based PII detectors: {pii_type: (pattern, placeholder_prefix)}.
# Order matters: when two patterns match overlapping text, the earlier entry wins.
_DEFAULT_PII_CONFIG = {
    'phone_number': (r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b', 'PHONE__'),
    # The key name is kept as-is; the pattern matches e-mail addresses.
    # The TLD class is [A-Za-z]: a '|' inside a character class is a literal pipe.
    'text_address': (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 'EMAIL__'),
    'social_security_number': (r'\b\d{3}-\d{2}-\d{4}\b', 'SSN__'),
    'website': (r'\b(?:http://|https://)?(?:www\.)?[a-zA-Z0-9./]+\.[a-z]{2,}\b', 'WEBSITE__'),
}

# spaCy entity labels that are replaced by a "<LABEL>__<n>" placeholder.
_NER_LABELS = frozenset({'ORG', 'PERSON', 'GPE', 'LOC', 'FAC'})


def _overlaps(start, end, taken):
    """Return True if the half-open range [start, end) intersects any span in *taken*."""
    return any(start < t_end and t_start < end for t_start, t_end, _prefix in taken)


def anonymize_text_including_proper_nouns_and_addresses(
        text, pii_config=None, *, return_lookup_table=False):
    """Replace PII and proper nouns in *text* with numbered placeholders.

    Detection happens in three priority tiers, all computed on the original
    text by character offset:

    1. regex matches from ``pii_config`` (in dictionary order),
    2. spaCy named entities labelled ORG, PERSON, GPE, LOC or FAC,
    3. remaining tokens whose part of speech is ``PROPN``.

    A candidate that overlaps an already accepted span is dropped, so a piece
    of text is never replaced twice and a placeholder is never re-anonymized.
    Placeholders are numbered per prefix in order of appearance in the text
    (``PHONE__1``, ``PERSON__1``, ``PERSON__2``, ...); every occurrence gets
    its own number.

    Args:
        text: The input string to anonymize.
        pii_config: Optional mapping ``{pii_type: (regex_pattern,
            placeholder_prefix)}``. Defaults to the built-in phone / e-mail /
            SSN / website patterns.
        return_lookup_table: When true, also return the placeholder-to-original
            mapping.

    Returns:
        The anonymized text. If ``return_lookup_table`` is true, a tuple
        ``(anonymized_text, lookup_table)`` where ``lookup_table`` maps each
        inserted placeholder to the exact substring it replaced.
    """
    if pii_config is None:
        pii_config = _DEFAULT_PII_CONFIG

    # Accepted, mutually non-overlapping (start, end, placeholder_prefix) spans.
    spans = []

    # Tier 1: regex-detected PII. Matching on the untouched text keeps offsets
    # valid; the overlap check stops e.g. the website pattern from claiming
    # the domain part of an e-mail address that was already accepted.
    for pattern, prefix in pii_config.values():
        for match in re.finditer(pattern, text):
            start, end = match.span()
            if start == end:
                continue  # an empty match has nothing to hide
            if not _overlaps(start, end, spans):
                spans.append((start, end, prefix))

    doc = nlp(text)

    # Tier 2: whole named entities, so multi-word names stay one placeholder.
    for ent in doc.ents:
        if ent.label_ not in _NER_LABELS:
            continue
        if not _overlaps(ent.start_char, ent.end_char, spans):
            spans.append((ent.start_char, ent.end_char, f"{ent.label_}__"))

    # Tier 3: proper-noun tokens not already covered by a regex hit or entity.
    for token in doc:
        if token.pos_ != "PROPN":
            continue
        start = token.idx
        end = start + len(token.text)
        if not _overlaps(start, end, spans):
            spans.append((start, end, "PROPN__"))

    # Rebuild the text left to right. Counters are keyed by prefix so two
    # detectors sharing a prefix can never produce the same placeholder.
    spans.sort()
    counters = Counter()
    lookup_table = {}
    pieces = []
    cursor = 0
    for start, end, prefix in spans:
        counters[prefix] += 1
        placeholder = f"{prefix}{counters[prefix]}"
        lookup_table[placeholder] = text[start:end]
        pieces.append(text[cursor:start])
        pieces.append(placeholder)
        cursor = end
    pieces.append(text[cursor:])
    anonymized_text = "".join(pieces)

    if return_lookup_table:
        return anonymized_text, lookup_table
    return anonymized_text
|
|
|
# Sample input exercising the phone, website and SSN patterns plus named entities.
# NOTE(review): "[email protected]" looks like an e-mail obfuscation artifact and
# will not match the e-mail regex - confirm the intended sample address.
sample_text_1 = """
Contact John Doe at 123-456-7890, visit example.com or [email protected].
My SSN is 123-45-6789. Meet me at 123 Gary Crant, in Casablanca
"""