Update pii_anonymizer.py
Browse files- pii_anonymizer.py +60 -1
pii_anonymizer.py
CHANGED
@@ -3,4 +3,63 @@ import re
|
|
3 |
from collections import Counter
|
4 |
|
5 |
# Load the spaCy model, confirm that additional download has been made
|
6 |
-
nlp = spacy.load("en_core_web_sm")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from collections import Counter

# Load the spaCy English pipeline. This requires the separate model download:
#   python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
+
def anonymize_text_including_proper_nouns_and_addresses(text):
|
9 |
+
"""
|
10 |
+
Anonymizes text by replacing occurrences of specific PII types and proper nouns, including potential physical addresses,
|
11 |
+
with unique placeholders. It utilizes both regex for specific PII patterns and spaCy for NER to identify proper nouns.
|
12 |
+
Parameters:
|
13 |
+
- text: The input text string to be anonymized.
|
14 |
+
- pii_config: A dictionary specifying the PII types to anonymize, where each key is a PII type (e.g., 'phone_number')
|
15 |
+
and each value is a tuple containing a regex pattern to identify the PII and a placeholder string for its anonymization.
|
16 |
+
Returns:
|
17 |
+
- anonymized_text: The text after PII and proper nouns have been anonymized.
|
18 |
+
- lookup_table: A dictionary mapping the anonymized placeholders back to the original text strings, enabling potential restoration.
|
19 |
+
"""
|
20 |
+
anonymized_text = text
|
21 |
+
lookup_table = {}
|
22 |
+
counters = Counter()
|
23 |
+
pii_config = {
|
24 |
+
'phone_number': (r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b', 'PHONE__'),
|
25 |
+
'text_address': (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', 'EMAIL__'),
|
26 |
+
'social_security_number': (r'\b\d{3}-\d{2}-\d{4}\b', 'SSN__'),
|
27 |
+
'website': (r'\b(?:http://|https://)?(?:www\.)?[a-zA-Z0-9./]+\.[a-z]{2,}\b', 'WEBSITE__')
|
28 |
+
}
|
29 |
+
|
30 |
+
# First, handle regex-based PII detection and anonymization
|
31 |
+
for pii_type, (pattern, placeholder) in pii_config.items():
|
32 |
+
for match in re.finditer(pattern, text):
|
33 |
+
original = match.group()
|
34 |
+
counters[pii_type] += 1
|
35 |
+
placeholder_with_counter = f"{placeholder}{counters[pii_type]}"
|
36 |
+
anonymized_text = anonymized_text.replace(original, placeholder_with_counter, 1)
|
37 |
+
lookup_table[placeholder_with_counter] = original
|
38 |
+
|
39 |
+
# Process the text with spaCy for NER and part-of-speech tagging
|
40 |
+
doc = nlp(anonymized_text)
|
41 |
+
# Sort entities and tokens to replace longer phrases first to avoid nested replacement issues
|
42 |
+
ents_and_tokens = sorted(list(doc.ents) + [token for token in doc if token.pos_ == "PROPN"], key=lambda x: -len(x.text))
|
43 |
+
|
44 |
+
for ent_or_token in ents_and_tokens:
|
45 |
+
if isinstance(ent_or_token, spacy.tokens.Token):
|
46 |
+
# For individual proper nouns (tokens), we use the 'PROPN' label
|
47 |
+
label = 'PROPN'
|
48 |
+
else:
|
49 |
+
# For recognized named entities
|
50 |
+
label = ent_or_token.label_
|
51 |
+
|
52 |
+
if label in ['ORG', 'PERSON', 'GPE', 'LOC', 'FAC', 'PROPN']: # Optionally include'PROPN' for more restriction
|
53 |
+
original = ent_or_token.text
|
54 |
+
counters[label] += 1
|
55 |
+
placeholder_with_counter = f"{label}__{counters[label]}"
|
56 |
+
if original in anonymized_text: # Check if the text still contains the original
|
57 |
+
anonymized_text = anonymized_text.replace(original, placeholder_with_counter, 1)
|
58 |
+
lookup_table[placeholder_with_counter] = original
|
59 |
+
|
60 |
+
return anonymized_text
|
61 |
+
|
62 |
+
sample_text_1 = """
|
63 |
+
Contact John Doe at 123-456-7890, visit example.com or [email protected].
|
64 |
+
My SSN is 123-45-6789. Meet me at 123 Gary Crant, in Casablanca
|
65 |
+
"""
|