Straive-Kripa committed on
Commit
199fec8
·
verified ·
1 Parent(s): bb9019d

Update pii_anonymizer.py

Browse files
Files changed (1) hide show
  1. pii_anonymizer.py +60 -1
pii_anonymizer.py CHANGED
@@ -3,4 +3,63 @@ import re
3
  from collections import Counter
4
 
5
  # Load the spaCy model, confirm that additional download has been made
6
- nlp = spacy.load("en_core_web_sm")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from collections import Counter
4
 
5
  # Load the spaCy model, confirm that additional download has been made
6
+ nlp = spacy.load("en_core_web_sm")
7
+
8
def _overlaps(start, end, taken):
    """Return True when the half-open span [start, end) intersects any span in *taken*."""
    return any(start < t_end and t_start < end for t_start, t_end in taken)


def anonymize_text_including_proper_nouns_and_addresses(
    text, pii_config=None, *, return_lookup_table=False
):
    """
    Anonymize PII and proper nouns in *text* by substituting numbered placeholders.

    Two passes are made. Regex patterns first replace structured PII (phone
    numbers, e-mail addresses, SSNs, websites). The result is then run through
    the module-level spaCy pipeline ``nlp``, and named entities labelled
    ORG/PERSON/GPE/LOC/FAC as well as tokens tagged ``PROPN`` are replaced.
    All replacements are applied by character offset, so exactly the matched
    span is substituted rather than the first textually equal substring.

    Parameters:
    - text: The input text string to be anonymized.
    - pii_config: Optional dictionary mapping a PII type name to a tuple
      ``(regex_pattern, placeholder_prefix)``. Entries are tried in dictionary
      order; a match that overlaps a span already claimed by an earlier entry
      is ignored. When omitted, the built-in patterns are used.
    - return_lookup_table: Keyword-only. When True, the lookup table is
      returned together with the anonymized text.

    Returns:
    - anonymized_text: The text after PII and proper nouns have been
      anonymized (the only return value by default).
    - lookup_table: Only when ``return_lookup_table`` is True, as the second
      element of a ``(anonymized_text, lookup_table)`` tuple. Maps each
      placeholder back to the original string it replaced.
    """
    if pii_config is None:
        pii_config = {
            'phone_number': (r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b', 'PHONE__'),
            # Historical key name; this entry matches e-mail addresses.
            # The TLD class is [A-Za-z] (the previous "[A-Z|a-z]" also accepted "|").
            'text_address': (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 'EMAIL__'),
            'social_security_number': (r'\b\d{3}-\d{2}-\d{4}\b', 'SSN__'),
            'website': (r'\b(?:http://|https://)?(?:www\.)?[a-zA-Z0-9./]+\.[a-z]{2,}\b', 'WEBSITE__'),
        }

    lookup_table = {}
    counters = Counter()

    # --- Pass 1: regex-based PII -------------------------------------------
    # Collect non-overlapping matches against the untouched input so that
    # offsets stay valid; earlier config entries win over later ones (e.g. an
    # e-mail address is not re-matched as a website through its domain).
    claimed = []      # (start, end) spans of accepted matches in `text`
    regex_hits = []   # (start, end, placeholder)
    for pii_type, (pattern, placeholder) in pii_config.items():
        for match in re.finditer(pattern, text):
            start, end = match.span()
            if start == end or _overlaps(start, end, claimed):
                continue
            counters[pii_type] += 1
            tag = f"{placeholder}{counters[pii_type]}"
            lookup_table[tag] = match.group()
            claimed.append((start, end))
            regex_hits.append((start, end, tag))

    # Rebuild the text left to right, remembering where each placeholder lands
    # so the NER pass below never anonymizes a placeholder a second time.
    pieces = []
    protected = []    # (start, end) spans in the rebuilt text that must not be touched
    cursor = 0
    out_len = 0
    for start, end, tag in sorted(regex_hits):
        pieces.append(text[cursor:start])
        out_len += start - cursor
        pieces.append(tag)
        protected.append((out_len, out_len + len(tag)))
        out_len += len(tag)
        cursor = end
    pieces.append(text[cursor:])
    anonymized_text = "".join(pieces)

    # --- Pass 2: spaCy entities and proper nouns ---------------------------
    doc = nlp(anonymized_text)
    candidates = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    candidates += [
        (token.idx, token.idx + len(token.text), 'PROPN')
        for token in doc
        if token.pos_ == "PROPN"
    ]
    # Longest spans first (stable sort): a multi-word entity claims its range
    # before the individual proper-noun tokens inside it are considered.
    candidates.sort(key=lambda span: span[0] - span[1])

    ner_labels = ('ORG', 'PERSON', 'GPE', 'LOC', 'FAC', 'PROPN')
    ner_hits = []
    for start, end, label in candidates:
        if label not in ner_labels:
            continue
        if start == end or _overlaps(start, end, protected):
            continue
        # The counter advances only for spans that are actually replaced, so
        # placeholder numbering has no gaps.
        counters[label] += 1
        tag = f"{label}__{counters[label]}"
        lookup_table[tag] = anonymized_text[start:end]
        protected.append((start, end))
        ner_hits.append((start, end, tag))

    # Apply right to left so earlier offsets remain valid while splicing.
    for start, end, tag in sorted(ner_hits, reverse=True):
        anonymized_text = anonymized_text[:start] + tag + anonymized_text[end:]

    if return_lookup_table:
        return anonymized_text, lookup_table
    return anonymized_text
61
+
62
+ sample_text_1 = """
63
+ Contact John Doe at 123-456-7890, visit example.com or [email protected].
64
+ My SSN is 123-45-6789. Meet me at 123 Gary Crant, in Casablanca
65
+ """