Update pii_anonymizer.py
Browse files- pii_anonymizer.py +60 -1
pii_anonymizer.py
CHANGED
@@ -3,4 +3,63 @@ import re
|
|
3 |
from collections import Counter
|
4 |
|
5 |
# Load the spaCy model, confirm that additional download has been made
|
6 |
-
nlp = spacy.load("en_core_web_sm")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from collections import Counter

# Load the spaCy English pipeline. This requires the separate model download:
#   python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
+
def anonymize_text_including_proper_nouns_and_addresses(text):
|
9 |
+
"""
|
10 |
+
Anonymizes text by replacing occurrences of specific PII types and proper nouns, including potential physical addresses,
|
11 |
+
with unique placeholders. It utilizes both regex for specific PII patterns and spaCy for NER to identify proper nouns.
|
12 |
+
Parameters:
|
13 |
+
- text: The input text string to be anonymized.
|
14 |
+
- pii_config: A dictionary specifying the PII types to anonymize, where each key is a PII type (e.g., 'phone_number')
|
15 |
+
and each value is a tuple containing a regex pattern to identify the PII and a placeholder string for its anonymization.
|
16 |
+
Returns:
|
17 |
+
- anonymized_text: The text after PII and proper nouns have been anonymized.
|
18 |
+
- lookup_table: A dictionary mapping the anonymized placeholders back to the original text strings, enabling potential restoration.
|
19 |
+
"""
|
20 |
+
anonymized_text = text
|
21 |
+
lookup_table = {}
|
22 |
+
counters = Counter()
|
23 |
+
pii_config = {
|
24 |
+
'phone_number': (r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b', 'PHONE__'),
|
25 |
+
'text_address': (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', 'EMAIL__'),
|
26 |
+
'social_security_number': (r'\b\d{3}-\d{2}-\d{4}\b', 'SSN__'),
|
27 |
+
'website': (r'\b(?:http://|https://)?(?:www\.)?[a-zA-Z0-9./]+\.[a-z]{2,}\b', 'WEBSITE__')
|
28 |
+
}
|
29 |
+
|
30 |
+
# First, handle regex-based PII detection and anonymization
|
31 |
+
for pii_type, (pattern, placeholder) in pii_config.items():
|
32 |
+
for match in re.finditer(pattern, text):
|
33 |
+
original = match.group()
|
34 |
+
counters[pii_type] += 1
|
35 |
+
placeholder_with_counter = f"{placeholder}{counters[pii_type]}"
|
36 |
+
anonymized_text = anonymized_text.replace(original, placeholder_with_counter, 1)
|
37 |
+
lookup_table[placeholder_with_counter] = original
|
38 |
+
|
39 |
+
# Process the text with spaCy for NER and part-of-speech tagging
|
40 |
+
doc = nlp(anonymized_text)
|
41 |
+
# Sort entities and tokens to replace longer phrases first to avoid nested replacement issues
|
42 |
+
ents_and_tokens = sorted(list(doc.ents) + [token for token in doc if token.pos_ == "PROPN"], key=lambda x: -len(x.text))
|
43 |
+
|
44 |
+
for ent_or_token in ents_and_tokens:
|
45 |
+
if isinstance(ent_or_token, spacy.tokens.Token):
|
46 |
+
# For individual proper nouns (tokens), we use the 'PROPN' label
|
47 |
+
label = 'PROPN'
|
48 |
+
else:
|
49 |
+
# For recognized named entities
|
50 |
+
label = ent_or_token.label_
|
51 |
+
|
52 |
+
if label in ['ORG', 'PERSON', 'GPE', 'LOC', 'FAC', 'PROPN']: # Optionally include'PROPN' for more restriction
|
53 |
+
original = ent_or_token.text
|
54 |
+
counters[label] += 1
|
55 |
+
placeholder_with_counter = f"{label}__{counters[label]}"
|
56 |
+
if original in anonymized_text: # Check if the text still contains the original
|
57 |
+
anonymized_text = anonymized_text.replace(original, placeholder_with_counter, 1)
|
58 |
+
lookup_table[placeholder_with_counter] = original
|
59 |
+
|
60 |
+
return anonymized_text
|
61 |
+
|
62 |
+
sample_text_1 = """
|
63 |
+
Contact John Doe at 123-456-7890, visit example.com or [email protected].
|
64 |
+
My SSN is 123-45-6789. Meet me at 123 Gary Crant, in Casablanca
|
65 |
+
"""
|