import gradio as gr
import spacy
import re
from collections import Counter

# Load the spaCy English model; it must be downloaded separately first (e.g. `python -m spacy download en_core_web_sm`)
nlp = spacy.load("en_core_web_sm")

def anonymize_text_including_proper_nouns_and_addresses(text):
    """
    Anonymizes text by replacing occurrences of specific PII types and proper nouns, including potential physical addresses, 
    with unique placeholders. It utilizes both regex for specific PII patterns and spaCy for NER to identify proper nouns.

    Parameters:
    - text: The input text string to be anonymized.
    - pii_config: A dictionary specifying the PII types to anonymize, where each key is a PII type (e.g., 'phone_number') 
      and each value is a tuple containing a regex pattern to identify the PII and a placeholder string for its anonymization.

    Returns:
    - anonymized_text: The text after PII and proper nouns have been anonymized.
    - lookup_table: A dictionary mapping the anonymized placeholders back to the original text strings, enabling potential restoration.
    """
    anonymized_text = text
    lookup_table = {}
    counters = Counter()
    pii_config = {
        'phone_number': (r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b', 'PHONE__'),
        'email_address': (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 'EMAIL__'),
        'social_security_number': (r'\b\d{3}-\d{2}-\d{4}\b', 'SSN__'),
        'website': (r'\b(?:http://|https://)?(?:www\.)?[a-zA-Z0-9./]+\.[a-z]{2,}\b', 'WEBSITE__')
    }
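    # For example, "Call 123-456-7890" becomes "Call PHONE__1", and lookup_table["PHONE__1"] maps back to "123-456-7890".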
    
    # First, handle regex-based PII detection and anonymization
    for pii_type, (pattern, placeholder) in pii_config.items():
        for match in re.finditer(pattern, text):
            original = match.group()
            counters[pii_type] += 1
            placeholder_with_counter = f"{placeholder}{counters[pii_type]}"
            anonymized_text = anonymized_text.replace(original, placeholder_with_counter, 1)
            lookup_table[placeholder_with_counter] = original

    # Process the text with spaCy for NER and part-of-speech tagging
    doc = nlp(anonymized_text)
    # Sort entities and tokens to replace longer phrases first to avoid nested replacement issues
    ents_and_tokens = sorted(list(doc.ents) + [token for token in doc if token.pos_ == "PROPN"], key=lambda x: -len(x.text))
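    # e.g. a multi-word entity such as "United States" is handled before the individual PROPN tokens
    # "United" and "States" that make it up, so the whole span gets a single placeholder.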

    for ent_or_token in ents_and_tokens:
        if isinstance(ent_or_token, spacy.tokens.Token):
            # For individual proper nouns (tokens), we use the 'PROPN' label
            label = 'PROPN'
        else:
            # For recognized named entities
            label = ent_or_token.label_
        
        if label in ['ORG', 'PERSON', 'GPE', 'LOC', 'FAC', 'PROPN']:  # drop 'PROPN' to restrict replacement to named entities only
            original = ent_or_token.text
            counters[label] += 1
            placeholder_with_counter = f"{label}__{counters[label]}"
            if original in anonymized_text:  # Check if the text still contains the original
                anonymized_text = anonymized_text.replace(original, placeholder_with_counter, 1)
                lookup_table[placeholder_with_counter] = original

    return anonymized_text

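# A minimal sketch (hypothetical helper, not wired into the Gradio demo below) of how the lookup table
# built inside the function above could be used to restore the original text, assuming the caller keeps a copy of it.
def deanonymize_text(anonymized_text, lookup_table):
    """Replace each placeholder with the original string it stands for."""
    restored_text = anonymized_text
    for placeholder, original in lookup_table.items():
        restored_text = restored_text.replace(placeholder, original)
    return restored_text
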
sample_text_1 = """
Contact John Doe at 123-456-7890, visit example.com or [email protected]. 
My SSN is 123-45-6789. Meet me at 123 Gary Crant, in Casablanca
"""
sample_text_2 = """
Hello,

I'm so sorry for the late response.

The title of the paper is: The Strange Image of Despondent Banking in the United States (1940-2052): An NLP-based Analysis.

And I am posting the annotation below.

Abstract: The working paper analyses hundreds of articles from the most influential periodicals over the past 32 years using Natural Language Processing (NLP) techniques. The analysis focuses on content analysis, specifically sentiment analysis, and a comparison with the performance of correspondent banking. Archival sources, including digital versions of newspapers and magazines, were analysed using Python programming and libraries such as NLTK, TextBlob, and VADER.

Thank you very much.

Sincerely, Paul Haggerty
"""

demo = gr.Interface(
    fn=anonymize_text_including_proper_nouns_and_addresses,
    inputs=["text"],
    outputs=["text"],
    examples=[[sample_text_1], [sample_text_2]]
)

demo.launch()