Spaces:

Straive-Kripa
/

redux

Build error

App Files Files Community

redux / app.py

Straive-Kripa

Update app.py

b04e78f verified 9 months ago

raw

history blame contribute delete

4.26 kB

	import gradio as gr
	import spacy
	import re
	from collections import Counter

	# Build the spaCy pipeline once, at import time, so it is reused by every call below.
	# The "en_core_web_sm" model is an additional download on top of spaCy itself; make sure
	# it is installed in the environment before this module is loaded.
	nlp = spacy.load(name="en_core_web_sm")

	def _anonymize_with_lookup(text):
	    """
	    Anonymizes text and also reports which original string each placeholder replaced.

	    Regex-detectable PII (phone numbers, e-mail addresses, SSNs, websites) is replaced first; the
	    redacted text is then run through spaCy and selected named entities plus proper-noun tokens are
	    replaced as well. Placeholders look like 'PHONE__1' or 'PERSON__2' and are numbered per category
	    in the order they appear in the text.

	    Parameters:
	    - text: The input text string to be anonymized. Empty or None input yields ("", {}).

	    Returns:
	    - anonymized_text: The text after PII and proper nouns have been replaced by placeholders.
	    - lookup_table: A dictionary mapping each placeholder back to the original string it replaced,
	    enabling potential restoration.
	    """
	    if not text:
	        return "", {}

	    # Each entry maps a PII type to (regex pattern, placeholder prefix). Order matters: the patterns
	    # are applied one after another to the already-redacted text, so e-mail addresses are consumed
	    # before the broader website pattern gets a chance to claim their domain part.
	    pii_config = {
	        'phone_number': (r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b', 'PHONE__'),
	        'email_address': (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 'EMAIL__'),
	        'social_security_number': (r'\b\d{3}-\d{2}-\d{4}\b', 'SSN__'),
	        'website': (r'\b(?:https?://)?(?:www\.)?[a-zA-Z0-9./]+\.[a-z]{2,}\b', 'WEBSITE__'),
	    }
	    # Named-entity labels that are anonymized; proper-noun tokens are handled separately as 'PROPN'.
	    entity_labels = ('ORG', 'PERSON', 'GPE', 'LOC', 'FAC')

	    lookup_table = {}
	    counters = Counter()

	    def new_placeholder(counter_key, prefix, original):
	        # Allocate the next numbered placeholder of this category and remember what it hides.
	        counters[counter_key] += 1
	        placeholder = f"{prefix}{counters[counter_key]}"
	        lookup_table[placeholder] = original
	        return placeholder

	    # First, handle regex-based PII detection and anonymization. re.sub replaces every match exactly
	    # where it was found, so a counter is only consumed for text that is really substituted.
	    anonymized_text = text
	    for pii_type, (pattern, prefix) in pii_config.items():
	        def substitute(match, pii_type=pii_type, prefix=prefix):
	            return new_placeholder(pii_type, prefix, match.group())

	        anonymized_text = re.sub(pattern, substitute, anonymized_text)

	    # Character spans already occupied by regex placeholders; the NER pass must not touch them,
	    # otherwise e.g. 'PHONE__1' could itself be replaced by a 'PROPN__n' placeholder.
	    prefix_alternation = "|".join(re.escape(prefix) for _, prefix in pii_config.values())
	    occupied = [m.span() for m in re.finditer(rf"(?:{prefix_alternation})\d+", anonymized_text)]

	    # Process the text with spaCy for NER and part-of-speech tagging, collecting candidates as
	    # (start offset, end offset, label) so replacements are made by position, not by text search.
	    doc = nlp(anonymized_text)
	    candidates = [
	        (ent.start_char, ent.end_char, ent.label_)
	        for ent in doc.ents
	        if ent.label_ in entity_labels
	    ]
	    candidates.extend(
	        (token.idx, token.idx + len(token.text), 'PROPN')
	        for token in doc
	        if token.pos_ == "PROPN"
	    )
	    # Longest span first, so a multi-word entity wins over the proper-noun tokens inside it. The sort
	    # is stable, so an entity also wins over a single token covering exactly the same characters.
	    candidates.sort(key=lambda span: span[0] - span[1])

	    selected = []
	    for start, end, label in candidates:
	        overlaps = any(start < taken_end and taken_start < end for taken_start, taken_end in occupied)
	        if overlaps:
	            continue
	        occupied.append((start, end))
	        selected.append((start, end, label))

	    # Rebuild the text left to right, swapping each selected span for its placeholder.
	    pieces = []
	    cursor = 0
	    for start, end, label in sorted(selected):
	        pieces.append(anonymized_text[cursor:start])
	        pieces.append(new_placeholder(label, f"{label}__", anonymized_text[start:end]))
	        cursor = end
	    pieces.append(anonymized_text[cursor:])

	    return "".join(pieces), lookup_table


	def anonymize_text_including_proper_nouns_and_addresses(text):
	    """
	    Anonymizes text by replacing occurrences of specific PII types and proper nouns, including potential
	    physical addresses, with unique placeholders. It utilizes both regex for specific PII patterns and
	    spaCy for NER to identify proper nouns.

	    Parameters:
	    - text: The input text string to be anonymized. Empty or None input yields an empty string.

	    Returns:
	    - anonymized_text: The text after PII and proper nouns have been anonymized. Only the text is
	    returned; the placeholder-to-original mapping is available from _anonymize_with_lookup.
	    """
	    anonymized_text, _lookup_table = _anonymize_with_lookup(text)
	    return anonymized_text

	# Short example text containing a person name, a phone number, a bare domain, an SSN and a
	# street/city reference. It is offered as a clickable example in the Gradio interface below.
	# NOTE(review): "[email protected]" looks like an e-mail-obfuscation artifact rather than a real
	# address; it contains no "@", so the EMAIL pattern cannot match it -- confirm the intended sample.
	sample_text_1 = """
	Contact John Doe at 123-456-7890, visit example.com or [email protected].
	My SSN is 123-45-6789. Meet me at 123 Gary Crant, in Casablanca
	"""
	# Longer, prose-heavy example: an e-mail with a paper title and abstract, signed with a personal
	# name. It is the second clickable example in the Gradio interface below.
	sample_text_2 = """
	Hello,

	I'm so sorry for the late response.

	The title of the paper is: The Strange Image of Despondent Banking in the United States (1940-2052): An NLP-based Analysis.

	And I am posting the annotation below.

	Abstract: The working paper analyses hundreds of articles from the most influential periodicals over the past 32 years using Natural Language Processing (NLP) techniques. The analysis focuses on content analysis, specifically sentiment analysis, and a comparison with the performance of correspondent banking. Archival sources, including digital versions of newspapers and magazines, were analysed using Python programming and libraries such as NLTK, TextBlob, and VADER.

	Thank you very much.

	Sincerely, Paul Haggerty
	"""

	# Web UI: one text box in, one text box out, backed by the anonymizer above.
	# Each example is a one-element row because the interface has a single input component.
	demo = gr.Interface(
	    examples=[[sample_text_1], [sample_text_2]],
	    inputs=["text"],
	    outputs=["text"],
	    fn=anonymize_text_including_proper_nouns_and_addresses,
	)

	# Start serving the interface.
	demo.launch()