Spaces:

trojblue
/

anonymizing-llm-prompts

Running

App Files Files Community

trojblue committed 16 days ago

Commit

337fbc2

1 Parent(s): 12fe978

adding space files

Browse files

Files changed (3) hide show

app.py +177 -0
labels.txt +41 -0
requirements.txt +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,177 @@

+import re
+import gradio as gr
+from gliner import GLiNER
+from cerberus import Validator
+# ----------------------------------------------------------------------------
+# Load model + labels
+# ----------------------------------------------------------------------------
+model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")
+with open("labels.txt", "r", encoding="utf-8") as f:
+    labels = [line.strip() for line in f.readlines()]
+# ----------------------------------------------------------------------------
+# Simple Cerberus validation for incoming data
+# ----------------------------------------------------------------------------
+# We expect a dict with at least {"text": "<some string>"}
+schema = {
+    "text": {
+        "type": "string",
+        "empty": False
+    }
+}
+validator = Validator(schema)
+def validate_input(data: dict) -> str:
+    """Validate that data has a non-empty 'text' key."""
+    if not validator.validate(data):
+        # If invalid, raise an exception. You could handle this more gracefully if you like.
+        raise ValueError(f"Invalid input data. Errors: {validator.errors}")
+    return data["text"]
+# ----------------------------------------------------------------------------
+# Core anonymize / de-anonymize logic (same as before)
+# ----------------------------------------------------------------------------
+def anonymize_text(text):
+    """
+    1) Detect PII using GLiNER,
+    2) Replace each entity with a placeholder (<PII_LABEL_INDEX>)
+    3) Return anonymized_text + entity_map
+    """
+    entities = model.predict_entities(text, labels=labels, threshold=0.2)
+    # Sort by start index to apply placeholders in correct order
+    entities.sort(key=lambda e: e['start'])
+    entity_map = {}  # e.g. {'PERSON': ['Alice', 'Bob']}
+    anonymized_text = ""
+    next_start = 0
+    for entity in entities:
+        label = entity['label'].replace(" ", "_").upper()
+        original_text = entity['text']
+        start_idx, end_idx = entity['start'], entity['end']
+        if label not in entity_map:
+            entity_map[label] = [original_text]
+            idx = 1
+        else:
+            # If same exact string repeated, use the same index as before
+            if original_text in entity_map[label]:
+                idx = entity_map[label].index(original_text) + 1
+            else:
+                entity_map[label].append(original_text)
+                idx = len(entity_map[label])
+        # Copy everything before this entity
+        anonymized_text += text[next_start:start_idx]
+        # Insert placeholder
+        anonymized_text += f"<PII_{label}_{idx}>"
+        next_start = end_idx
+    # Remainder of the text after last entity
+    anonymized_text += text[next_start:]
+    return anonymized_text, entity_map
+def deanonymize_text(anonymized_response, entity_map):
+    """
+    Replace <PII_LABEL_INDEX> placeholders in anonymized_response
+    with their original strings from entity_map.
+    """
+    def replace_match(match):
+        label = match.group(1)  # e.g. "PERSON"
+        idx_str = match.group(2)  # e.g. "1"
+        idx = int(idx_str) - 1    # 1-based index -> 0-based list index
+        if label in entity_map and 0 <= idx < len(entity_map[label]):
+            return entity_map[label][idx]
+        return match.group(0)  # If something is off, return the placeholder as-is
+    pattern = r"<PII_(\w+)_(\d+)>"
+    return re.sub(pattern, replace_match, anonymized_response)
+# ----------------------------------------------------------------------------
+# Gradio Interface
+# ----------------------------------------------------------------------------
+def anonymize_fn(original_text):
+    # We’ll do a simple dict so we can pass it to our Cerberus validator:
+    data = {"text": original_text}
+    try:
+        user_text = validate_input(data)
+    except ValueError as e:
+        # If invalid, show error in Gradio output
+        return "", {}, f"Validation error: {str(e)}"
+    anonymized, entities = anonymize_text(user_text)
+    return anonymized, entities, "Anonymized successfully!"
+def deanonymize_fn(anonymized_llm_response, entity_map):
+    if not anonymized_llm_response.strip():
+        return "", "Please provide an anonymized LLM response."
+    if not entity_map:
+        return "", "No entity map found; anonymize some text first."
+    result = deanonymize_text(anonymized_llm_response, entity_map)
+    return result, "De-anonymized successfully!"
+# Markdown source for the page header; passed to gr.Markdown when the UI
+# is built.
+md_text = """# Anonymizing LLM Prompts
+Paste text into "Original Text" section to remove sensitive information, using `gliner_multi_pii-v1` for recognition.
+The demo is adapted from [Elara](https://github.com/amanvirparhar/elara) by amanvirparhar. If you like this one, give the original a star!
+"""
+# Build the UI: a header followed by one row with two columns. The first
+# column anonymizes text; the second restores placeholders in an LLM reply.
+with gr.Blocks() as demo:
+    gr.Markdown(md_text)
+    with gr.Row():
+        # First column: original text in, anonymized text out.
+        with gr.Column():
+            original_text = gr.Textbox(
+                lines=6, label="Original Text (Anonymize)"
+            )
+            anonymized_text = gr.Textbox(
+                lines=6, label="Anonymized Text", interactive=False
+            )
+            button_anon = gr.Button("Anonymize")
+            # Hidden state to store the entity map
+            entity_map_state = gr.State()
+            message_out = gr.Textbox(label="Status", interactive=False)
+            # anonymize_fn returns (anonymized text, entity map, status),
+            # matching the three outputs in order.
+            button_anon.click(
+                anonymize_fn,
+                inputs=[original_text],
+                outputs=[anonymized_text, entity_map_state, message_out]
+            )
+        # Second column: pasted LLM response in, de-anonymized text out.
+        with gr.Column():
+            anonymized_llm_response = gr.Textbox(
+                lines=6, label="Anonymized LLM Response (Paste here)"
+            )
+            deanonymized_text = gr.Textbox(
+                lines=6, label="De-anonymized LLM Response", interactive=False
+            )
+            button_deanon = gr.Button("De-anonymize")
+            message_out_de = gr.Textbox(label="Status", interactive=False)
+            # Reads the entity map stored by the anonymize step;
+            # deanonymize_fn returns (restored text, status).
+            button_deanon.click(
+                deanonymize_fn,
+                inputs=[anonymized_llm_response, entity_map_state],
+                outputs=[deanonymized_text, message_out_de]
+            )
+# Launch the Gradio app only when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch()

labels.txt ADDED Viewed

	@@ -0,0 +1,41 @@

+person
+organization
+phone number
+address
+email
+ip address
+username
+date of birth
+passport number
+credit card number
+social security number
+health insurance id number
+mobile phone number
+bank account number
+driver's license number
+tax identification number
+identity card number
+national id number
+registration number
+student id number
+insurance number
+serial number
+fax number
+visa number
+identity document number
+transaction number
+credit card brand
+license plate number
+vehicle registration number
+landline phone number
+blood type
+cvv
+cvc
+digital signature
+postal code
+insurance company
+passport expiration date
+medication
+medical condition
+credit card expiration date
+api key

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+gradio
+gliner
+cerberus