import re

import gradio as gr
from gliner import GLiNER
from cerberus import Validator

# ----------------------------------------------------------------------------
# Load model + labels
# ----------------------------------------------------------------------------
model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")

with open("labels.txt", "r", encoding="utf-8") as f:
    # One label per line. Blank lines are skipped so that no empty label is
    # handed to the model.
    labels = [line.strip() for line in f if line.strip()]

# ----------------------------------------------------------------------------
# Simple Cerberus validation for incoming data
# ----------------------------------------------------------------------------
# We expect a dict with at least {"text": "<non-empty string>"}.
# "required" makes a missing key a validation error (reported as ValueError by
# validate_input) instead of a KeyError on the later data["text"] lookup.
schema = {
    "text": {
        "type": "string",
        "required": True,
        "empty": False,
    }
}

validator = Validator(schema)

# Placeholders have the form <LABEL_N>, e.g. <PERSON_1> or <PHONE_NUMBER_2>.
# Group 1 is the label, group 2 the 1-based index into entity_map[label].
# The label part is greedy, so the *last* underscore before the digits is the
# separator and labels that themselves contain underscores round-trip.
_PLACEHOLDER_RE = re.compile(r"<([^<>\s]+)_(\d+)>")


def validate_input(data: dict) -> str:
    """Validate that data has a non-empty string under the 'text' key.

    Args:
        data: Mapping to check against the module-level Cerberus schema.

    Returns:
        The value stored under ``data["text"]``.

    Raises:
        ValueError: If validation fails; the message includes the Cerberus
            error report.
    """
    if not validator.validate(data):
        raise ValueError(f"Invalid input data. Errors: {validator.errors}")
    return data["text"]


# ----------------------------------------------------------------------------
# Core anonymize / de-anonymize logic
# ----------------------------------------------------------------------------
def anonymize_text(text):
    """Replace detected PII in ``text`` with indexed placeholders.

    1) Detect entities with the GLiNER model (threshold 0.2),
    2) replace each entity with a placeholder of the form <LABEL_N>,
    3) return the anonymized text together with the entity map.

    The same surface string under the same label always gets the same index,
    so repeated mentions map to one placeholder.

    Args:
        text: The text to anonymize.

    Returns:
        A tuple ``(anonymized_text, entity_map)`` where ``entity_map`` maps
        each normalized label to the list of original strings, in order of
        first appearance, e.g. ``{'PERSON': ['Alice', 'Bob']}``.
    """
    entities = model.predict_entities(text, labels=labels, threshold=0.2)

    # Sort by start index so placeholders are applied left to right.
    entities.sort(key=lambda e: e['start'])

    entity_map = {}  # e.g. {'PERSON': ['Alice', 'Bob']}
    pieces = []
    next_start = 0

    for entity in entities:
        start_idx, end_idx = entity['start'], entity['end']

        # Defensive: a span starting inside the previously replaced span
        # would duplicate text when slicing, so it is skipped.
        # NOTE(review): the model may already de-overlap its output - confirm.
        if start_idx < next_start:
            continue

        label = entity['label'].replace(" ", "_").upper()
        original_text = entity['text']

        known = entity_map.setdefault(label, [])
        if original_text in known:
            # Same exact string seen before: reuse its index.
            idx = known.index(original_text) + 1
        else:
            known.append(original_text)
            idx = len(known)

        # Copy everything before this entity, then the placeholder.
        pieces.append(text[next_start:start_idx])
        pieces.append(f"<{label}_{idx}>")
        next_start = end_idx

    # Remainder of the text after the last entity.
    pieces.append(text[next_start:])

    return "".join(pieces), entity_map


def deanonymize_text(anonymized_response, entity_map):
    """Replace <LABEL_N> placeholders with their original strings.

    Args:
        anonymized_response: Text that may contain placeholders produced by
            ``anonymize_text``.
        entity_map: Mapping of label to list of original strings, as returned
            by ``anonymize_text``.

    Returns:
        The text with every resolvable placeholder substituted. Placeholders
        whose label or index is not present in ``entity_map`` are left as-is.
    """
    def replace_match(match):
        label = match.group(1)          # e.g. "PERSON"
        idx = int(match.group(2)) - 1   # 1-based index -> 0-based list index

        originals = entity_map.get(label)
        if originals is not None and 0 <= idx < len(originals):
            return originals[idx]
        return match.group(0)  # Unknown placeholder: keep it unchanged

    return _PLACEHOLDER_RE.sub(replace_match, anonymized_response)


# ----------------------------------------------------------------------------
# Gradio Interface
# ----------------------------------------------------------------------------
def anonymize_fn(original_text):
    """Gradio callback: validate the input and anonymize it.

    Returns:
        A tuple ``(anonymized_text, entity_map, status_message)``. On a
        validation error the first two items are ``""`` and ``{}``.
    """
    # Wrap the text in a dict so it can go through the Cerberus validator.
    data = {"text": original_text}
    try:
        user_text = validate_input(data)
    except ValueError as e:
        # If invalid, show the error in the Gradio status output.
        return "", {}, f"Validation error: {str(e)}"

    anonymized, entities = anonymize_text(user_text)
    return anonymized, entities, "Anonymized successfully!"
def deanonymize_fn(anonymized_llm_response, entity_map):
    """Gradio callback: restore original strings in an anonymized response.

    Args:
        anonymized_llm_response: Text pasted by the user; may be ``None`` or
            blank when the textbox is empty.
        entity_map: The entity map stored in the Gradio state by the
            anonymize step; falsy when nothing has been anonymized yet.

    Returns:
        A tuple ``(deanonymized_text, status_message)``. The text is ``""``
        when either input is missing.
    """
    # Guard against None before calling .strip(): the textbox value is not
    # guaranteed to be a string.
    if not anonymized_llm_response or not anonymized_llm_response.strip():
        return "", "Please provide an anonymized LLM response."
    if not entity_map:
        return "", "No entity map found; anonymize some text first."

    result = deanonymize_text(anonymized_llm_response, entity_map)
    return result, "De-anonymized successfully!"


# The title sits on its own line so that only it is rendered as a heading.
md_text = """# Anonymizing LLM Prompts

Paste text into "Original Text" section to remove sensitive information, using `gliner_multi_pii-v1` for recognition.

The demo is adapted from [Elara](https://github.com/amanvirparhar/elara) by amanvirparhar. If you like this one, give the original a star!
"""

with gr.Blocks() as demo:
    gr.Markdown(md_text)

    with gr.Row():
        # Left column: original text -> anonymized text.
        with gr.Column():
            original_text = gr.Textbox(
                lines=6, label="Original Text (Anonymize)"
            )
            anonymized_text = gr.Textbox(
                lines=6, label="Anonymized Text", interactive=False
            )
            button_anon = gr.Button("Anonymize")

            # Hidden state to store the entity map between the two steps.
            entity_map_state = gr.State()

            message_out = gr.Textbox(label="Status", interactive=False)

            button_anon.click(
                anonymize_fn,
                inputs=[original_text],
                outputs=[anonymized_text, entity_map_state, message_out],
            )

        # Right column: anonymized LLM response -> de-anonymized text.
        with gr.Column():
            anonymized_llm_response = gr.Textbox(
                lines=6, label="Anonymized LLM Response (Paste here)"
            )
            deanonymized_text = gr.Textbox(
                lines=6, label="De-anonymized LLM Response", interactive=False
            )
            button_deanon = gr.Button("De-anonymize")

            message_out_de = gr.Textbox(label="Status", interactive=False)

            button_deanon.click(
                deanonymize_fn,
                inputs=[anonymized_llm_response, entity_map_state],
                outputs=[deanonymized_text, message_out_de],
            )

if __name__ == "__main__":
    demo.launch()