trojblue committed on
Commit
337fbc2
·
1 Parent(s): 12fe978

adding space files

Browse files
Files changed (3) hide show
  1. app.py +177 -0
  2. labels.txt +41 -0
  3. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import gradio as gr
3
+ from gliner import GLiNER
4
+ from cerberus import Validator
5
+
6
+ # ----------------------------------------------------------------------------
7
+ # Load model + labels
8
+ # ----------------------------------------------------------------------------
9
+
10
# Pretrained GLiNER checkpoint used by anonymize_text() to detect PII entities.
model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")

# labels.txt holds one entity label per line. Blank (or whitespace-only)
# lines are skipped so that an empty string is never passed to the model
# as a label.
with open("labels.txt", "r", encoding="utf-8") as f:
    labels = [line.strip() for line in f if line.strip()]
14
+
15
+ # ----------------------------------------------------------------------------
16
+ # Simple Cerberus validation for incoming data
17
+ # ----------------------------------------------------------------------------
18
+
19
+ # We expect a dict with at least {"text": "<some string>"}
20
# Cerberus rules for the incoming payload: a single "text" field that must
# be a string and must not be empty.
schema = {"text": {"type": "string", "empty": False}}

# Module-level validator instance, reused by validate_input().
validator = Validator(schema)
28
+
29
+
30
def validate_input(data: dict) -> str:
    """
    Check ``data`` against the module-level Cerberus ``validator`` and
    return its ``"text"`` value.

    Raises:
        ValueError: if validation fails; the message embeds the validator's
            error report.
    """
    is_valid = validator.validate(data)
    if is_valid:
        return data["text"]
    # Callers (see anonymize_fn) catch this and surface it to the user.
    raise ValueError(f"Invalid input data. Errors: {validator.errors}")
36
+
37
+ # ----------------------------------------------------------------------------
38
+ # Core anonymize / de-anonymize logic (same as before)
39
+ # ----------------------------------------------------------------------------
40
+
41
+
42
def anonymize_text(text):
    """
    Replace detected PII in ``text`` with numbered placeholders.

    1) Detect PII with the module-level ``model`` and ``labels``
       (threshold 0.2). Each prediction is read as a dict with the keys
       'label', 'text', 'start' and 'end'.
    2) Replace each entity with a placeholder ``<PII_LABEL_INDEX>``.
       LABEL is the entity label upper-cased with every run of non-word
       characters turned into '_' (so "driver's license number" becomes
       DRIVER_S_LICENSE_NUMBER). INDEX is 1-based per label, and an
       identical string seen again under the same label reuses its index.
    3) Return ``(anonymized_text, entity_map)`` where ``entity_map`` maps
       each LABEL to the list of original strings, i.e.
       ``entity_map[LABEL][INDEX - 1]`` is the text behind a placeholder.
    """
    # Process entities left to right so the text between them can be copied
    # in a single pass.
    entities = sorted(
        model.predict_entities(text, labels=labels, threshold=0.2),
        key=lambda e: e['start'],
    )

    entity_map = {}  # e.g. {'PERSON': ['Alice', 'Bob']}
    pieces = []      # joined once at the end instead of repeated +=
    next_start = 0   # first character of `text` not yet emitted

    for entity in entities:
        start_idx, end_idx = entity['start'], entity['end']

        # An entity lying entirely inside a span that was already replaced
        # would move the cursor backwards and re-copy redacted text; skip it.
        if start_idx < next_start and end_idx <= next_start:
            continue

        # Placeholders must contain only word characters in the label part,
        # otherwise a r"<PII_(\w+)_(\d+)>" style matcher cannot find them
        # (labels such as "driver's license number" contain an apostrophe).
        label = re.sub(r"\W+", "_", entity['label']).upper()
        original_text = entity['text']

        # If the same exact string is repeated, reuse its earlier index.
        originals = entity_map.setdefault(label, [])
        if original_text not in originals:
            originals.append(original_text)
        idx = originals.index(original_text) + 1

        # Copy everything before this entity, then the placeholder.
        pieces.append(text[next_start:start_idx])
        pieces.append(f"<PII_{label}_{idx}>")
        next_start = end_idx

    # Remainder of the text after the last entity
    pieces.append(text[next_start:])
    return "".join(pieces), entity_map
81
+
82
+
83
def deanonymize_text(anonymized_response, entity_map):
    """
    Replace ``<PII_LABEL_INDEX>`` placeholders in ``anonymized_response``
    with their original strings from ``entity_map``.

    ``entity_map`` maps LABEL to a list of original strings; INDEX is the
    1-based position in that list. A placeholder whose label is unknown or
    whose index is out of range is left in the text unchanged.
    """

    def replace_match(match):
        label = match.group(1)            # e.g. "PERSON"
        idx = int(match.group(2)) - 1     # 1-based index -> 0-based list index

        if label in entity_map and 0 <= idx < len(entity_map[label]):
            return entity_map[label][idx]
        return match.group(0)  # If something is off, return the placeholder as-is

    # The label group accepts anything except angle brackets rather than
    # only \w: labels are derived from free-form names and may contain
    # characters such as an apostrophe ("DRIVER'S_LICENSE_NUMBER"), which
    # \w+ would refuse to match. Greedy matching backtracks to the last
    # "_<digits>>" so the trailing index is still split off correctly.
    pattern = r"<PII_([^<>]+)_(\d+)>"
    return re.sub(pattern, replace_match, anonymized_response)
100
+
101
+ # ----------------------------------------------------------------------------
102
+ # Gradio Interface
103
+ # ----------------------------------------------------------------------------
104
+
105
def anonymize_fn(original_text):
    """
    Gradio handler for the "Anonymize" button.

    Returns a 3-tuple ``(anonymized_text, entity_map, status_message)``.
    When validation fails, the text is empty, the map is an empty dict and
    the status message carries the validation error.
    """
    try:
        # The validator works on dicts, so wrap the raw textbox value.
        user_text = validate_input({"text": original_text})
    except ValueError as err:
        # Report the problem in the status box instead of raising.
        return "", {}, f"Validation error: {str(err)}"
    else:
        anonymized, entities = anonymize_text(user_text)
        return anonymized, entities, "Anonymized successfully!"
116
+
117
+
118
def deanonymize_fn(anonymized_llm_response, entity_map):
    """
    Gradio handler for the "De-anonymize" button.

    Returns a 2-tuple ``(deanonymized_text, status_message)``. The text is
    empty when the response is blank or when no entity map is available;
    the blank-response check takes precedence.
    """
    has_response = bool(anonymized_llm_response.strip())

    if has_response and entity_map:
        restored = deanonymize_text(anonymized_llm_response, entity_map)
        return restored, "De-anonymized successfully!"

    if not has_response:
        return "", "Please provide an anonymized LLM response."
    return "", "No entity map found; anonymize some text first."
126
+
127
+
128
# Markdown shown at the top of the page.
md_text = """# Anonymizing LLM Prompts

Paste text into "Original Text" section to remove sensitive information, using `gliner_multi_pii-v1` for recognition.

The demo is adapted from [Elara](https://github.com/amanvirparhar/elara) by amanvirparhar. If you like this one, give the original a star!
"""

with gr.Blocks() as demo:
    gr.Markdown(md_text)

    with gr.Row():
        # Left column: raw text in, anonymized text out.
        with gr.Column():
            original_text = gr.Textbox(label="Original Text (Anonymize)", lines=6)
            anonymized_text = gr.Textbox(
                label="Anonymized Text",
                lines=6,
                interactive=False,
            )
            button_anon = gr.Button("Anonymize")

            # Non-visible state carrying the entity map from the anonymize
            # step over to the de-anonymize step.
            entity_map_state = gr.State()

            message_out = gr.Textbox(label="Status", interactive=False)

        # Right column: anonymized LLM answer in, restored answer out.
        with gr.Column():
            anonymized_llm_response = gr.Textbox(
                label="Anonymized LLM Response (Paste here)",
                lines=6,
            )
            deanonymized_text = gr.Textbox(
                label="De-anonymized LLM Response",
                lines=6,
                interactive=False,
            )
            button_deanon = gr.Button("De-anonymize")

            message_out_de = gr.Textbox(label="Status", interactive=False)

    # Event wiring, registered once every component exists.
    button_anon.click(
        fn=anonymize_fn,
        inputs=[original_text],
        outputs=[anonymized_text, entity_map_state, message_out],
    )
    button_deanon.click(
        fn=deanonymize_fn,
        inputs=[anonymized_llm_response, entity_map_state],
        outputs=[deanonymized_text, message_out_de],
    )

if __name__ == "__main__":
    demo.launch()
labels.txt ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ person
2
+ organization
3
+ phone number
4
+ address
5
+ email
6
+ ip address
7
+ username
8
+ date of birth
9
+ passport number
10
+ credit card number
11
+ social security number
12
+ health insurance id number
13
+ mobile phone number
14
+ bank account number
15
+ driver's license number
16
+ tax identification number
17
+ identity card number
18
+ national id number
19
+ registration number
20
+ student id number
21
+ insurance number
22
+ serial number
23
+ fax number
24
+ visa number
25
+ identity document number
26
+ transaction number
27
+ credit card brand
28
+ license plate number
29
+ vehicle registration number
30
+ landline phone number
31
+ blood type
32
+ cvv
33
+ cvc
34
+ digital signature
35
+ postal code
36
+ insurance company
37
+ passport expiration date
38
+ medication
39
+ medical condition
40
+ credit card expiration date
41
+ api key
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ gliner
3
+ cerberus