CarolXia committed on
Commit
e50bd12
·
1 Parent(s): 5f6c9e2

PII classification

Browse files
Files changed (2) hide show
  1. app.py +181 -0
  2. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ # from gliner import GLiNER
3
+ from datasets import load_dataset
4
+ from peft import PeftModel, PeftConfig
5
+ import threading
6
+ import time
7
+ import torch
8
+ from torch.profiler import profile, record_function, ProfilerActivity
9
+ from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
10
+
11
def predict_entities(text, labels, entity_set):
    """Count the entity types detected in ``text`` and add them to ``entity_set``.

    Args:
        text: The string to scan.
        labels: Candidate label names. When this is exactly an empty list the
            module-level ``recognizer`` pipeline is used and results are keyed
            by their ``'entity'`` field; otherwise ``model.predict_entities``
            is called with a 0.7 threshold and results are keyed by ``'label'``.
        entity_set: Mapping of entity type -> occurrence count, updated in
            place.

    Returns:
        None. The tallies are written into ``entity_set``.
    """
    if labels == []:
        detections = recognizer(text)
        type_field = 'entity'
    else:
        # Use Gliner labels
        # NOTE(review): the GLiNER load is commented out in this file, so
        # ``model`` is the merged token-classification model here -- confirm
        # it exposes ``predict_entities`` before passing non-empty labels.
        detections = model.predict_entities(text, labels, threshold = 0.7)
        type_field = 'label'

    for detection in detections:
        kind = detection[type_field]
        entity_set[kind] = entity_set.get(kind, 0) + 1
27
+
28
def process_datasets(start, end, unmasked_text, sizes, index, entity_set, labels, chunk_chars=700):
    """Run entity detection over ``unmasked_text[start:end]`` in batched chunks.

    Consecutive rows are joined with single spaces until the buffer holds at
    least ``chunk_chars`` characters; the buffer is then passed to
    ``predict_entities`` and restarted from the current row. Whatever is still
    buffered once the range is exhausted is processed as a final chunk, so no
    row is silently dropped.

    Args:
        start: Index of the first row to process (inclusive).
        end: Index one past the last row to process (exclusive).
        unmasked_text: Indexable collection of strings.
        sizes: List that receives this call's character total at ``index``.
        index: Slot in ``sizes`` to write to.
        entity_set: Mapping of entity type -> count, updated in place by
            ``predict_entities``.
        labels: Forwarded unchanged to ``predict_entities``.
        chunk_chars: Buffer length at which a chunk is flushed. Defaults to
            700, the value previously hard-coded.

    Returns:
        None. ``sizes[index]`` is set to the number of characters submitted
        for prediction (0 when the range is empty).
    """
    size = 0
    text = ""
    for i in range(start, end):
        if len(text) < chunk_chars:
            # Avoid a leading space on the first row of a chunk.
            text = text + " " + unmasked_text[i] if text else unmasked_text[i]
        else:
            size += len(text)
            predict_entities(text, labels, entity_set)

            # The current row starts the next chunk.
            text = unmasked_text[i]

    # Flush the tail: the loop only submits a chunk when it sees the *next*
    # row, so the last buffer would otherwise never be processed or counted.
    if text:
        size += len(text)
        predict_entities(text, labels, entity_set)

    sizes[index] = size
40
+
41
# Select the compute device once; the model and the pipeline below both use it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Is CUDA available: {torch.cuda.is_available()}")
if device.type == 'cuda':
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

# Load the base token-classification model and apply the LoRA adapter on top.
st.write('Loading the pretrained model ...')
base_model_name = "iiiorg/piiranha-v1-detect-personal-information"
adapter_model_name = "CarolXia/xia-lora-deberta-v2"
base_model = AutoModelForTokenClassification.from_pretrained(base_model_name)
base_model = base_model.to(device)
# The adapter repository is fetched with the token stored in Streamlit secrets.
adapter_model = PeftModel.from_pretrained(base_model, adapter_model_name, token=st.secrets["HUGGINGFACE_TOKEN"])
# TODO: try 8-bit quantization (device_map="auto", load_in_8bit=True) instead.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Fold the adapter weights into the base model and keep a copy on disk.
model = adapter_model.merge_and_unload()
model.save_pretrained("./merged_model")
model = model.to(device)

# Pass the device explicitly so the pipeline places its inputs on the same
# device as the model instead of relying on the library default.
recognizer = pipeline("ner", model=model, tokenizer=tokenizer, device=device)
# NOTE: the GLiNER alternative ("urchade/gliner_multi_pii-v1") is not loaded;
# predict_entities() only reaches for it when given a non-empty label list.

# Report total vs. trainable parameter counts of the merged model.
pytorch_total_params = sum(p.numel() for p in model.parameters())
torch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'total params: {pytorch_total_params}. tunable params: {torch_total_params}')
74
+
75
+
76
+
77
# Sample text containing PII/PHI entities: a name, card and account numbers,
# a postal address, phone numbers and an email address.
# The email address had been replaced by an "[email protected]" placeholder
# (an obfuscation artifact); a real address is restored so the sample
# actually exercises email detection.
text = """
Hello Jane Doe. Your AnyCompany Financial Services, LLC credit card account
4111-0000-1111-0000 has a minimum payment of $24.53 that is due by July 31st.
Based on your autopay settings, we will withdraw your payment on the due date from
your bank account XXXXXX1111 with the routing number XXXXX0000.

Your latest statement was mailed to 100 Main Street, Anytown, WA 98121.
After your payment is received, you will receive a confirmation text message
at 206-555-0100.

If you have questions about your bill, AnyCompany Customer Service is available by
phone at 206-555-0199 or email at support@anycompany.com.
"""
91
+
92
# Candidate PII/PHI label names. The list is only consulted when a non-empty
# label list is handed to predict_entities(); order is kept as authored.
labels = [
    "medical_record_number", "date_of_birth", "ssn", "date",
    "first_name", "email", "last_name", "customer_id",
    "employee_id", "name", "street_address", "phone_number",
    "ipv4", "credit_card_number", "license_plate", "address",
    "user_name", "device_identifier", "bank_routing_number", "date_time",
    "company_name", "unique_identifier", "biometric_identifier", "account_number",
    "city", "certificate_license_number", "time", "postcode",
    "vehicle_identifier", "coordinate", "country", "api_key",
    "ipv6", "password", "health_plan_beneficiary_number", "national_id",
    "tax_id", "url", "state", "swift_bic",
    "cvv", "pin",
]
137
+
138
st.write('Trying a sample first')
st.write(text)
# Run the token-classification pipeline on the sample text.
entities = recognizer(text)

# Display the detected entities
for entity in entities:
    st.write(entity)

st.write('Processing the full dataset now ...')
entity_set = {}
dataset = load_dataset("Isotonic/pii-masking-200k", split="train")
dataset = dataset.select(range(50))
# Use the device chosen at start-up; a hard-coded "cuda" fails on CPU-only hosts.
dataset.set_format("torch", device=device)
# Load the entire column in memory up front to avoid I/O delay during timing.
unmasked_text = dataset['unmasked_text']

st.write('Size of the dataset ', dataset.num_rows)
# One slot per worker; only slot 0 is filled by the single-threaded run below.
sizes = [0] * 2
# perf_counter() is monotonic, so the measured interval cannot be skewed by
# wall-clock adjustments.
start = time.perf_counter()
# NOTE: a two-thread split (rows 0-25 / 25-50) was tried previously; if it is
# revived, updates to the shared entity_set should be synchronized.
process_datasets(0, dataset.num_rows, unmasked_text, sizes, 0, entity_set, [])
end = time.perf_counter()
length = end - start

# Show the results : this can be altered however you like
# (the "bytes" figure is the character count returned via `sizes`).
st.write('Bytes processed ', sum(sizes))
st.write("It took", length, "seconds!")

# Display the summary
st.write('Total entities found')
for key in entity_set:
    st.write(key, ' => ', entity_set[key])
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ datasets
2
+ gliner
3
+ peft
4
+ torch>=2.0.0
5
+ transformers>=4.38.2
6
+ huggingface_hub>=0.21.4
7
+ onnxruntime
8
+ sentencepiece
9
+ tqdm