CarolXia committed on
Commit
0dd2059
·
1 Parent(s): 6a05a51

Copied from pii-classification

Browse files
Files changed (3) hide show
  1. README.md +4 -5
  2. app.py +168 -0
  3. requirements.txt +9 -0
README.md CHANGED
@@ -1,14 +1,13 @@
1
  ---
2
- title: Pii Classification Kd
3
- emoji: πŸ‘
4
- colorFrom: indigo
5
- colorTo: red
6
  sdk: streamlit
7
  sdk_version: 1.40.2
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
- short_description: KD classification
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Pii Classification
3
+ emoji: 🚀
4
+ colorFrom: red
5
+ colorTo: gray
6
  sdk: streamlit
7
  sdk_version: 1.40.2
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ # from gliner import GLiNER
3
+ from datasets import load_dataset
4
+ from peft import PeftModel, PeftConfig
5
+ import threading
6
+ import time
7
+ import torch
8
+ from torch.profiler import profile, record_function, ProfilerActivity
9
+ from transformers import DebertaV2ForTokenClassification, DebertaV2Tokenizer, pipeline
10
+
11
def predict_entities(text, labels, entity_set):
    """Tally the entity types detected in ``text`` into ``entity_set``.

    Args:
        text: The text to scan for entities.
        labels: Entity labels forwarded to ``model.predict_entities`` together
            with a 0.7 confidence threshold.  When empty (``[]``, ``()`` or
            ``None``) the module-level ``recognizer`` pipeline is used instead
            and ``labels`` is ignored.
        entity_set: Dict mapping entity type to occurrence count.  It is
            updated in place; nothing is returned.
    """
    if not labels:
        # Pipeline results expose the predicted tag under the 'entity' key.
        entities = recognizer(text)
        tag_key = 'entity'
    else:
        # Use Gliner labels
        # NOTE(review): this path needs the module-level `model` to provide a
        # GLiNER-style predict_entities(); confirm which model is loaded
        # before passing a non-empty label list.
        entities = model.predict_entities(text, labels, threshold=0.7)
        tag_key = 'label'

    for entity in entities:
        tag = entity[tag_key]
        entity_set[tag] = entity_set.get(tag, 0) + 1
27
+
28
def process_datasets(start, end, unmasked_text, sizes, index, entity_set, labels,
                     chunk_chars=700):
    """Run entity prediction over rows ``start`` .. ``end - 1`` in chunks.

    Consecutive rows are joined with a single space until the buffer holds at
    least ``chunk_chars`` characters; the buffer is then handed to
    ``predict_entities`` and a new buffer is started with the current row.

    Args:
        start: Index of the first row to process (inclusive).
        end: Index one past the last row to process (exclusive).
        unmasked_text: Indexable sequence of row strings.
        sizes: List that receives the number of characters processed, written
            to ``sizes[index]``.
        index: Slot in ``sizes`` owned by this call.
        entity_set: Dict of entity counts, updated in place by
            ``predict_entities``.
        labels: Label list forwarded unchanged to ``predict_entities``.
        chunk_chars: Buffer length at which a chunk is flushed.  Defaults to
            700, the value previously hard-coded.
    """
    size = 0
    text = ""
    for i in range(start, end):
        if len(text) < chunk_chars:
            text = text + " " + unmasked_text[i]
            continue
        size += len(text)
        predict_entities(text, labels, entity_set)
        text = unmasked_text[i]

    # Flush whatever is still buffered; without this the rows accumulated
    # after the last full chunk would never be analysed or counted.
    if text:
        size += len(text)
        predict_entities(text, labels, entity_set)

    sizes[index] = size
40
+
41
# Report the accelerator once so the logs show where inference will run.
cuda_available = torch.cuda.is_available()
print(f"Is CUDA available: {cuda_available}")
if cuda_available:
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

# Load the fine-tuned DeBERTa-v2 token-classification model and its tokenizer.
st.write('Loading the pretrained model ...')
model_name = "CarolXia/pii-kd-deberta-v2"
hf_token = st.secrets["HUGGINGFACE_TOKEN"]
model = DebertaV2ForTokenClassification.from_pretrained(model_name, token=hf_token)
tokenizer = DebertaV2Tokenizer.from_pretrained(model_name, token=hf_token)

# print weights
pytorch_total_params = sum(p.numel() for p in model.parameters())
torch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'total params: {pytorch_total_params}. tunable params: {torch_total_params}')

# Move the weights first and hand the same device to the pipeline, so the
# pipeline places its input tensors on the device that holds the model.
# Building the pipeline without `device` and moving the model afterwards can
# leave inputs on the CPU while the weights sit on the GPU.
device = torch.device("cuda" if cuda_available else "cpu")
if cuda_available:
    model = model.to(device)
recognizer = pipeline("ner", model=model, tokenizer=tokenizer, device=device)
66
+
67
# Sample text containing PII/PHI entities (name, card number, bank details,
# postal address, phone numbers and an email address) used as a smoke test.
text = """
Hello Jane Doe. Your AnyCompany Financial Services, LLC credit card account
4111-0000-1111-0000 has a minimum payment of $24.53 that is due by July 31st.
Based on your autopay settings, we will withdraw your payment on the due date from
your bank account XXXXXX1111 with the routing number XXXXX0000.

Your latest statement was mailed to 100 Main Street, Anytown, WA 98121.
After your payment is received, you will receive a confirmation text message
at 206-555-0100.

If you have questions about your bill, AnyCompany Customer Service is available by
phone at 206-555-0199 or email at support@anycompany.com.
"""
81
+
82
# Entity labels for PII/PHI detection.  The order is kept exactly as
# originally declared.
labels = [
    "medical_record_number", "date_of_birth", "ssn", "date",
    "first_name", "email", "last_name", "customer_id",
    "employee_id", "name", "street_address", "phone_number",
    "ipv4", "credit_card_number", "license_plate", "address",
    "user_name", "device_identifier", "bank_routing_number", "date_time",
    "company_name", "unique_identifier", "biometric_identifier", "account_number",
    "city", "certificate_license_number", "time", "postcode",
    "vehicle_identifier", "coordinate", "country", "api_key",
    "ipv6", "password", "health_plan_beneficiary_number", "national_id",
    "tax_id", "url", "state", "swift_bic",
    "cvv", "pin",
]
127
+
128
# Smoke test: show the sample text, run the recognizer over it once, and
# write every detected entity to the page.
st.write('Trying a sample first')
st.write(text)

entities = recognizer(text)
for entity in entities:
    st.write(entity)
137
+
138
st.write('Processing the full dataset now ...')
entity_set = {}
dataset = load_dataset("Isotonic/pii-masking-200k", split="train")
# Load the entire column into memory up front so the timed loop below does
# not pay for dataset I/O.
unmasked_text = dataset['unmasked_text']

st.write('Size of the dataset ', dataset.num_rows)
sizes = [0] * 2

# Request CUDA tracing only when a GPU is actually present.
activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# wall-clock adjustments.
start = time.perf_counter()
with profile(activities=activities, profile_memory=True, record_shapes=True) as prof:
    # Only rows 0..49 are processed, in a single call on this thread.
    process_datasets(0, 50, unmasked_text, sizes, 0, entity_set, [])
end = time.perf_counter()
length = end - start

# Show the results : this can be altered however you like
# The reported size is the sum of len() over the processed strings.
st.write('Bytes processed ', sum(sizes))
st.write("It took", length, "seconds!")

# Display the summary
st.write('Total entities found')
for key, count in entity_set.items():
    st.write(key, ' => ', count)

st.write(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ datasets
2
+ gliner
3
+ peft
4
+ torch>=2.0.0
5
+ transformers>=4.38.2
6
+ huggingface_hub>=0.21.4
7
+ onnxruntime
8
+ sentencepiece
9
+ tqdm