import streamlit as st
# from gliner import GLiNER
from datasets import load_dataset
from peft import PeftModel, PeftConfig
import threading
import time
import torch
from torch.profiler import profile, record_function, ProfilerActivity
from transformers import DebertaV2ForTokenClassification, DebertaV2Tokenizer, pipeline
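
# This Streamlit app loads a fine-tuned DeBERTa PII token-classification model,
# runs it on a short sample text, and then tallies entity counts over a slice of
# the Isotonic/pii-masking-200k dataset using several worker threads.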

# Protect concurrent updates to the shared entity_set dict; several worker
# threads below tally entities into the same dictionary.
entity_set_lock = threading.Lock()

def predict_entities(text, labels, entity_set):
    """Run NER over `text` and tally the detected entity types into `entity_set`."""
    if labels == []:
        # Fine-tuned DeBERTa token-classification pipeline
        entities = recognizer(text)
        key = 'entity'
    else:
        # GLiNER-style prediction; only valid when the GLiNER model (commented out below) is loaded
        entities = model.predict_entities(text, labels, threshold=0.7)
        key = 'label'
    with entity_set_lock:
        for entity in entities:
            entity_set[entity[key]] = entity_set.get(entity[key], 0) + 1

def process_datasets(start, end, unmasked_text, sizes, index, entity_set, labels):
    """Batch rows [start, end) into ~700-character chunks and run NER on each chunk."""
    size = 0
    text = ""
    for i in range(start, end):
        if len(text) < 700:
            text = text + " " + unmasked_text[i]
        else:
            size += len(text)
            predict_entities(text, labels, entity_set)
            text = unmasked_text[i]
    # Flush the final, partially filled chunk
    if text:
        size += len(text)
        predict_entities(text, labels, entity_set)
    sizes[index] = size

print(f"Is CUDA available: {torch.cuda.is_available()}")
# Expect True on a CUDA-enabled machine
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")


# Load the fine-tuned DeBERTa PII model (the GLiNER alternative is kept commented out below)
st.write('Loading the pretrained model ...')
model_name = "CarolXia/pii-kd-deberta-v2"
# config = PeftConfig.from_pretrained(model_name)
model = DebertaV2ForTokenClassification.from_pretrained(model_name, token=st.secrets["HUGGINGFACE_TOKEN"])
if torch.cuda.is_available():
    model = model.to("cuda")
# Try quantization instead
# model = AutoModelForTokenClassification.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
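# (Note: load_in_8bit requires the bitsandbytes package to be installed.)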
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/mdeberta-v3-base", token=st.secrets["HUGGINGFACE_TOKEN"])
recognizer = pipeline("ner", model=model, tokenizer=tokenizer)
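# Depending on the transformers version, the pipeline may not pick up the model's
# CUDA device automatically. If inference lands on the CPU, passing device=0 (and
# optionally aggregation_strategy="simple" to merge word-piece tokens) is one option, e.g.:
# recognizer = pipeline("ner", model=model, tokenizer=tokenizer, device=0, aggregation_strategy="simple")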
# model_name = "urchade/gliner_multi_pii-v1"
# model = GLiNER.from_pretrained(model_name)

# Print parameter counts
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'total params: {total_params}. tunable params: {trainable_params}')



# Sample text containing PII/PHI entities
text = """
Hello Jane Doe. Your AnyCompany Financial Services, LLC credit card account
4111-0000-1111-0000 has a minimum payment of $24.53 that is due by July 31st.
Based on your autopay settings, we will withdraw your payment on the due date from
your bank account XXXXXX1111 with the routing number XXXXX0000.

Your latest statement was mailed to 100 Main Street, Anytown, WA 98121.
After your payment is received, you will receive a confirmation text message
at 206-555-0100.

If you have questions about your bill, AnyCompany Customer Service is available by
phone at 206-555-0199 or email at [email protected].
"""

# Define the labels for PII/PHI entities
labels = [
    "medical_record_number",
    "date_of_birth",
    "ssn",
    "date",
    "first_name",
    "email",
    "last_name",
    "customer_id",
    "employee_id",
    "name",
    "street_address",
    "phone_number",
    "ipv4",
    "credit_card_number",
    "license_plate",
    "address",
    "user_name",
    "device_identifier",
    "bank_routing_number",
    "date_time",
    "company_name",
    "unique_identifier",
    "biometric_identifier",
    "account_number",
    "city",
    "certificate_license_number",
    "time",
    "postcode",
    "vehicle_identifier",
    "coordinate",
    "country",
    "api_key",
    "ipv6",
    "password",
    "health_plan_beneficiary_number",
    "national_id",
    "tax_id",
    "url",
    "state",
    "swift_bic",
    "cvv",
    "pin"
]
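# Note: these labels are only consumed by the GLiNER branch of predict_entities;
# the DeBERTa pipeline path (labels == []) ignores them and reports the model's own label set.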

st.write('Trying a sample first')
st.write(text)
# Predict entities with a confidence threshold of 0.7
# entities = model.predict_entities(text, labels, threshold=0.7)
entities = recognizer(text) 

# Display the detected entities
for entity in entities:
    st.write(entity)

st.write('Processing a slice of the dataset now (first 50 rows across 5 threads) ...')
entity_set=dict()
dataset = load_dataset("Isotonic/pii-masking-200k", split="train")
unmasked_text = dataset['unmasked_text'] # Loads the entire column in memory up front to avoid I/O delay later

st.write('Number of rows in the dataset ', dataset.num_rows)
sizes = [0] * 5
start = time.time()
# Split the first 50 rows across 5 worker threads (10 rows each)
threads = [
    threading.Thread(target=process_datasets, args=(i * 10, (i + 1) * 10, unmasked_text, sizes, i, entity_set, []))
    for i in range(5)
]
# with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, record_shapes=True) as prof:
#     process_datasets(0, 50, unmasked_text, sizes, 0, entity_set, [])
for t in threads:
    t.start()

for t in threads:
    t.join()

end = time.time()
length = end - start

# Show the results; this can be altered however you like
st.write('Bytes processed ', sum(sizes))
st.write("It took", length, "seconds!")

# Display the summary
st.write('Total entities found')
for key in entity_set:
    st.write(key, ' => ', entity_set[key])

# Profiler summary; only valid when the torch.profiler block above is enabled,
# otherwise `prof` is undefined.
# st.write(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))