import streamlit as st
# from gliner import GLiNER
from datasets import load_dataset
from peft import PeftModel, PeftConfig
import threading
import time
import torch
from torch.profiler import profile, record_function, ProfilerActivity
from transformers import DebertaV2ForTokenClassification, DebertaV2Tokenizer, pipeline
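
# This Streamlit app loads a fine-tuned DeBERTa PII token-classification model,
# runs it on a short sample text, and then tallies entity counts over a slice of
# the Isotonic/pii-masking-200k dataset using several worker threads.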

# Protect concurrent updates to the shared entity_set dict; several worker
# threads below tally entities into the same dictionary.
entity_set_lock = threading.Lock()

def predict_entities(text, labels, entity_set):
    """Run NER over `text` and tally the detected entity types into `entity_set`."""
    if labels == []:
        # Fine-tuned DeBERTa token-classification pipeline
        entities = recognizer(text)
        key = 'entity'
    else:
        # GLiNER-style prediction; only valid when the GLiNER model (commented out below) is loaded
        entities = model.predict_entities(text, labels, threshold=0.7)
        key = 'label'
    with entity_set_lock:
        for entity in entities:
            entity_set[entity[key]] = entity_set.get(entity[key], 0) + 1

def process_datasets(start, end, unmasked_text, sizes, index, entity_set, labels):
    """Batch rows [start, end) into ~700-character chunks and run NER on each chunk."""
    size = 0
    text = ""
    for i in range(start, end):
        if len(text) < 700:
            text = text + " " + unmasked_text[i]
        else:
            size += len(text)
            predict_entities(text, labels, entity_set)
            text = unmasked_text[i]
    # Flush the final, partially filled chunk
    if text:
        size += len(text)
        predict_entities(text, labels, entity_set)
    sizes[index] = size

print(f"Is CUDA available: {torch.cuda.is_available()}")
# Expect True on a CUDA-enabled machine
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")


# Load the fine-tuned DeBERTa PII model (the GLiNER alternative is kept commented out below)
st.write('Loading the pretrained model ...')
model_name = "CarolXia/pii-kd-deberta-v2"
# config = PeftConfig.from_pretrained(model_name)
model = DebertaV2ForTokenClassification.from_pretrained(model_name, token=st.secrets["HUGGINGFACE_TOKEN"])
if torch.cuda.is_available():
    model = model.to("cuda")
# Try quantization instead
# model = AutoModelForTokenClassification.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
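# (Note: load_in_8bit requires the bitsandbytes package to be installed.)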
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/mdeberta-v3-base", token=st.secrets["HUGGINGFACE_TOKEN"])
recognizer = pipeline("ner", model=model, tokenizer=tokenizer)
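# Depending on the transformers version, the pipeline may not pick up the model's
# CUDA device automatically. If inference lands on the CPU, passing device=0 (and
# optionally aggregation_strategy="simple" to merge word-piece tokens) is one option, e.g.:
# recognizer = pipeline("ner", model=model, tokenizer=tokenizer, device=0, aggregation_strategy="simple")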
# model_name = "urchade/gliner_multi_pii-v1"
# model = GLiNER.from_pretrained(model_name)

# Print parameter counts
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'total params: {total_params}. tunable params: {trainable_params}')



# Sample text containing PII/PHI entities
text = """
Hello Jane Doe. Your AnyCompany Financial Services, LLC credit card account
4111-0000-1111-0000 has a minimum payment of $24.53 that is due by July 31st.
Based on your autopay settings, we will withdraw your payment on the due date from
your bank account XXXXXX1111 with the routing number XXXXX0000.

Your latest statement was mailed to 100 Main Street, Anytown, WA 98121.
After your payment is received, you will receive a confirmation text message
at 206-555-0100.

If you have questions about your bill, AnyCompany Customer Service is available by
phone at 206-555-0199 or email at [email protected].
"""

# Define the labels for PII/PHI entities
labels = [
    "medical_record_number",
    "date_of_birth",
    "ssn",
    "date",
    "first_name",
    "email",
    "last_name",
    "customer_id",
    "employee_id",
    "name",
    "street_address",
    "phone_number",
    "ipv4",
    "credit_card_number",
    "license_plate",
    "address",
    "user_name",
    "device_identifier",
    "bank_routing_number",
    "date_time",
    "company_name",
    "unique_identifier",
    "biometric_identifier",
    "account_number",
    "city",
    "certificate_license_number",
    "time",
    "postcode",
    "vehicle_identifier",
    "coordinate",
    "country",
    "api_key",
    "ipv6",
    "password",
    "health_plan_beneficiary_number",
    "national_id",
    "tax_id",
    "url",
    "state",
    "swift_bic",
    "cvv",
    "pin"
]
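# Note: these labels are only consumed by the GLiNER branch of predict_entities;
# the DeBERTa pipeline path (labels == []) ignores them and reports the model's own label set.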

st.write('Trying a sample first')
st.write(text)
# Predict entities with a confidence threshold of 0.7
# entities = model.predict_entities(text, labels, threshold=0.7)
entities = recognizer(text) 

# Display the detected entities
for entity in entities:
    st.write(entity)

st.write('Processing a slice of the dataset now (first 50 rows across 5 threads) ...')
entity_set=dict()
dataset = load_dataset("Isotonic/pii-masking-200k", split="train")
unmasked_text = dataset['unmasked_text'] # Loads the entire column in memory up front to avoid I/O delay later

st.write('Number of rows in the dataset ', dataset.num_rows)
sizes = [0] * 5
start = time.time()
# Split the first 50 rows across 5 worker threads (10 rows each)
threads = [
    threading.Thread(target=process_datasets, args=(i * 10, (i + 1) * 10, unmasked_text, sizes, i, entity_set, []))
    for i in range(5)
]
# with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, record_shapes=True) as prof:
#     process_datasets(0, 50, unmasked_text, sizes, 0, entity_set, [])
for t in threads:
    t.start()

for t in threads:
    t.join()

end = time.time()
length = end - start

# Show the results; this can be altered however you like
st.write('Bytes processed ', sum(sizes))
st.write("It took", length, "seconds!")

# Display the summary
st.write('Total entities found')
for key in entity_set:
    st.write(key, ' => ', entity_set[key])

# Profiler summary; only valid when the torch.profiler block above is enabled,
# otherwise `prof` is undefined.
# st.write(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))