---
license: mit
language:
- en
metrics:
- accuracy
tags:
- IT
- helpdesk
- classifier
- nlp
- natural-language
- classification
---
|
<details> |
|
<summary> |
|
TinyBERT based model |
|
</summary> |
|
|
|
### Fetching the model |
|
|
|
```python |
|
import torch |
|
from torch.utils.data import DataLoader, Dataset |
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW |
|
from sklearn.model_selection import train_test_split |
|
import pandas as pd |
|
from tqdm import tqdm |
|
|
|
# Load the TinyBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')
model = AutoModelForSequenceClassification.from_pretrained('huawei-noah/TinyBERT_General_4L_312D', num_labels=2)

# Pick the device once so the checkpoint and the model end up in the same place
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fetch the state dict holding the fine-tuned weights.
# map_location remaps the stored tensors onto `device`, so the same line
# works on GPU machines and on CPU-only machines.
state_dict = torch.hub.load_state_dict_from_url(
    "https://huggingface.co/KameronB/SITCC-Incident-Request-Classifier/resolve/main/tiny_bert_model.bin",
    map_location=device,
)

# Apply the fine-tuned weights on top of the pre-trained base model
model.load_state_dict(state_dict)

model = model.to(device)
|
|
|
``` |
|
|
|
|
|
### Using the model |
|
|
|
```python |
|
def predict_description(model, tokenizer, text, max_length=512):
    """Classify a single ticket description.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        text: The ticket description to classify.
        max_length: Maximum number of tokens; longer inputs are truncated.

    Returns:
        A tuple ``(predicted_class_id, probabilities)``. ``probabilities`` is
        the softmax over the logits as a nested list (one row per input).
    """
    model.eval()  # Set the model to evaluation mode

    # Ensure model is on the correct device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Encode the input text. A single sequence needs no padding, so the
    # forward pass covers only the real tokens instead of always max_length.
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )

    # Move tensors to the correct device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Make prediction (no gradients are needed for inference)
    with torch.no_grad():
        logits = model(**inputs).logits

    probabilities = torch.softmax(logits, dim=-1)
    predicted_class_id = torch.argmax(probabilities, dim=-1).item()

    return predicted_class_id, probabilities.cpu().tolist()
|
|
|
|
|
|
|
# Example usage

tickets = [
    """Inquiry about the possibility of customizing Docker to better meet department-specific needs.
Gathered requirements for desired customizations.""",
    """We've encountered a recurring problem with DEVEnv shutting down anytime we try to save documents.
I looked over the error logs for any clues about what's going wrong. I'm passing this on to the team responsible for software upkeep.""",
]

# Class id -> human readable label.
# NOTE(review): the RoBERTa example names class 1 'REQUEST' - confirm which label name is intended.
labels = ['INCIDENT', 'TASK']

for row in tickets:
    prediction, probabilities = predict_description(model, tokenizer, row)
    prediction = labels[prediction]
    # Each ticket is a plain string, so print it directly
    print(f"{prediction} ({probabilities}) <== {row}")
|
``` |
|
|
|
### Additional fine-tuning |
|
|
|
```python |
|
|
|
# The dataset class |
|
class TextDataset(Dataset):
    """Map-style dataset that tokenizes ticket descriptions on access.

    Args:
        descriptions: Sequence of ticket texts.
        labels: Sequence of integer class ids, aligned with ``descriptions``.
        tokenizer: Tokenizer used to encode each text.
        max_len: Every sample is padded/truncated to exactly this many tokens,
            so samples can be stacked into batches.
    """

    def __init__(self, descriptions, labels, tokenizer, max_len):
        self.descriptions = descriptions
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, idx):
        """Return the encoded sample ``idx`` as a dict of ``torch.long`` tensors."""
        text = self.descriptions[idx]
        # Call the tokenizer directly; encode_plus is deprecated in its favor
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
        )
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }
|
|
|
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# load the data
# The CSV must provide a 'content' column (ticket text) and a 'type' column (ticket class).
# Forward slashes keep the relative path usable on Windows, Linux and macOS.
df = pd.read_csv('../data/final_data.csv')
df['label'] = df['type'].astype('category').cat.codes  # Convert labels to category codes if they aren't already

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# create the training and validation sets and data loaders
print("cuda is available" if torch.cuda.is_available() else "cuda is unavailable: running on cpu")

# Split the data into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.15)

# Create PyTorch datasets
train_dataset = TextDataset(train_df['content'].tolist(), train_df['label'].tolist(), tokenizer, max_len=512)
val_dataset = TextDataset(val_df['content'].tolist(), val_df['label'].tolist(), tokenizer, max_len=512)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Train the model

# Only these parameters will be trained; every other parameter is frozen.
# Customize this list to choose which layers you want to retrain.
training_layers = [
    "bert.encoder.layer.3.output.dense.weight",
    "bert.encoder.layer.3.output.dense.bias",
    "bert.encoder.layer.3.output.LayerNorm.weight",
    "bert.encoder.layer.3.output.LayerNorm.bias",
    "bert.pooler.dense.weight",
    "bert.pooler.dense.bias",
    "classifier.weight",
    "classifier.bias",
]

for name, param in model.named_parameters():
    if name not in training_layers:  # Freeze every parameter that is not listed above
        param.requires_grad = False

# Training setup
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 2

for epoch in range(epochs):
    model.train()
    # Keep a handle on the progress bar: a description passed to tqdm() is
    # formatted only once, so it has to be refreshed explicitly on every step.
    train_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")
    for batch in train_bar:
        batch = {k: v.to(model.device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_bar.set_description(f"Training Loss: {loss.item():.4f}")

    model.eval()
    # Count correct predictions over all samples rather than averaging
    # per-batch accuracies, which would over-weight a short final batch.
    correct = 0
    seen = 0
    val_bar = tqdm(val_loader, desc=f"Epoch {epoch + 1}/{epochs}")
    for batch in val_bar:
        batch = {k: v.to(model.device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct += (predictions == batch['labels']).sum().item()
        seen += batch['labels'].size(0)
        val_bar.set_description(f"Validation Accuracy: {correct / seen:.4f}")

    # An empty validation set yields nan instead of a ZeroDivisionError
    accuracy = correct / seen if seen else float('nan')
    print(f"Validation Accuracy: {accuracy}")
|
``` |
|
</details> |
|
|
|
|
|
<details> |
|
<summary> |
|
DistilBERT based model |
|
</summary> |
|
|
|
### Fetching the model |
|
|
|
```python |
|
import torch |
|
from torch.utils.data import DataLoader, Dataset |
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW |
|
from sklearn.model_selection import train_test_split |
|
import pandas as pd |
|
from tqdm import tqdm |
|
|
|
# Load the DistilBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-uncased', num_labels=2)

# Pick the device once so the checkpoint and the model end up in the same place
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fetch the state dict holding the fine-tuned weights.
# map_location remaps the stored tensors onto `device`, so the same line
# works on GPU machines and on CPU-only machines.
state_dict = torch.hub.load_state_dict_from_url(
    "https://huggingface.co/KameronB/SITCC-Incident-Request-Classifier/resolve/main/distilbert_1.bin",
    map_location=device,
)

# Apply the fine-tuned weights on top of the pre-trained base model
model.load_state_dict(state_dict)

model = model.to(device)
|
|
|
``` |
|
|
|
|
|
### Using the model |
|
|
|
```python |
|
def predict_description(model, tokenizer, text, max_length=512):
    """Classify a single ticket description.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        text: The ticket description to classify.
        max_length: Maximum number of tokens; longer inputs are truncated.

    Returns:
        A tuple ``(predicted_class_id, probabilities)``. ``probabilities`` is
        the softmax over the logits as a nested list (one row per input).
    """
    model.eval()  # Set the model to evaluation mode

    # Ensure model is on the correct device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Encode the input text. A single sequence needs no padding, so the
    # forward pass covers only the real tokens instead of always max_length.
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )

    # Move tensors to the correct device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Make prediction (no gradients are needed for inference)
    with torch.no_grad():
        logits = model(**inputs).logits

    probabilities = torch.softmax(logits, dim=-1)
    predicted_class_id = torch.argmax(probabilities, dim=-1).item()

    return predicted_class_id, probabilities.cpu().tolist()
|
|
|
|
|
|
|
# Example usage

tickets = [
    """Inquiry about the possibility of customizing Docker to better meet department-specific needs.
Gathered requirements for desired customizations.""",
    """We've encountered a recurring problem with DEVEnv shutting down anytime we try to save documents.
I looked over the error logs for any clues about what's going wrong. I'm passing this on to the team responsible for software upkeep.""",
]

# Class id -> human readable label.
# NOTE(review): the RoBERTa example names class 1 'REQUEST' - confirm which label name is intended.
labels = ['INCIDENT', 'TASK']

for row in tickets:
    prediction, probabilities = predict_description(model, tokenizer, row)
    prediction = labels[prediction]
    # Each ticket is a plain string, so print it directly
    print(f"{prediction} ({probabilities}) <== {row}")
|
``` |
|
|
|
### Additional fine-tuning |
|
|
|
```python |
|
|
|
# The dataset class |
|
class TextDataset(Dataset):
    """Map-style dataset that tokenizes ticket descriptions on access.

    Args:
        descriptions: Sequence of ticket texts.
        labels: Sequence of integer class ids, aligned with ``descriptions``.
        tokenizer: Tokenizer used to encode each text.
        max_len: Every sample is padded/truncated to exactly this many tokens,
            so samples can be stacked into batches.
    """

    def __init__(self, descriptions, labels, tokenizer, max_len):
        self.descriptions = descriptions
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, idx):
        """Return the encoded sample ``idx`` as a dict of ``torch.long`` tensors."""
        text = self.descriptions[idx]
        # Call the tokenizer directly; encode_plus is deprecated in its favor
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
        )
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }
|
|
|
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# load the data
# The CSV must provide a 'content' column (ticket text) and a 'type' column (ticket class).
# Forward slashes keep the relative path usable on Windows, Linux and macOS.
df = pd.read_csv('../data/final_data.csv')
df['label'] = df['type'].astype('category').cat.codes  # Convert labels to category codes if they aren't already

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# create the training and validation sets and data loaders
print("cuda is available" if torch.cuda.is_available() else "cuda is unavailable: running on cpu")

# Split the data into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.15)

# Create PyTorch datasets
train_dataset = TextDataset(train_df['content'].tolist(), train_df['label'].tolist(), tokenizer, max_len=512)
val_dataset = TextDataset(val_df['content'].tolist(), val_df['label'].tolist(), tokenizer, max_len=512)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Train the model

# Only these parameters will be trained; every other parameter is frozen.
# Customize this list to choose which layers you want to retrain.
training_layers = [
    "distilbert.transformer.layer.5.ffn.lin2.weight",
    "distilbert.transformer.layer.5.ffn.lin2.bias",
    "distilbert.transformer.layer.5.output_layer_norm.weight",
    "distilbert.transformer.layer.5.output_layer_norm.bias",
    "pre_classifier.weight",
    "pre_classifier.bias",
    "classifier.weight",
    "classifier.bias",
]

for name, param in model.named_parameters():
    if name not in training_layers:  # Freeze every parameter that is not listed above
        param.requires_grad = False

# if the model is not already on gpu, make sure to train it on gpu if available
# model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

# Training setup
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 2

for epoch in range(epochs):
    model.train()
    # Keep a handle on the progress bar: a description passed to tqdm() is
    # formatted only once, so it has to be refreshed explicitly on every step.
    train_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")
    for batch in train_bar:
        batch = {k: v.to(model.device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_bar.set_description(f"Training Loss: {loss.item():.4f}")

    model.eval()
    # Count correct predictions over all samples rather than averaging
    # per-batch accuracies, which would over-weight a short final batch.
    correct = 0
    seen = 0
    val_bar = tqdm(val_loader, desc=f"Epoch {epoch + 1}/{epochs}")
    for batch in val_bar:
        batch = {k: v.to(model.device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct += (predictions == batch['labels']).sum().item()
        seen += batch['labels'].size(0)
        val_bar.set_description(f"Validation Accuracy: {correct / seen:.4f}")

    # An empty validation set yields nan instead of a ZeroDivisionError
    accuracy = correct / seen if seen else float('nan')
    print(f"Validation Accuracy: {accuracy}")
|
``` |
|
</details> |
|
|
|
<details> |
|
<summary>RoBERTa based model</summary>
|
|
|
### Base model |
|
```python |
|
import torch |
|
from torch.utils.data import DataLoader, Dataset |
|
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW |
|
from sklearn.model_selection import train_test_split |
|
import pandas as pd |
|
|
|
# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Load RoBERTa pre-trained model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Pick the device once so the checkpoint and the model end up in the same place
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fetch the state dict holding the fine-tuned weights.
# map_location remaps the stored tensors onto `device`, so the same line
# works on GPU machines and on CPU-only machines.
state_dict = torch.hub.load_state_dict_from_url(
    "https://huggingface.co/KameronB/SITCC-Incident-Request-Classifier/resolve/main/pytorch_model.bin",
    map_location=device,
)

# Apply the fine-tuned weights on top of the pre-trained base model
model.load_state_dict(state_dict)

model = model.to(device)
|
|
|
|
|
``` |
|
|
|
### Use model to make predictions |
|
```python |
|
|
|
def predict_description(model, tokenizer, text, max_length=512):
    """Classify a single ticket description.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        text: The ticket description to classify.
        max_length: Maximum number of tokens; longer inputs are truncated.

    Returns:
        The predicted class id as an ``int`` (index of the largest probability).
    """
    model.eval()  # Set the model to evaluation mode

    # Ensure model is on the correct device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Encode the input text. A single sequence needs no padding, so the
    # forward pass covers only the real tokens instead of always max_length.
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )

    # Move tensors to the correct device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Make prediction (no gradients are needed for inference)
    with torch.no_grad():
        logits = model(**inputs).logits

    probabilities = torch.softmax(logits, dim=-1)
    predicted_class_id = torch.argmax(probabilities, dim=-1).item()

    return predicted_class_id
|
|
|
|
|
# Map the predicted class id to its label and print it, so the example
# also produces output when run as a plain script (not only in a notebook).
print((['INCIDENT', 'REQUEST'])[predict_description(model, tokenizer, """My ID card is not being detected.""")])
|
|
|
``` |
|
</details> |