Update README.md

77b2a83 verified 7 months ago

15.2 kB

	---
	license: mit
	language:
	- en
	metrics:
	- accuracy
	tags:
	- IT
	- helpdesk
	- classifier
	- nlp
	- natural-language
	- classification
	---
	<details>
	<summary>
	TinyBERT based model
	</summary>

	### Fetching the model

	```python
	import torch
	from torch.utils.data import DataLoader, Dataset
	from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
	from sklearn.model_selection import train_test_split
	import pandas as pd
	from tqdm import tqdm

	# Load the TinyBERT tokenizer and the base model with a 2-label classification head
	tokenizer = AutoTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')
	model = AutoModelForSequenceClassification.from_pretrained('huawei-noah/TinyBERT_General_4L_312D', num_labels=2)

	# Use the GPU when one is available, otherwise fall back to the CPU
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	# Fetch the state dict holding the fine-tuned weights. map_location remaps the
	# stored tensors onto `device`, so the same call works on CPU-only machines
	# and on GPU machines without editing the snippet.
	state_dict = torch.hub.load_state_dict_from_url(
	    "https://huggingface.co/KameronB/SITCC-Incident-Request-Classifier/resolve/main/tiny_bert_model.bin",
	    map_location=device,
	)

	model.load_state_dict(state_dict)

	model = model.to(device)

	```


	### Using the model

	```python
	def predict_description(model, tokenizer, text, max_length=512):
	    """Classify a single description and return the class id with its probabilities.

	    Args:
	        model: Classification model; calling it must return an object with ``logits``.
	        tokenizer: Tokenizer exposing ``encode_plus``.
	        text: The description to classify.
	        max_length: Length the encoded input is padded or truncated to.

	    Returns:
	        A ``(predicted_class_id, probabilities)`` tuple, where ``probabilities``
	        is the softmax over the logits converted to a nested Python list.
	    """
	    model.eval()  # Set the model to evaluation mode

	    # Ensure model is on the correct device
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    model = model.to(device)

	    # Encode the input text as PyTorch tensors, padded/truncated to max_length
	    inputs = tokenizer.encode_plus(
	        text,
	        None,
	        add_special_tokens=True,
	        max_length=max_length,
	        padding='max_length',
	        return_token_type_ids=False,
	        return_tensors='pt',
	        truncation=True
	    )

	    # Move tensors to the correct device
	    inputs = {key: value.to(device) for key, value in inputs.items()}

	    # Make prediction without tracking gradients
	    with torch.no_grad():
	        outputs = model(**inputs)
	        logits = outputs.logits
	        probabilities = torch.softmax(logits, dim=-1)
	        # .item() requires a single prediction, i.e. one input text per call
	        predicted_class_id = torch.argmax(probabilities, dim=-1).item()

	    return predicted_class_id, probabilities.cpu().tolist()



	#Example usage

	tickets = [
	    """Inquiry about the possibility of customizing Docker to better meet department-specific needs.
	Gathered requirements for desired customizations.""",
	    """We've encountered a recurring problem with DEVEnv shutting down anytime we try to save documents.
	I looked over the error logs for any clues about what's going wrong. I'm passing this on to the team responsible for software upkeep."""
	]

	# NOTE(review): the RoBERTa example in this README names class 1 'REQUEST'
	# rather than 'TASK' - confirm which label name is intended.
	for row in tickets:
	    prediction, probabilities = predict_description(model, tokenizer, row)
	    prediction = (['INCIDENT', 'TASK'])[prediction]
	    # Each row is the ticket text itself (a plain string), so print it directly
	    print(f"{prediction} ({probabilities}) <== {row}")
	```

	### Additional fine-tuning

	```python

	# The dataset class
	class TextDataset(Dataset):
	    """Dataset pairing text descriptions with labels, tokenized on access."""

	    def __init__(self, descriptions, labels, tokenizer, max_len):
	        """Store the raw texts, their labels, the tokenizer and the padded length."""
	        self.descriptions = descriptions
	        self.labels = labels
	        self.tokenizer = tokenizer
	        self.max_len = max_len

	    def __len__(self):
	        """Return the number of descriptions."""
	        return len(self.descriptions)

	    def __getitem__(self, idx):
	        """Tokenize item ``idx`` and return its inputs and label as long tensors."""
	        text = self.descriptions[idx]
	        # Pad/truncate every sample to max_len
	        inputs = self.tokenizer.encode_plus(
	            text,
	            None,
	            add_special_tokens=True,
	            max_length=self.max_len,
	            padding='max_length',
	            return_token_type_ids=False,
	            truncation=True
	        )
	        return {
	            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
	            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
	            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
	        }

	# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
	# load the data
	# Forward slashes keep this relative path valid on Windows and POSIX systems alike
	df = pd.read_csv('../data/final_data.csv')
	df['label'] = df['type'].astype('category').cat.codes # Convert labels to category codes if they aren't already

	# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
	# create the training and validation sets and data loaders
	print( "cuda is available" if torch.cuda.is_available() else "cuda is unavailable: running on cpu")

	# Split the data into training (85%) and validation (15%) sets
	train_df, val_df = train_test_split(df, test_size=0.15)

	# Create PyTorch datasets from the 'content' text column and the encoded labels
	train_dataset = TextDataset(train_df['content'].tolist(), train_df['label'].tolist(), tokenizer, max_len=512)
	val_dataset = TextDataset(val_df['content'].tolist(), val_df['label'].tolist(), tokenizer, max_len=512)

	# Create data loaders; only the training data is shuffled
	train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
	val_loader = DataLoader(val_dataset, batch_size=32)

	# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
	# Train the model

	# only these layers will be trained, customize this to your liking to freeze the ones you dont want to retrain
	training_layers = [
	    "bert.encoder.layer.3.output.dense.weight",
	    "bert.encoder.layer.3.output.dense.bias",
	    "bert.encoder.layer.3.output.LayerNorm.weight",
	    "bert.encoder.layer.3.output.LayerNorm.bias",
	    "bert.pooler.dense.weight",
	    "bert.pooler.dense.bias",
	    "classifier.weight",
	    "classifier.bias",
	]

	for name, param in model.named_parameters():
	    if name not in training_layers:  # Freeze every parameter that is not listed above
	        param.requires_grad = False

	# Training setup
	# NOTE(review): newer transformers releases deprecate transformers.AdamW in
	# favour of torch.optim.AdamW - confirm against the installed version.
	optimizer = AdamW(model.parameters(), lr=5e-5)
	epochs = 2

	for epoch in range(epochs):
	    model.train()
	    # The description is refreshed per batch; a desc f-string passed to tqdm()
	    # is evaluated only once and would keep showing the initial value.
	    train_bar = tqdm(train_loader, desc=f"Training Loss: {float('+inf')}")
	    for batch in train_bar:
	        batch = {k: v.to(model.device) for k, v in batch.items()}
	        outputs = model(**batch)
	        loss = outputs.loss
	        loss.backward()
	        optimizer.step()
	        optimizer.zero_grad()
	        train_bar.set_description(f"Training Loss: {loss.item()}")

	    model.eval()
	    # Count correct predictions over all samples so a smaller final batch
	    # is not over-weighted in the reported accuracy.
	    correct = 0
	    seen = 0
	    val_bar = tqdm(val_loader, desc="Validation Accuracy: 0")
	    for batch in val_bar:
	        batch = {k: v.to(model.device) for k, v in batch.items()}
	        with torch.no_grad():
	            outputs = model(**batch)
	        predictions = torch.argmax(outputs.logits, dim=-1)
	        correct += (predictions == batch['labels']).sum().item()
	        seen += batch['labels'].size(0)
	        val_bar.set_description(f"Validation Accuracy: {correct / seen}")

	    # max() avoids a division by zero when the validation loader is empty
	    print(f"Validation Accuracy: {correct / max(seen, 1)}")
	```
	</details>


	<details>
	<summary>
	DistilBERT based model
	</summary>

	### Fetching the model

	```python
	import torch
	from torch.utils.data import DataLoader, Dataset
	from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
	from sklearn.model_selection import train_test_split
	import pandas as pd
	from tqdm import tqdm

	# Load the DistilBERT tokenizer and the base model with a 2-label classification head
	tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')
	model = AutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-uncased', num_labels=2)

	# Use the GPU when one is available, otherwise fall back to the CPU
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	# Fetch the state dict holding the fine-tuned weights. map_location remaps the
	# stored tensors onto `device`, so the same call works on CPU-only machines
	# and on GPU machines without editing the snippet.
	state_dict = torch.hub.load_state_dict_from_url(
	    "https://huggingface.co/KameronB/SITCC-Incident-Request-Classifier/resolve/main/distilbert_1.bin",
	    map_location=device,
	)

	model.load_state_dict(state_dict)

	model = model.to(device)

	```


	### Using the model

	```python
	def predict_description(model, tokenizer, text, max_length=512):
	    """Classify a single description and return the class id with its probabilities.

	    Args:
	        model: Classification model; calling it must return an object with ``logits``.
	        tokenizer: Tokenizer exposing ``encode_plus``.
	        text: The description to classify.
	        max_length: Length the encoded input is padded or truncated to.

	    Returns:
	        A ``(predicted_class_id, probabilities)`` tuple, where ``probabilities``
	        is the softmax over the logits converted to a nested Python list.
	    """
	    model.eval()  # Set the model to evaluation mode

	    # Ensure model is on the correct device
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    model = model.to(device)

	    # Encode the input text as PyTorch tensors, padded/truncated to max_length
	    inputs = tokenizer.encode_plus(
	        text,
	        None,
	        add_special_tokens=True,
	        max_length=max_length,
	        padding='max_length',
	        return_token_type_ids=False,
	        return_tensors='pt',
	        truncation=True
	    )

	    # Move tensors to the correct device
	    inputs = {key: value.to(device) for key, value in inputs.items()}

	    # Make prediction without tracking gradients
	    with torch.no_grad():
	        outputs = model(**inputs)
	        logits = outputs.logits
	        probabilities = torch.softmax(logits, dim=-1)
	        # .item() requires a single prediction, i.e. one input text per call
	        predicted_class_id = torch.argmax(probabilities, dim=-1).item()

	    return predicted_class_id, probabilities.cpu().tolist()



	#Example usage

	tickets = [
	    """Inquiry about the possibility of customizing Docker to better meet department-specific needs.
	Gathered requirements for desired customizations.""",
	    """We've encountered a recurring problem with DEVEnv shutting down anytime we try to save documents.
	I looked over the error logs for any clues about what's going wrong. I'm passing this on to the team responsible for software upkeep."""
	]

	# NOTE(review): the RoBERTa example in this README names class 1 'REQUEST'
	# rather than 'TASK' - confirm which label name is intended.
	for row in tickets:
	    prediction, probabilities = predict_description(model, tokenizer, row)
	    prediction = (['INCIDENT', 'TASK'])[prediction]
	    # Each row is the ticket text itself (a plain string), so print it directly
	    print(f"{prediction} ({probabilities}) <== {row}")
	```

	### Additional fine-tuning

	```python

	# The dataset class
	class TextDataset(Dataset):
	    """Dataset pairing text descriptions with labels, tokenized on access."""

	    def __init__(self, descriptions, labels, tokenizer, max_len):
	        """Store the raw texts, their labels, the tokenizer and the padded length."""
	        self.descriptions = descriptions
	        self.labels = labels
	        self.tokenizer = tokenizer
	        self.max_len = max_len

	    def __len__(self):
	        """Return the number of descriptions."""
	        return len(self.descriptions)

	    def __getitem__(self, idx):
	        """Tokenize item ``idx`` and return its inputs and label as long tensors."""
	        text = self.descriptions[idx]
	        # Pad/truncate every sample to max_len
	        inputs = self.tokenizer.encode_plus(
	            text,
	            None,
	            add_special_tokens=True,
	            max_length=self.max_len,
	            padding='max_length',
	            return_token_type_ids=False,
	            truncation=True
	        )
	        return {
	            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
	            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
	            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
	        }

	# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
	# load the data
	# Forward slashes keep this relative path valid on Windows and POSIX systems alike
	df = pd.read_csv('../data/final_data.csv')
	df['label'] = df['type'].astype('category').cat.codes # Convert labels to category codes if they aren't already

	# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
	# create the training and validation sets and data loaders
	print( "cuda is available" if torch.cuda.is_available() else "cuda is unavailable: running on cpu")

	# Split the data into training (85%) and validation (15%) sets
	train_df, val_df = train_test_split(df, test_size=0.15)

	# Create PyTorch datasets from the 'content' text column and the encoded labels
	train_dataset = TextDataset(train_df['content'].tolist(), train_df['label'].tolist(), tokenizer, max_len=512)
	val_dataset = TextDataset(val_df['content'].tolist(), val_df['label'].tolist(), tokenizer, max_len=512)

	# Create data loaders; only the training data is shuffled
	train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
	val_loader = DataLoader(val_dataset, batch_size=32)

	# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
	# Train the model

	# only these layers will be trained, customize this to your liking to freeze the ones you dont want to retrain
	training_layers = [
	    "distilbert.transformer.layer.5.ffn.lin2.weight",
	    "distilbert.transformer.layer.5.ffn.lin2.bias",
	    "distilbert.transformer.layer.5.output_layer_norm.weight",
	    "distilbert.transformer.layer.5.output_layer_norm.bias",
	    "pre_classifier.weight",
	    "pre_classifier.bias",
	    "classifier.weight",
	    "classifier.bias"
	]

	for name, param in model.named_parameters():
	    if name not in training_layers:  # Freeze every parameter that is not listed above
	        param.requires_grad = False

	# if the model is not already on gpu, make sure to train it on gpu if available
	# model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

	# Training setup
	# NOTE(review): newer transformers releases deprecate transformers.AdamW in
	# favour of torch.optim.AdamW - confirm against the installed version.
	optimizer = AdamW(model.parameters(), lr=5e-5)
	epochs = 2

	for epoch in range(epochs):
	    model.train()
	    # The description is refreshed per batch; a desc f-string passed to tqdm()
	    # is evaluated only once and would keep showing the initial value.
	    train_bar = tqdm(train_loader, desc=f"Training Loss: {float('+inf')}")
	    for batch in train_bar:
	        batch = {k: v.to(model.device) for k, v in batch.items()}
	        outputs = model(**batch)
	        loss = outputs.loss
	        loss.backward()
	        optimizer.step()
	        optimizer.zero_grad()
	        train_bar.set_description(f"Training Loss: {loss.item()}")

	    model.eval()
	    # Count correct predictions over all samples so a smaller final batch
	    # is not over-weighted in the reported accuracy.
	    correct = 0
	    seen = 0
	    val_bar = tqdm(val_loader, desc="Validation Accuracy: 0")
	    for batch in val_bar:
	        batch = {k: v.to(model.device) for k, v in batch.items()}
	        with torch.no_grad():
	            outputs = model(**batch)
	        predictions = torch.argmax(outputs.logits, dim=-1)
	        correct += (predictions == batch['labels']).sum().item()
	        seen += batch['labels'].size(0)
	        val_bar.set_description(f"Validation Accuracy: {correct / seen}")

	    # max() avoids a division by zero when the validation loader is empty
	    print(f"Validation Accuracy: {correct / max(seen, 1)}")
	```
	</details>

	<details>
	<summary>RoBERTa based model</summary>

	### Base model
	```python
	import torch
	from torch.utils.data import DataLoader, Dataset
	from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
	from sklearn.model_selection import train_test_split
	import pandas as pd

	# Load the tokenizer
	tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

	# Load RoBERTa pre-trained model with a 2-label classification head
	model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

	# Use the GPU when one is available, otherwise fall back to the CPU
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	# Fetch the state dict holding the fine-tuned weights. map_location remaps the
	# stored tensors onto `device`, so the same call works on CPU-only machines
	# and on GPU machines without editing the snippet.
	state_dict = torch.hub.load_state_dict_from_url(
	    "https://huggingface.co/KameronB/SITCC-Incident-Request-Classifier/resolve/main/pytorch_model.bin",
	    map_location=device,
	)

	model.load_state_dict(state_dict)

	model = model.to(device)


	```

	### Use model to make predictions
	```python

	def predict_description(model, tokenizer, text, max_length=512):
	    """Classify a single description and return the predicted class id.

	    Args:
	        model: Classification model; calling it must return an object with ``logits``.
	        tokenizer: Tokenizer exposing ``encode_plus``.
	        text: The description to classify.
	        max_length: Length the encoded input is padded or truncated to.

	    Returns:
	        The index of the class with the highest softmax probability.
	    """
	    model.eval()  # Set the model to evaluation mode

	    # Ensure model is on the correct device
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    model = model.to(device)

	    # Encode the input text as PyTorch tensors, padded/truncated to max_length
	    inputs = tokenizer.encode_plus(
	        text,
	        None,
	        add_special_tokens=True,
	        max_length=max_length,
	        padding='max_length',
	        return_token_type_ids=False,
	        return_tensors='pt',
	        truncation=True
	    )

	    # Move tensors to the correct device
	    inputs = {key: value.to(device) for key, value in inputs.items()}

	    # Make prediction without tracking gradients
	    with torch.no_grad():
	        outputs = model(**inputs)
	        logits = outputs.logits
	        probabilities = torch.softmax(logits, dim=-1)
	        # .item() requires a single prediction, i.e. one input text per call
	        predicted_class_id = torch.argmax(probabilities, dim=-1).item()

	    return predicted_class_id


	# Map the predicted class id to its label and print it, so the result is
	# visible when run as a script as well as in a notebook
	predicted_label = (['INCIDENT', 'REQUEST'])[predict_description(model, tokenizer, """My ID card is not being detected.""")]
	print(predicted_label)

	```
	</details>