KameronB committed
Commit 33fcb86 (parent: 3807e39)

Update README.md

Files changed (1): README.md +188 -0
```
</details>

<details>
<summary>
DistilBERT based model
</summary>

### Fetching the model

```python
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW  # transformers' own AdamW is deprecated; use the torch implementation
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm

# Load the DistilBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-uncased', num_labels=2)

# Fetch the state dict to apply the fine-tuned weights
state_dict = torch.hub.load_state_dict_from_url("https://huggingface.co/KameronB/SITCC-Incident-Request-Classifier/resolve/main/distilbert_1.bin")
# If running on CPU:
# state_dict = torch.hub.load_state_dict_from_url("https://huggingface.co/KameronB/SITCC-Incident-Request-Classifier/resolve/main/distilbert_1.bin", map_location=torch.device('cpu'))

model.load_state_dict(state_dict)

model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
```
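
To confirm the downloaded weights were applied correctly, it can help to push a throwaway input through the model and check the shape of the logits. This is a minimal sanity-check sketch, not part of the original pipeline:

```python
# Optional sanity check (illustrative): a correctly loaded two-label model
# should return a logits tensor of shape (1, 2) for a single input
sample = tokenizer("test ticket", return_tensors='pt').to(model.device)
with torch.no_grad():
    print(model(**sample).logits.shape)  # expected: torch.Size([1, 2])
```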

### Using the model

```python
def predict_description(model, tokenizer, text, max_length=512):
    model.eval()  # Set the model to evaluation mode

    # Ensure the model is on the correct device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Encode the input text
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_token_type_ids=False,
        return_tensors='pt',
        truncation=True
    )

    # Move tensors to the correct device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)
        predicted_class_id = torch.argmax(probabilities, dim=-1).item()

    return predicted_class_id, probabilities.cpu().tolist()


# Example usage
tickets = [
    """Inquiry about the possibility of customizing Docker to better meet department-specific needs.
    Gathered requirements for desired customizations.""",
    """We've encountered a recurring problem with DEVEnv shutting down anytime we try to save documents.
    I looked over the error logs for any clues about what's going wrong. I'm passing this on to the team responsible for software upkeep."""
]

for ticket in tickets:
    prediction, probabilities = predict_description(model, tokenizer, ticket)
    prediction = ['INCIDENT', 'TASK'][prediction]
    print(f"{prediction} ({probabilities}) <== {ticket}")
```
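
For more than a handful of tickets, a single batched forward pass is usually faster than calling `predict_description` in a loop, since the tokenizer can pad the whole list in one call. The helper below is a hedged sketch; the `batch_predict` name and batching choices are illustrative assumptions, not part of the original README:

```python
def batch_predict(model, tokenizer, texts, max_length=512):
    model.eval()
    # Tokenize the whole list at once, padding to the longest ticket in the batch
    inputs = tokenizer(texts, padding=True, truncation=True,
                       max_length=max_length, return_tensors='pt').to(model.device)
    with torch.no_grad():
        probabilities = torch.softmax(model(**inputs).logits, dim=-1)
    # One class id and one probability row per ticket
    return torch.argmax(probabilities, dim=-1).tolist(), probabilities.cpu().tolist()

predictions, _ = batch_predict(model, tokenizer, tickets)
print([['INCIDENT', 'TASK'][p] for p in predictions])
```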

### Additional fine-tuning

```python
# The dataset class
class TextDataset(Dataset):
    def __init__(self, descriptions, labels, tokenizer, max_len):
        self.descriptions = descriptions
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, idx):
        text = self.descriptions[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True
        )
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Load the data
df = pd.read_csv('../data/final_data.csv')
df['label'] = df['type'].astype('category').cat.codes  # Convert labels to category codes if they aren't already

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Create the training and validation sets and data loaders
print("cuda is available" if torch.cuda.is_available() else "cuda is unavailable: running on cpu")

# Split the data into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.15)

# Create PyTorch datasets
train_dataset = TextDataset(train_df['content'].tolist(), train_df['label'].tolist(), tokenizer, max_len=512)
val_dataset = TextDataset(val_df['content'].tolist(), val_df['label'].tolist(), tokenizer, max_len=512)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Train the model

# Only these layers will be trained; customize this to your liking to freeze the ones you don't want to retrain
training_layers = [
    "distilbert.transformer.layer.5.ffn.lin2.weight",
    "distilbert.transformer.layer.5.ffn.lin2.bias",
    "distilbert.transformer.layer.5.output_layer_norm.weight",
    "distilbert.transformer.layer.5.output_layer_norm.bias",
    "pre_classifier.weight",
    "pre_classifier.bias",
    "classifier.weight",
    "classifier.bias"
]

for name, param in model.named_parameters():
    if name not in training_layers:  # Freeze layers that are not part of the classifier
        param.requires_grad = False

# If the model is not already on the GPU, make sure to train it on the GPU if available
# model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

# Training setup
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 2

for epoch in range(epochs):
    model.train()
    progress = tqdm(train_loader, desc="Training")
    for batch in progress:
        batch = {k: v.to(model.device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        progress.set_description(f"Training Loss: {loss.item():.4f}")  # Keep the live loss readout current

    model.eval()
    total_eval_accuracy = 0
    for batch in tqdm(val_loader, desc="Validating"):
        batch = {k: v.to(model.device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        accuracy = (predictions == batch['labels']).cpu().numpy().mean()
        total_eval_accuracy += accuracy

    print(f"Validation Accuracy: {total_eval_accuracy / len(val_loader)}")
```
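
Once fine-tuning finishes, you can persist the updated weights as a single state-dict file, mirroring the `distilbert_1.bin` checkpoint this README downloads. A minimal sketch; the output filename is an illustrative assumption:

```python
# Save the fine-tuned weights as a plain state dict, the same format that
# torch.hub.load_state_dict_from_url consumes above (filename is illustrative)
torch.save(model.state_dict(), 'distilbert_finetuned.bin')

# To reuse them later, load the file into a freshly constructed model:
# model.load_state_dict(torch.load('distilbert_finetuned.bin'))
```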
</details>

<details>
<summary>RoBERT based model</summary>