In [None]:
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge
!unzip jigsaw-toxic-comment-classification-challenge

In [None]:
!unzip test.csv.zip  
!unzip test_labels.csv.zip  
!unzip train.csv.zip

In [1]:
import warnings
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# Use GPU
#device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = "mps:0" if torch.backends.mps.is_available() else "cpu"
print(device)

mps:0


In [4]:
# Load training text and label dataset
# Preprocess data

#test_texts = pd.read_csv("test.csv").values.tolist()
#test_labels = pd.read_csv('test_labels.csv').values.tolist()

train = pd.read_csv('train.csv')
train.head(1)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0


In [5]:
# Any duplicates?
print(len(train['comment_text'].unique()), train.shape)

# Any missing values?
print(train.isnull().any())
print(train.isnull().values.any())

159571 (159571, 8)
id               False
comment_text     False
toxic            False
severe_toxic     False
obscene          False
threat           False
insult           False
identity_hate    False
dtype: bool
False


In [6]:
# Group labels to get right format for training
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['grouped_labels'] = train[labels].values.tolist()
train.head(1)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,grouped_labels
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"


In [7]:
# Convert to list from dataframe
train_texts = train['comment_text'].values.tolist()
train_labels = train['grouped_labels'].values.tolist()

In [8]:
# Use distilbert, a faster model of BERT which keeps 95% of the performance
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
# Also do preprocessing to see if there are any unique rows
# with that specfic combination of labels
# If that is the case, we want to include that row in the training data

# Find unique label combinations
label_counts = train['grouped_labels'].astype(str).value_counts()
print(label_counts[-14:])

# Take low frequency labels
low_freq = label_counts[label_counts<10].keys()
low_freq_inds = sorted(list(train[train['grouped_labels'].astype(str).isin(low_freq)].index), reverse=True)
print('df label indices with only one instance: ', low_freq_inds)

[1, 0, 1, 1, 0, 0]    11
[1, 1, 0, 1, 0, 0]    11
[1, 0, 0, 1, 0, 1]     7
[1, 1, 0, 0, 1, 1]     7
[1, 1, 1, 0, 0, 1]     6
[1, 1, 1, 1, 0, 0]     4
[0, 0, 0, 1, 1, 0]     3
[1, 0, 0, 1, 1, 1]     3
[1, 1, 0, 0, 0, 1]     3
[0, 0, 1, 0, 0, 1]     3
[0, 0, 1, 1, 0, 0]     2
[0, 0, 1, 1, 1, 0]     2
[1, 1, 0, 1, 1, 0]     1
[1, 1, 0, 1, 0, 1]     1
Name: grouped_labels, dtype: int64
df label indices with only one instance:  [159029, 158498, 157010, 154553, 149180, 144159, 139501, 138026, 134459, 133505, 127410, 120395, 115766, 113304, 110056, 107881, 107096, 101089, 98699, 86746, 76454, 74607, 68264, 66350, 63687, 61934, 57594, 53408, 45101, 41461, 36141, 31191, 30566, 29445, 23374, 17187, 15977, 9487, 8979, 6316, 6063, 2374]


In [10]:
low_freq_train_texts = [train_texts.pop(i) for i in low_freq_inds]
low_freq_train_labels = [train_labels.pop(i) for i in low_freq_inds]

In [11]:
# Add low freq values to training data
train_texts.extend(low_freq_train_texts)
train_labels.extend(low_freq_train_labels)

In [12]:
# Split datasets for training
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.1)

In [13]:
# Shorten token to increase training speed, average is below this
max_length = 100
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt", max_length=max_length).to(device)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors="pt", max_length=max_length).to(device)

In [14]:
class ToxicDataset(Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = [[float(y) for y in x] for x in labels]

  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

In [15]:
train_dataset = ToxicDataset(train_encodings, train_labels)
val_dataset = ToxicDataset(val_encodings, val_labels)

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           num_labels=6,
                                                          ).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [56]:
class TrainingArgumentsWithMPSSupport(TrainingArguments):
    @property
    def device(self) -> torch.device:
        if torch.backends.mps.is_available():
            return torch.device("mps")
        else:
            return torch.device("cpu")

training_args = TrainingArgumentsWithMPSSupport(
    output_dir = './results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
PyTorch: setting up devices
***** Running training *****
  Num examples = 127656
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7979
  Number of trainable parameters = 109486854


Step,Training Loss
10,0.6058
20,0.5901
30,0.5502


KeyboardInterrupt: 

In [21]:
trainer.save_model('./model_checkpoint/done')

Saving model checkpoint to ./model_checkpoint/done
Configuration saved in ./model_checkpoint/done/config.json
Model weights saved in ./model_checkpoint/done/pytorch_model.bin


In [18]:
from transformers import BertTokenizer, BertForSequenceClassification
#saved = DistilBertModel.from_pretrained('./model_checkpoint/trained', num_labels=6, problem_type="multi_label_classification")
saved = BertForSequenceClassification.from_pretrained('./model_checkpoint/fine_tuned')

In [19]:
trainer.evaluate()

NameError: name 'trainer' is not defined

In [59]:
text = "fun"
encoded_input = tokenizer(text, return_tensors="pt")
outputs = saved(**encoded_input)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions = predictions.cpu().detach().numpy()
predictions.tolist()

[[0.4601849317550659,
  0.0626736581325531,
  0.1962047964334488,
  0.0715285912156105,
  0.1363525241613388,
  0.0730554461479187]]

In [48]:
res = [1 if x >= 0.5 else 0 for x in predictions[0]]

In [49]:
res

[1, 0, 0, 0, 0, 0]