In [None]:
# --- INSTALLATION ---

# Install the Python packages imported by this notebook (versions are not pinned).
!pip install pandas numpy matplotlib nltk scikit-learn transformers datasets torch
# Download the dataset archive with the kaggle command-line tool.
# NOTE(review): the pip line above does not install `kaggle`; presumably the CLI is
# already available and configured in the environment -- confirm.
!kaggle datasets download -d shanegerami/ai-vs-human-text
# Extract the archive (-n: never overwrite files that already exist), then delete it.
!unzip -n ai-vs-human-text.zip
!rm ai-vs-human-text.zip

# -------------------------

In [None]:
# --- IMPORTS ---

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' resource so the corpus can be read below.
nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` from the object imported from nltk.corpus
# to a plain set of English stopwords. clean_text() reads this module-level set, and
# the cell must be re-run from the import line if it is executed again.
stopwords = set(stopwords.words('english'))
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset
import torch

# -------------------------

In [3]:
# --- USEFUL FUNCTIONS ----

def clean_text(text):
    """
    Remove non-alphabetical characters, lower-case the text and drop stopwords.

    Every character outside a-z / A-Z is replaced by a space, the result is
    lower-cased and split on whitespace, and every word found in the
    module-level ``stopwords`` set is discarded. The remaining words are
    joined back with single spaces.

    Args:
        text (str): The text to be cleaned

    Returns:
        str: The cleaned text (an empty string if nothing is left)

    Example:
        df['text'] = df['text'].apply(clean_text)
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    words = letters_only.lower().split()
    # Join the *filtered* words: the previous version built the filtered list
    # but joined the unfiltered one, so stopwords were never removed.
    kept_words = [word for word in words if word not in stopwords]
    return ' '.join(kept_words)

def tokenize_function(dataframe):
    """
    Tokenize the "text" entries of a batch with the notebook-level ``tokenizer``.

    This function is meant to be handed to ``Dataset.map(..., batched=True)``.
    Despite the parameter name, the argument only has to support
    ``dataframe["text"]``.

    Args:
        dataframe: Object indexable by "text", yielding the texts to tokenize

    Returns:
        Whatever ``tokenizer(texts, truncation=True)`` returns for those texts

    Example:
        train_dataset_token = train_dataset.map(tokenize_function, batched=True)
    """
    texts = dataframe["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

def compute_metrics(eval_pred):
    """
    Compute accuracy, precision, recall and F1 for a set of predictions.

    The function is passed to the trainer, which calls it when evaluating
    the model.

    Args:
        eval_pred (tuple): A ``(predictions, labels)`` pair; the predicted
            class is taken as the argmax over the last axis of ``predictions``

    Returns:
        dict: The metrics under the keys 'accuracy', 'precision', 'recall'
        and 'f1' (precision/recall/F1 use ``average='binary'``)

    Example:
        >>> trainer.evaluate()
        {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    """
    scores, labels = eval_pred
    predicted_classes = scores.argmax(axis=-1)

    metrics = {'accuracy': accuracy_score(labels, predicted_classes)}
    # The fourth value returned (support) is not reported.
    prf = precision_recall_fscore_support(labels, predicted_classes, average='binary')
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    metrics['f1'] = prf[2]
    return metrics

# -------------------------

In [None]:
# --- INSTANTIATING THE MODEL ---

# Load the tokenizer and a sequence-classification model from the same pretrained
# checkpoint. num_labels=2 because the target column 'generated' takes the two
# values 0 and 1.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# -------------------------------

In [None]:
# --- DATA PREPROCESSING ---

# One seed for every random step in this cell, so that Restart & Run All reproduces
# the same subsample and the same train/test split (and therefore comparable metrics).
SEED = 42

df = pd.read_csv('AI_Human.csv')

# Separate human-written essays (generated == 0) from AI-generated ones (generated == 1)
df_human = df[df["generated"] == 0]
df_ai = df[df["generated"] == 1]

# The dataset is a bit unbalanced, so we keep only as many human-written essays
# as there are AI-generated ones.
df_ai_len = df_ai["text"].count()
df_human = df_human.sample(n=df_ai_len, random_state=SEED)

# Concatenate both dataframes, shuffle them and keep 1% of the rows: that is enough to
# fine-tune the model with limited resources. For better results increase the fraction.
df_unshuffled = pd.concat([df_human, df_ai])
df = df_unshuffled.sample(frac=0.01, random_state=SEED).reset_index(drop=True)

# Get rid of non-alphabetical characters and stopwords, and lower-case the text.
df['text'] = df['text'].apply(clean_text)

# Split in train/test (80%/20%). Stratifying on the target keeps the class ratio
# the same in both splits.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED,
    stratify=df['generated'],
)

# The Hugging Face Trainer expects the target column to be called 'labels' and to hold ints
df_train = df_train.rename(columns={'generated': 'labels'}).astype({'labels': int})
df_test = df_test.rename(columns={'generated': 'labels'}).astype({'labels': int})

# Convert the pandas dataframes into Hugging Face datasets and tokenize both of them
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)
train_dataset_token = train_dataset.map(tokenize_function, batched=True)
test_dataset_token = test_dataset.map(tokenize_function, batched=True)

# Drop the columns the model does not need and expose the rest as PyTorch tensors.
# "__index_level_0__" is the pandas index carried over by Dataset.from_pandas.
train_dataset_token = train_dataset_token.remove_columns(["text", "__index_level_0__", "token_type_ids"])
test_dataset_token = test_dataset_token.remove_columns(["text", "__index_level_0__", "token_type_ids"])
train_dataset_token.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset_token.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# -------------------------


In [None]:
# --- INSTANTIATING TRAINER ---

# Collator that pads the tokenized inputs batch by batch during training/evaluation
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Hyper-parameters and output locations for the run
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=16,  # Adjust based on GPU memory
    per_device_eval_batch_size=16,
)

# Wire the model, data, collator and metric function together
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset_token,
    eval_dataset=test_dataset_token,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# -------------------------

In [None]:
# --- TRAINING ---

# Fine-tune the model with the trainer configured in the previous cell.
trainer.train()

# ----------------

In [None]:
# --- EVALUATION ---

evaluation_results = trainer.evaluate()

# The metrics returned by compute_metrics are read back under an "eval_" prefix.
for title, metric_key in (
    ("Accuracy", "eval_accuracy"),
    ("Precision", "eval_precision"),
    ("Recall", "eval_recall"),
    ("F1", "eval_f1"),
):
    print(f"{title}:", evaluation_results[metric_key])

# -------------------------

In [None]:
# --- EXPORTING THE MODEL (optional) ---

# Save the model and tokenizer
#model.save_pretrained("./AI-Detector-Model/Model")
#tokenizer.save_pretrained("./AI-Detector-Model/Tokenizer")

# Zip the model
#!zip -r AI-Detector-Model.zip AI-Detector-Model

# --------------------------