{
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python",
      "version": "3.10.13",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "kaggle": {
      "accelerator": "none",
      "dataSources": [],
      "dockerImageVersionId": 30746,
      "isInternetEnabled": false,
      "language": "python",
      "sourceType": "notebook",
      "isGpuEnabled": false
    }
  },
  "nbformat_minor": 0,
  "nbformat": 4,
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "# --- INSTALLATION ---\n",
        "\n",
        "!pip install pandas numpy matplotlib nltk scikit-learn transformers datasets torch\n",
        "!kaggle datasets download -d shanegerami/ai-vs-human-text\n",
        "!unzip -n ai-vs-human-text.zip\n",
        "!rm ai-vs-human-text.zip\n",
        "\n",
        "# -------------------------"
      ],
      "metadata": {
        "id": "XKWBDF8lir6o",
        "execution": {
          "iopub.status.busy": "2024-08-14T18:13:18.903225Z",
          "iopub.execute_input": "2024-08-14T18:13:18.903635Z",
          "iopub.status.idle": "2024-08-14T18:14:34.119173Z",
          "shell.execute_reply.started": "2024-08-14T18:13:18.903599Z",
          "shell.execute_reply": "2024-08-14T18:14:34.117649Z"
        },
        "trusted": true
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# --- IMPORTS ---\n",
        "\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "import re\n",
        "import nltk\n",
        "from nltk.corpus import stopwords\n",
        "nltk.download('stopwords')\n",
        "stopwords = set(stopwords.words('english'))\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n",
        "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
        "from transformers import Trainer, TrainingArguments, DataCollatorWithPadding\n",
        "from datasets import Dataset\n",
        "import torch\n",
        "\n",
        "# -------------------------"
      ],
      "metadata": {
        "id": "q9TGKRUIiPMy"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# --- USEFUL FUNCTIONS ----\n",
        "\n",
        "def clean_text(text):\n",
        "    \"\"\"\n",
        "    This funtion get's rid of nonalphabetical characters, stopwords and lower cases the text.\n",
        "\n",
        "    Args:\n",
        "    text (str): The text to be cleaned\n",
        "\n",
        "    Returns:\n",
        "    text (str): The cleaned text\n",
        "\n",
        "    Example:\n",
        "    df['text'] = df['text'].apply(clean_text)\n",
        "    \"\"\"\n",
        "    text = re.sub(r'[^a-zA-Z]', ' ', text)\n",
        "    text = text.lower()\n",
        "    words = text.split()\n",
        "    text = [word for word in words if not word in stopwords]\n",
        "    text = ' '.join(words)\n",
        "    return text\n",
        "\n",
        "def tokenize_function(dataframe):\n",
        "    \"\"\"\n",
        "    This funtion tokenizes the 'text' field of the dataframe.\n",
        "\n",
        "    Args:\n",
        "    dataframe (pandas.DataFrame): The dataframe to be tokenized\n",
        "\n",
        "    Returns:\n",
        "    dataframe (pandas.DataFrame): The tokenized dataframe\n",
        "\n",
        "    Example and output:\n",
        "    train_dataset_token = train_dataset.map(tokenize_function, batched=True)\n",
        "    \"\"\"\n",
        "    return tokenizer(dataframe[\"text\"], truncation=True)\n",
        "\n",
        "def compute_metrics(eval_pred):\n",
        "    \"\"\"\n",
        "    This funtion computes the accuracy, precision, recall and f1 score of the model.\n",
        "\n",
        "    It'is passed to the trainer and it outputs when evaluating the model.\n",
        "\n",
        "    Args:\n",
        "    eval_pred (tuple): The predictions and labels of the model\n",
        "\n",
        "    Returns:\n",
        "    dict: The accuracy, precision, recall and f1 score of the model\n",
        "\n",
        "    Example:\n",
        "    >>> trainer.evaluate()\n",
        "    {\n",
        "        'accuracy': accuracy,\n",
        "        'precision': precision,\n",
        "        'recall': recall,\n",
        "        'f1': f1\n",
        "    }\n",
        "    \"\"\"\n",
        "    predictions, labels = eval_pred\n",
        "    predictions = predictions.argmax(axis=-1)\n",
        "    accuracy = accuracy_score(labels, predictions)\n",
        "    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')\n",
        "    return {\n",
        "        'accuracy': accuracy,\n",
        "        'precision': precision,\n",
        "        'recall': recall,\n",
        "        'f1': f1\n",
        "    }\n",
        "\n",
        "# -------------------------"
      ],
      "metadata": {
        "id": "JtYsc4hJAnk3"
      },
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# --- INSTANTIATING THE MODEL ---\n",
        "\n",
        "# Load the initial tokenizer and model to set the number of labels its going to classify as 2\n",
        "checkpoint = \"bert-base-uncased\"\n",
        "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
        "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
        "\n",
        "# -------------------------------"
      ],
      "metadata": {
        "id": "Golh92ee33aA"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# --- DATA PREPROCESSING ---\n",
        "\n",
        "df = pd.read_csv('AI_Human.csv')\n",
        "\n",
        "# Separate human from ai\n",
        "df_human = df[df[\"generated\"] == 0]\n",
        "df_ai = df[df[\"generated\"] == 1]\n",
        "\n",
        "# We take as many human written esssays as AI generate since the dataset is a bit unbalanced\n",
        "df_ai_len = df_ai[\"text\"].count()\n",
        "df_human = df_human.sample(n=df_ai_len)\n",
        "\n",
        "# We concatenate both dataframes, shuffle them and then we take 1% of them since those will be enough to fine tune the model\n",
        "# and with my current resources I won't be able to process more. For better results increase the fraction of the data used.\n",
        "df_unshuffled = pd.concat([df_human, df_ai])\n",
        "df = df_unshuffled.sample(frac=0.01).reset_index(drop=True)\n",
        "\n",
        "# Get rid of nonalphatetical characters, stopwords and we lower case it.\n",
        "df['text'] = df['text'].apply(clean_text)\n",
        "\n",
        "# Split in train/test (I used 80%/20%)\n",
        "df_train, df_test = train_test_split(df, test_size=0.2)\n",
        "\n",
        "# According to the transformers library of hugging face the targets column name should be labels and ints\n",
        "df_train = df_train.rename(columns={'generated': 'labels'})\n",
        "df_test = df_test.rename(columns={'generated': 'labels'})\n",
        "df_train['labels'] = df_train['labels'].astype(int)\n",
        "df_test['labels'] = df_test['labels'].astype(int)\n",
        "\n",
        "# We convert the pandas dataframe into hugging face datasets and tokenize both of them\n",
        "train_dataset = Dataset.from_pandas(df_train)\n",
        "test_dataset = Dataset.from_pandas(df_test)\n",
        "train_dataset_token = train_dataset.map(tokenize_function, batched=True)\n",
        "test_dataset_token = test_dataset.map(tokenize_function, batched=True)\n",
        "\n",
        "# Drop columns that are not necessary and set the dataset format to pytorch tensors\n",
        "train_dataset_token = train_dataset_token.remove_columns([\"text\", \"__index_level_0__\", \"token_type_ids\"])\n",
        "test_dataset_token = test_dataset_token.remove_columns([\"text\", \"__index_level_0__\", \"token_type_ids\"])\n",
        "train_dataset_token.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])\n",
        "test_dataset_token.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])\n",
        "\n",
        "# -------------------------\n"
      ],
      "metadata": {
        "id": "GUNv7d5lkg2z"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# --- INSTANTIATING TRAINER ---\n",
        "\n",
        "# We instantiate a DataCollatorWithPadding in order to pad the inputs in batches while training\n",
        "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
        "\n",
        "# Create the training arguments\n",
        "training_args = TrainingArguments(\n",
        "    output_dir=\"./results\",\n",
        "    per_device_train_batch_size=16,  # Adjust based on GPU memory\n",
        "    per_device_eval_batch_size=16,\n",
        "    num_train_epochs=3,\n",
        "    weight_decay=0.01,\n",
        "    logging_dir=\"./logs\",\n",
        "    logging_steps=100,\n",
        ")\n",
        "\n",
        "# Create the trainer\n",
        "trainer = Trainer(\n",
        "    model,\n",
        "    training_args,\n",
        "    train_dataset=train_dataset_token,\n",
        "    eval_dataset=test_dataset_token,\n",
        "    data_collator=data_collator,\n",
        "    tokenizer=tokenizer,\n",
        "    compute_metrics = compute_metrics\n",
        ")\n",
        "\n",
        "# -------------------------"
      ],
      "metadata": {
        "id": "FhqLhZv5HFot"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# --- TRAINING ---\n",
        "\n",
        "trainer.train()\n",
        "\n",
        "# ----------------"
      ],
      "metadata": {
        "id": "T65B4LitLfsN"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# --- EVALUATION ---\n",
        "\n",
        "evaluation_results = trainer.evaluate()\n",
        "\n",
        "print(\"Accuracy:\", evaluation_results['eval_accuracy'])\n",
        "print(\"Precision:\", evaluation_results['eval_precision'])\n",
        "print(\"Recall:\", evaluation_results['eval_recall'])\n",
        "print(\"F1:\", evaluation_results['eval_f1'])\n",
        "\n",
        "# -------------------------"
      ],
      "metadata": {
        "id": "WkQgrxgFPkpJ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# --- EXPORTING THE MODEL (optional) ---\n",
        "\n",
        "# Save the model and tokenizer\n",
        "#model.save_pretrained(\"./AI-Detector-Model/Model\")\n",
        "#tokenizer.save_pretrained(\"./AI-Detector-Model/Tokenizer\")\n",
        "\n",
        "# Zip the model\n",
        "#!zip -r AI-Detector-Model.zip AI-Detector-Model\n",
        "\n",
        "# --------------------------"
      ],
      "metadata": {
        "id": "DF-ZWbjHSxuE"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}