{ "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python", "version": "3.10.13", "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "file_extension": ".py" }, "kaggle": { "accelerator": "none", "dataSources": [], "dockerImageVersionId": 30746, "isInternetEnabled": false, "language": "python", "sourceType": "notebook", "isGpuEnabled": false } }, "nbformat_minor": 0, "nbformat": 4, "cells": [ { "cell_type": "code", "source": [ "# --- INSTALLATION ---\n", "\n", "!pip install pandas numpy matplotlib nltk scikit-learn transformers datasets torch\n", "!kaggle datasets download -d shanegerami/ai-vs-human-text\n", "!unzip -n ai-vs-human-text.zip\n", "!rm ai-vs-human-text.zip\n", "\n", "# -------------------------" ], "metadata": { "id": "XKWBDF8lir6o", "execution": { "iopub.status.busy": "2024-08-14T18:13:18.903225Z", "iopub.execute_input": "2024-08-14T18:13:18.903635Z", "iopub.status.idle": "2024-08-14T18:14:34.119173Z", "shell.execute_reply.started": "2024-08-14T18:13:18.903599Z", "shell.execute_reply": "2024-08-14T18:14:34.117649Z" }, "trusted": true }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# --- IMPORTS ---\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import re\n", "import nltk\n", "from nltk.corpus import stopwords\n", "nltk.download('stopwords')\n", "stopwords = set(stopwords.words('english'))\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n", "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n", "from transformers import Trainer, TrainingArguments, DataCollatorWithPadding\n", "from datasets import Dataset\n", "import torch\n", "\n", "# -------------------------" ], "metadata": 
# --- USEFUL FUNCTIONS ----

def clean_text(text, stop_words=None):
    """
    Strip non-alphabetical characters, lower-case the text and drop stopwords.

    Args:
        text (str): The text to be cleaned.
        stop_words (collection of str, optional): Lower-case words to remove.
            When omitted, the notebook-level ``stopwords`` set built in the
            imports cell is used.

    Returns:
        str: The surviving words joined by single spaces ('' when nothing is left).

    Example:
        df['text'] = df['text'].apply(clean_text)
    """
    if stop_words is None:
        # Looked up at call time so the default always follows the notebook-level set.
        stop_words = stopwords
    # Anything outside a-z / A-Z becomes a space, so digits and punctuation act as separators.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    # Join the *filtered* list; joining `words` itself would silently keep every stopword.
    kept_words = [word for word in words if word not in stop_words]
    return ' '.join(kept_words)


def tokenize_function(dataframe):
    """
    Tokenize the "text" entry of its argument with the notebook-level `tokenizer`.

    Args:
        dataframe: Object indexable by "text". Despite the parameter name, the
            notebook passes this function to `Dataset.map(..., batched=True)`,
            so it is presumably a batch of rows rather than a pandas
            DataFrame -- TODO confirm.

    Returns:
        Whatever `tokenizer(...)` returns for those texts, with truncation enabled.

    Example:
        train_dataset_token = train_dataset.map(tokenize_function, batched=True)
    """
    return tokenizer(dataframe["text"], truncation=True)
def compute_metrics(eval_pred):
    """
    Score a batch of model outputs against the reference labels.

    The function is handed to the trainer, which reports its result when the
    model is evaluated.

    Args:
        eval_pred (tuple): Pair of (predictions, labels). The predictions are
            reduced with an argmax over their last axis, so they are presumably
            per-class scores -- TODO confirm.

    Returns:
        dict: 'accuracy', 'precision', 'recall' and 'f1', in that order.
            Precision, recall and f1 use average='binary'.

    Example:
        >>> trainer.evaluate()
        {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    """
    raw_scores, references = eval_pred
    # Index of the highest score along the last axis = predicted class.
    predicted = raw_scores.argmax(axis=-1)
    metrics = {'accuracy': accuracy_score(references, predicted)}
    # The fourth return value (support) is not reported.
    precision, recall, f1, _support = precision_recall_fscore_support(
        references, predicted, average='binary'
    )
    metrics['precision'] = precision
    metrics['recall'] = recall
    metrics['f1'] = f1
    return metrics

# -------------------------

# --- INSTANTIATING THE MODEL ---

# Tokenizer and model both come from the same pretrained checkpoint;
# the classification model is requested with 2 labels.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

# -------------------------------
# --- DATA PREPROCESSING ---

# Tunable knobs, kept together so a rerun with different settings only touches this spot.
SEED = 42               # makes the sampling and the train/test split repeatable across kernel restarts
SAMPLE_FRACTION = 0.01  # share of the balanced data used for fine-tuning; raise it for better results if resources allow
TEST_SIZE = 0.2         # 80% train / 20% test
MODEL_INPUT_COLUMNS = ['input_ids', 'attention_mask', 'labels']


def _keep_model_inputs(dataset):
    """Drop every column not listed in MODEL_INPUT_COLUMNS and expose the rest as PyTorch tensors."""
    # Computed from the dataset itself so nothing breaks when a column such as
    # "token_type_ids" or "__index_level_0__" happens not to exist.
    extra_columns = [name for name in dataset.column_names if name not in MODEL_INPUT_COLUMNS]
    dataset = dataset.remove_columns(extra_columns)
    dataset.set_format(type='torch', columns=MODEL_INPUT_COLUMNS)
    return dataset


# Rows without text or label can neither be cleaned nor cast to int, so drop them up front.
df_raw = pd.read_csv('AI_Human.csv').dropna(subset=['text', 'generated'])

# Separate human from AI
df_human = df_raw[df_raw["generated"] == 0]
df_ai = df_raw[df_raw["generated"] == 1]

# Balance the classes by downsampling whichever one is larger
# (sampling more rows than a class holds would raise).
n_per_class = min(len(df_human), len(df_ai))
df_human = df_human.sample(n=n_per_class, random_state=SEED)
df_ai = df_ai.sample(n=n_per_class, random_state=SEED)

# Concatenate both classes, shuffle, and keep only SAMPLE_FRACTION of the rows:
# that is enough to fine-tune the model on limited resources.
df_unshuffled = pd.concat([df_human, df_ai])
df = (
    df_unshuffled
    .sample(frac=SAMPLE_FRACTION, random_state=SEED)
    .reset_index(drop=True)
    # Get rid of non-alphabetical characters and stopwords, and lower-case the text.
    .assign(text=lambda frame: frame['text'].apply(clean_text))
)

# Split in train/test. Stratifying keeps both classes present in each split,
# which the binary precision/recall computed at evaluation time relies on.
# The Hugging Face trainer expects the target column to be called "labels" and to hold ints.
df_train, df_test = (
    split.rename(columns={'generated': 'labels'}).astype({'labels': int})
    for split in train_test_split(
        df,
        test_size=TEST_SIZE,
        random_state=SEED,
        stratify=df['generated'],
    )
)

# Convert the pandas dataframes into Hugging Face datasets and tokenize both of them
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)
train_dataset_token = train_dataset.map(tokenize_function, batched=True)
test_dataset_token = test_dataset.map(tokenize_function, batched=True)

# Keep only the model inputs and set the dataset format to PyTorch tensors
train_dataset_token = _keep_model_inputs(train_dataset_token)
test_dataset_token = _keep_model_inputs(test_dataset_token)

# -------------------------
# --- INSTANTIATING TRAINER ---

# The collator pads the inputs batch by batch while training
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training configuration
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=16,  # Adjust based on GPU memory
    per_device_eval_batch_size=16,
)

# Wire model, data, collator and metrics together
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset_token,
    eval_dataset=test_dataset_token,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# -------------------------

# --- TRAINING ---

trainer.train()

# ----------------

# --- EVALUATION ---

evaluation_results = trainer.evaluate()

# The trainer reports each metric returned by compute_metrics under an "eval_" prefix.
for heading, metric_key in (
    ("Accuracy:", 'eval_accuracy'),
    ("Precision:", 'eval_precision'),
    ("Recall:", 'eval_recall'),
    ("F1:", 'eval_f1'),
):
    print(heading, evaluation_results[metric_key])

# -------------------------

# --- EXPORTING THE MODEL (optional) ---

# Save the model and tokenizer
#model.save_pretrained("./AI-Detector-Model/Model")
#tokenizer.save_pretrained("./AI-Detector-Model/Tokenizer")

# Zip the model
#!zip -r AI-Detector-Model.zip AI-Detector-Model

# --------------------------