{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from transformers import (\n", " RobertaTokenizerFast,\n", " RobertaForSequenceClassification,\n", " TrainingArguments,\n", " Trainer,\n", " AutoConfig,\n", ")\n", "from sklearn.model_selection import train_test_split\n", "import pandas as pd\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "folder_path = 'formatted_data/'\n", "\n", "# List the data folder and keep only files named data_N.csv (N an integer);\n", "# other entries are skipped so int() never sees a non-numeric name.\n", "file_names = os.listdir(folder_path)\n", "data_file_numbers = [\n", "    int(name[len('data_'):-len('.csv')])\n", "    for name in file_names\n", "    if name.startswith('data_')\n", "    and name.endswith('.csv')\n", "    and name[len('data_'):-len('.csv')].isdecimal()\n", "]\n", "if not data_file_numbers:\n", "    raise FileNotFoundError(f'No data_N.csv file found in {folder_path!r}')\n", "\n", "# Despite the name, this holds the largest numeric suffix N, not a file name.\n", "max_file_name = max(data_file_numbers)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "device = \"cuda\"\n", "\n", "# Build the path with os.path.join so that no backslash separator is\n", "# hard-coded inside a string literal (that is an invalid escape sequence\n", "# and only resolves on Windows).\n", "data_path = os.path.join(folder_path, 'data_' + str(max_file_name) + '.csv')\n", "df = pd.read_csv(data_path)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "d:\\external\\Experiments\\image_designing\\env\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. 
For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n", " warnings.warn(\n", "Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "from sklearn.model_selection import train_test_split\n", "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n", "import torch\n", "import pandas as pd\n", "\n", "# Expects df (loaded in an earlier cell) to have 'prompt' and 'label' columns.\n", "# The untouched labels are kept in 'label_raw' so that re-running this cell\n", "# rebuilds the same label-to-id mapping instead of an identity mapping over\n", "# the already-encoded integers.\n", "if 'label_raw' not in df.columns:\n", "    df['label_raw'] = df['label']\n", "\n", "# Ids are assigned in order of first appearance of each label.\n", "label_mapping = {label: idx for idx, label in enumerate(df['label_raw'].unique())}\n", "df['label'] = df['label_raw'].map(label_mapping)\n", "\n", "# Hold out 10% of the rows for evaluation; random_state makes the split repeatable.\n", "X_train, X_test, y_train, y_test = train_test_split(\n", "    df['prompt'], df['label'], test_size=0.1, random_state=42\n", ")\n", "\n", "# Despite its name, this is the checkpoint name passed to from_pretrained,\n", "# not a directory this cell writes to.\n", "local_model_dir = \"allenai/longformer-base-4096\"\n", "\n", "# Load the tokenizer and a sequence-classification model with one output per label.\n", "tokenizer = AutoTokenizer.from_pretrained(local_model_dir, add_prefix_space=True)\n", "num_labels = len(df['label'].unique())\n", "model = AutoModelForSequenceClassification.from_pretrained(local_model_dir, num_labels=num_labels)\n", "\n", "# Tokenize each split: inputs longer than 128 tokens are truncated and every\n", "# split is padded to its own longest sequence.\n", "train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=128)\n", "test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=128)\n", "\n", "\n", "class CustomDataset(torch.utils.data.Dataset):\n", "    \"\"\"Dataset that pairs tokenizer encodings with their label ids.\n", "\n", "    Args:\n", "        encodings: mapping of field name to per-example sequences, as\n", "            returned by the tokenizer call above.\n", "        labels: sequence of label ids, indexed like the encodings.\n", "    \"\"\"\n", "\n", "    def __init__(self, encodings, labels):\n", "        self.encodings = encodings\n", "        self.labels = labels\n", "\n", "    def __getitem__(self, idx):\n", "        \"\"\"Return example idx as a dict of tensors, label under 'labels'.\"\"\"\n", "        # One tensor per encoding field for the requested example\n", "        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n", "        item['labels'] = torch.tensor(self.labels[idx])\n", "        return item\n", "\n", "    def __len__(self):\n", "        \"\"\"Number of examples, taken from the label sequence.\"\"\"\n", "        return len(self.labels)\n", "\n", "\n", "# Wrap both splits; labels are converted to plain lists so they index by position.\n", "train_dataset = CustomDataset(train_encodings, y_train.tolist())\n", "test_dataset = CustomDataset(test_encodings, y_test.tolist())\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "d:\\external\\Experiments\\image_designing\\env\\Lib\\site-packages\\transformers\\training_args.py:1525: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n", " warnings.warn(\n", " 0%| | 0/3003 [00:00