ki1207
/

wildlifemultimodal

Model card Files Files and versions Community

File size: 28,452 Bytes

d8e4d0b

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "b31a5ac5-a8b7-4ef0-a5d8-41d59e5c37df",
   "metadata": {},
   "outputs": [],
   "source": [
    "from minio import Minio\n",
    "import duckdb\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "5684f535-7bbc-435d-afb2-504cbc647688",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<duckdb.duckdb.DuckDBPyConnection at 0x7fb621719a30>"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# add the access key cell here\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "f8094e79-0538-4771-af47-9a11d1c58d1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "bucket_name = \"ads-with-images\"\n",
    "object_name = \"aug_ads_w_images.parquet\"\n",
    "\n",
    "df = connection.execute(f\"SELECT * FROM parquet_scan('s3://{bucket_name}/{object_name}');\").df()\n",
    "df = df.iloc[:1000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "41567a9b-3b33-4719-b044-7b7264b943f6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>url</th>\n",
       "      <th>title</th>\n",
       "      <th>text</th>\n",
       "      <th>domain</th>\n",
       "      <th>name</th>\n",
       "      <th>description</th>\n",
       "      <th>image</th>\n",
       "      <th>retrieved</th>\n",
       "      <th>production_data</th>\n",
       "      <th>category</th>\n",
       "      <th>...</th>\n",
       "      <th>loc_name</th>\n",
       "      <th>lat</th>\n",
       "      <th>lon</th>\n",
       "      <th>country</th>\n",
       "      <th>label</th>\n",
       "      <th>score</th>\n",
       "      <th>product</th>\n",
       "      <th>label_product</th>\n",
       "      <th>score_product</th>\n",
       "      <th>path</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https://www.ebay.ie/itm/225712575933?amdata=en...</td>\n",
       "      <td>Mofusand Exhibition Mofusando Piece Set Towel ...</td>\n",
       "      <td>\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nMofusa...</td>\n",
       "      <td>ebay.ie</td>\n",
       "      <td>Mofusand Exhibition Mofusando Piece Set Towel ...</td>\n",
       "      <td>Mofusand Exhibition Mofusando Piece Set Towel ...</td>\n",
       "      <td>https://i.ebayimg.com/images/g/CUQAAOSwatNkz~R...</td>\n",
       "      <td>2023-02-10T17:05:03.245+0000</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.990042</td>\n",
       "      <td>Mofusand Exhibition Mofusando Piece Set Towel ...</td>\n",
       "      <td>an object</td>\n",
       "      <td>0.454573</td>\n",
       "      <td>images-august/None/e8bf594b-d5bb-43b5-a1b1-18f...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>https://www.ebay.nl/itm/165510736761?amdata=en...</td>\n",
       "      <td>ATLANTIC RIGHT WHALE =Endangered= IDENTICAL St...</td>\n",
       "      <td>\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nATLANT...</td>\n",
       "      <td>ebay.nl</td>\n",
       "      <td>ATLANTIC RIGHT WHALE =Endangered= IDENTICAL St...</td>\n",
       "      <td>ATLANTIC RIGHT WHALE =Endangered= IDENTICAL St...</td>\n",
       "      <td>https://i.ebayimg.com/images/g/m8cAAOSwfbNillr...</td>\n",
       "      <td>2023-02-10T17:05:03.245+0000</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.977253</td>\n",
       "      <td>ATLANTIC RIGHT WHALE =Endangered= IDENTICAL St...</td>\n",
       "      <td>an object</td>\n",
       "      <td>0.728779</td>\n",
       "      <td>images-august/None/286b368a-8adf-46ce-9331-e29...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>https://www.ebay.nl/itm/175641098586?amdata=en...</td>\n",
       "      <td>Mountain Anoa Dwarf Water Buffalo DecoBronze S...</td>\n",
       "      <td>\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nMounta...</td>\n",
       "      <td>ebay.nl</td>\n",
       "      <td>Mountain Anoa Dwarf Water Buffalo DecoBronze S...</td>\n",
       "      <td>Mountain Anoa Dwarf Water Buffalo DecoBronze S...</td>\n",
       "      <td>https://i.ebayimg.com/images/g/VAMAAOSwxNlkBe4...</td>\n",
       "      <td>2023-02-10T17:05:03.245+0000</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.819862</td>\n",
       "      <td>Mountain Anoa Dwarf Water Buffalo DecoBronze S...</td>\n",
       "      <td>a print of an animal</td>\n",
       "      <td>0.504843</td>\n",
       "      <td>images-august/None/07c9d7ab-0264-44e5-8c1d-934...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>https://www.ebay.com.hk/itm/256119308608?amdat...</td>\n",
       "      <td>Wildlife Research Center 1255 Scent Killer Gol...</td>\n",
       "      <td>\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nWildli...</td>\n",
       "      <td>ebay.com.hk</td>\n",
       "      <td>Wildlife Research Center 1255 Scent Killer Gol...</td>\n",
       "      <td>Wildlife Research Center 1255 Scent Killer Gol...</td>\n",
       "      <td>https://i.ebayimg.com/images/g/w3IAAOSwUp9jOv-...</td>\n",
       "      <td>2023-02-10T17:05:03.245+0000</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.815647</td>\n",
       "      <td>Wildlife Research Center 1255 Scent Killer Gol...</td>\n",
       "      <td>an object</td>\n",
       "      <td>0.738707</td>\n",
       "      <td>images-august/None/871b28ed-2e92-4c3d-8e1d-2b9...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>https://www.ebay.at/itm/134672898114?amdata=en...</td>\n",
       "      <td>4 \"Sammeln Sie rote Kupferschnitzerei Manis pe...</td>\n",
       "      <td>\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n4 \"Sam...</td>\n",
       "      <td>ebay.at</td>\n",
       "      <td>4 \"Sammeln Sie rote Kupferschnitzerei Manis pe...</td>\n",
       "      <td>4 \"Sammeln Sie rote Kupferschnitzerei Manis pe...</td>\n",
       "      <td>https://i.ebayimg.com/images/g/c3YAAOSwM9NiNFP...</td>\n",
       "      <td>2023-02-10T17:05:03.245+0000</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>None</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.734359</td>\n",
       "      <td>4 \"Sammeln Sie rote Kupferschnitzerei Manis pe...</td>\n",
       "      <td>a print of an animal</td>\n",
       "      <td>0.538278</td>\n",
       "      <td>images-august/None/7bfee649-2c7c-46b9-b5b4-37e...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 28 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 url  \\\n",
       "0  https://www.ebay.ie/itm/225712575933?amdata=en...   \n",
       "1  https://www.ebay.nl/itm/165510736761?amdata=en...   \n",
       "2  https://www.ebay.nl/itm/175641098586?amdata=en...   \n",
       "3  https://www.ebay.com.hk/itm/256119308608?amdat...   \n",
       "4  https://www.ebay.at/itm/134672898114?amdata=en...   \n",
       "\n",
       "                                               title  \\\n",
       "0  Mofusand Exhibition Mofusando Piece Set Towel ...   \n",
       "1  ATLANTIC RIGHT WHALE =Endangered= IDENTICAL St...   \n",
       "2  Mountain Anoa Dwarf Water Buffalo DecoBronze S...   \n",
       "3  Wildlife Research Center 1255 Scent Killer Gol...   \n",
       "4  4 \"Sammeln Sie rote Kupferschnitzerei Manis pe...   \n",
       "\n",
       "                                                text       domain  \\\n",
       "0  \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nMofusa...      ebay.ie   \n",
       "1  \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nATLANT...      ebay.nl   \n",
       "2  \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nMounta...      ebay.nl   \n",
       "3  \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nWildli...  ebay.com.hk   \n",
       "4  \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n4 \"Sam...      ebay.at   \n",
       "\n",
       "                                                name  \\\n",
       "0  Mofusand Exhibition Mofusando Piece Set Towel ...   \n",
       "1  ATLANTIC RIGHT WHALE =Endangered= IDENTICAL St...   \n",
       "2  Mountain Anoa Dwarf Water Buffalo DecoBronze S...   \n",
       "3  Wildlife Research Center 1255 Scent Killer Gol...   \n",
       "4  4 \"Sammeln Sie rote Kupferschnitzerei Manis pe...   \n",
       "\n",
       "                                         description  \\\n",
       "0  Mofusand Exhibition Mofusando Piece Set Towel ...   \n",
       "1  ATLANTIC RIGHT WHALE =Endangered= IDENTICAL St...   \n",
       "2  Mountain Anoa Dwarf Water Buffalo DecoBronze S...   \n",
       "3  Wildlife Research Center 1255 Scent Killer Gol...   \n",
       "4  4 \"Sammeln Sie rote Kupferschnitzerei Manis pe...   \n",
       "\n",
       "                                               image  \\\n",
       "0  https://i.ebayimg.com/images/g/CUQAAOSwatNkz~R...   \n",
       "1  https://i.ebayimg.com/images/g/m8cAAOSwfbNillr...   \n",
       "2  https://i.ebayimg.com/images/g/VAMAAOSwxNlkBe4...   \n",
       "3  https://i.ebayimg.com/images/g/w3IAAOSwUp9jOv-...   \n",
       "4  https://i.ebayimg.com/images/g/c3YAAOSwM9NiNFP...   \n",
       "\n",
       "                      retrieved production_data category  ...  loc_name lat  \\\n",
       "0  2023-02-10T17:05:03.245+0000            None     None  ...      None NaN   \n",
       "1  2023-02-10T17:05:03.245+0000            None     None  ...      None NaN   \n",
       "2  2023-02-10T17:05:03.245+0000            None     None  ...      None NaN   \n",
       "3  2023-02-10T17:05:03.245+0000            None     None  ...      None NaN   \n",
       "4  2023-02-10T17:05:03.245+0000            None     None  ...      None NaN   \n",
       "\n",
       "  lon country label     score  \\\n",
       "0 NaN    None   0.0  0.990042   \n",
       "1 NaN    None   1.0  0.977253   \n",
       "2 NaN    None   0.0  0.819862   \n",
       "3 NaN    None   1.0  0.815647   \n",
       "4 NaN    None   1.0  0.734359   \n",
       "\n",
       "                                             product         label_product  \\\n",
       "0  Mofusand Exhibition Mofusando Piece Set Towel ...             an object   \n",
       "1  ATLANTIC RIGHT WHALE =Endangered= IDENTICAL St...             an object   \n",
       "2  Mountain Anoa Dwarf Water Buffalo DecoBronze S...  a print of an animal   \n",
       "3  Wildlife Research Center 1255 Scent Killer Gol...             an object   \n",
       "4  4 \"Sammeln Sie rote Kupferschnitzerei Manis pe...  a print of an animal   \n",
       "\n",
       "  score_product                                               path  \n",
       "0      0.454573  images-august/None/e8bf594b-d5bb-43b5-a1b1-18f...  \n",
       "1      0.728779  images-august/None/286b368a-8adf-46ce-9331-e29...  \n",
       "2      0.504843  images-august/None/07c9d7ab-0264-44e5-8c1d-934...  \n",
       "3      0.738707  images-august/None/871b28ed-2e92-4c3d-8e1d-2b9...  \n",
       "4      0.538278  images-august/None/7bfee649-2c7c-46b9-b5b4-37e...  \n",
       "\n",
       "[5 rows x 28 columns]"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "7f1857c7-e6ad-4de6-8f27-be8e1bbc4bfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "from torch.utils.data import DataLoader, Dataset, random_split\n",
    "from transformers import BertTokenizer, BertForSequenceClassification, AdamW\n",
    "from torchvision.transforms import transforms\n",
    "import torch.nn as nn\n",
    "import torch\n",
    "from transformers import BertModel, RobertaModel\n",
    "from torchvision.models import resnet50\n",
    "from torch import cuda\n",
    "from PIL import Image\n",
    "import time\n",
    "from transformers import RobertaForSequenceClassification, RobertaTokenizer\n",
    "from torchvision import models\n",
    "import torch.nn.functional as F"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "614ed9c7-890c-4d7c-8510-919edceb2e04",
   "metadata": {},
   "outputs": [],
   "source": [
    "from io import BytesIO"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "d520c8f6-4e2b-40a7-bbf0-c788ada1c978",
   "metadata": {},
   "outputs": [],
   "source": [
    "# We will first prepare the dataset for inference\n",
    "class InferenceDataset(Dataset):\n",
    "    def __init__(self, dataframe, tokenizer, max_len, transform=None):\n",
    "        self.dataframe = dataframe\n",
    "        self.tokenizer = tokenizer\n",
    "        self.max_len = max_len\n",
    "        self.transform = transform\n",
    "    \n",
    "    def __len__(self):\n",
    "        return len(self.dataframe)\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        row = self.dataframe.iloc[idx]\n",
    "        text = row['combined_text']\n",
    "        img_path = row['path']\n",
    "        \n",
    "        # Processing text\n",
    "        encoding = self.tokenizer.encode_plus(\n",
    "            text,\n",
    "            add_special_tokens=True,\n",
    "            max_length=self.max_len,\n",
    "            return_token_type_ids=False,\n",
    "            padding='max_length',\n",
    "            return_attention_mask=True,\n",
    "            return_tensors='pt',\n",
    "            truncation=True\n",
    "        )\n",
    "        \n",
    "        bucket_name, _, object_name = img_path.partition('/')\n",
    "        image_data = client.get_object(bucket_name, object_name)\n",
    "        image_bytes = image_data.read()\n",
    "        image = Image.open(BytesIO(image_bytes)).convert(\"RGB\")\n",
    "        \n",
    "        # Processing images\n",
    "        # image = Image.open(img_path).convert(\"RGB\")\n",
    "        if self.transform:\n",
    "            image = self.transform(image)\n",
    "        \n",
    "        return {\n",
    "            'input_ids': encoding['input_ids'].flatten(),\n",
    "            'attention_mask': encoding['attention_mask'].flatten(),\n",
    "            'image': image\n",
    "        }\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "ba1acf51-dfeb-41dc-a925-2e7f7505e989",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Constants\n",
    "MAX_LEN = 128\n",
    "BATCH_SIZE = 16\n",
    "\n",
    "tokenizer = RobertaTokenizer.from_pretrained('roberta-base')\n",
    "\n",
    "transform = transforms.Compose([\n",
    "    transforms.RandomHorizontalFlip(),\n",
    "    transforms.RandomRotation(10),\n",
    "    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),\n",
    "    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),\n",
    "    transforms.ToTensor(),\n",
    "])\n",
    "\n",
    "\n",
    "# Combine text fields\n",
    "df['description'] = df['description'].fillna('')\n",
    "df['combined_text'] = df['title'] + ' ' + df['text'] + ' ' + df['description']\n",
    "\n",
    "# Prepare dataset and dataloader for inference\n",
    "inference_dataset = InferenceDataset(\n",
    "    dataframe=df,\n",
    "    tokenizer=tokenizer,  \n",
    "    max_len=MAX_LEN,\n",
    "    transform=transform\n",
    ")\n",
    "\n",
    "inference_dataloader = DataLoader(inference_dataset, batch_size=BATCH_SIZE, shuffle=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "a779c8bf-fd40-4558-9e15-41599e96ed59",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "/home/jovyan/.local/lib/python3.9/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead.\n",
      "  warnings.warn(\n",
      "/home/jovyan/.local/lib/python3.9/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=Swin_V2_B_Weights.IMAGENET1K_V1`. You can also use `weights=Swin_V2_B_Weights.DEFAULT` to get the most up-to-date weights.\n",
      "  warnings.warn(msg)\n"
     ]
    }
   ],
   "source": [
    "text = RobertaModel.from_pretrained('roberta-base')\n",
    "\n",
    "img = models.swin_v2_b(pretrained=True)\n",
    "\n",
    "class MultiHeadCrossAttention(nn.Module):\n",
    "    def __init__(self, text_dim, image_dim, num_heads, hidden_dim, output_dim):\n",
    "        super(MultiHeadCrossAttention, self).__init__()\n",
    "        self.num_heads = num_heads\n",
    "        self.head_dim = hidden_dim // num_heads\n",
    "\n",
    "        # These linear layers project the inputs to multiple heads\n",
    "        self.text_query = nn.Linear(text_dim, hidden_dim, bias=False)\n",
    "        self.text_key = nn.Linear(text_dim, hidden_dim, bias=False)\n",
    "        self.text_value = nn.Linear(text_dim, hidden_dim, bias=False)\n",
    "\n",
    "        self.image_query = nn.Linear(image_dim, hidden_dim, bias=False)\n",
    "        self.image_key = nn.Linear(image_dim, hidden_dim, bias=False)\n",
    "        self.image_value = nn.Linear(image_dim, hidden_dim, bias=False)\n",
    "\n",
    "        # Final projection layer\n",
    "        self.out_proj = nn.Linear(hidden_dim, output_dim, bias=False)\n",
    "\n",
    "    def forward(self, text_features, image_features):\n",
    "        Q_text = self.text_query(text_features)\n",
    "        K_text = self.text_key(text_features)\n",
    "        V_text = self.text_value(text_features)\n",
    "\n",
    "        Q_image = self.image_query(image_features)\n",
    "        K_image = self.image_key(image_features)\n",
    "        V_image = self.image_value(image_features)\n",
    "\n",
    "        # Split the hidden dimension into num_heads\n",
    "        Q_text = Q_text.view(Q_text.size(0), -1, self.num_heads, self.head_dim).transpose(1, 2)\n",
    "        K_text = K_text.view(K_text.size(0), -1, self.num_heads, self.head_dim).transpose(1, 2)\n",
    "        V_text = V_text.view(V_text.size(0), -1, self.num_heads, self.head_dim).transpose(1, 2)\n",
    "\n",
    "        Q_image = Q_image.view(Q_image.size(0), -1, self.num_heads, self.head_dim).transpose(1, 2)\n",
    "        K_image = K_image.view(K_image.size(0), -1, self.num_heads, self.head_dim).transpose(1, 2)\n",
    "        V_image = V_image.view(V_image.size(0), -1, self.num_heads, self.head_dim).transpose(1, 2)\n",
    "\n",
    "        # Calculate the attention scores\n",
    "        attn_scores_text_image = torch.matmul(Q_text, K_image.transpose(-1, -2)) / (self.head_dim ** 0.5)\n",
    "        attn_scores_image_text = torch.matmul(Q_image, K_text.transpose(-1, -2)) / (self.head_dim ** 0.5)\n",
    "\n",
    "        # Normalize scores\n",
    "        attn_probs_text_image = F.softmax(attn_scores_text_image, dim=-1)\n",
    "        attn_probs_image_text = F.softmax(attn_scores_image_text, dim=-1)\n",
    "\n",
    "        # Apply attention\n",
    "        attn_output_text_image = torch.matmul(attn_probs_text_image, V_image)\n",
    "        attn_output_image_text = torch.matmul(attn_probs_image_text, V_text)\n",
    "\n",
    "        # Concatenate the results across the heads\n",
    "        attn_output_text_image = attn_output_text_image.transpose(1, 2).contiguous().view(text_features.size(0), -1)\n",
    "        attn_output_image_text = attn_output_image_text.transpose(1, 2).contiguous().view(image_features.size(0), -1)\n",
    "\n",
    "        # Project to output dimension\n",
    "        output_text_image = self.out_proj(attn_output_text_image)\n",
    "        output_image_text = self.out_proj(attn_output_image_text)\n",
    "\n",
    "        return output_text_image, output_image_text\n",
    "\n",
    "\n",
    "class MultiModalModel(nn.Module):\n",
    "            def __init__(self, num_labels):\n",
    "                super(MultiModalModel, self).__init__()\n",
    "\n",
    "                # Load pre-trained models\n",
    "                self.bert = text\n",
    "                self.resnet = img\n",
    "\n",
    "                # Remove the final classification layer of ResNet\n",
    "                self.resnet = nn.Sequential(*list(self.resnet.children())[:-1])\n",
    "                self.mhca = MultiHeadCrossAttention(text_dim=768, image_dim=1024, num_heads=4, hidden_dim=512, output_dim=2048)\n",
    "                \n",
    "                \n",
    "                self.classifier = nn.Sequential(\n",
    "                    nn.Linear(2816, 512), \n",
    "                    nn.ReLU(),\n",
    "                    nn.Dropout(0.2),\n",
    "                    nn.Linear(512, num_labels)\n",
    "                )\n",
    "\n",
    "            def forward(self, input_ids, attention_mask, image):\n",
    "                # Forward pass through BERT\n",
    "                outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)\n",
    "                text_features = outputs['last_hidden_state'][:, 0, :]  # CLS token output as text feature\n",
    "                \n",
    "                # Forward pass through ResNet\n",
    "                image_features = self.resnet(image)\n",
    "                image_features = image_features.view(image_features.size(0), -1)  # Flatten the output\n",
    "                \n",
    "                if text_features.dim() == 2:\n",
    "                    text_features = text_features.unsqueeze(1)\n",
    "                if image_features.dim() == 2:\n",
    "                    image_features = image_features.unsqueeze(1)\n",
    "                \n",
    "                attended_text, attended_image = self.mhca(text_features, image_features)\n",
    "                \n",
    "                attended_text = attended_text.squeeze(1)  # shape: [16, 768]\n",
    "                attended_image = attended_image.squeeze(1) # shape: [16, 2048]\n",
    "\n",
    "                self.image_projection = torch.nn.Linear(2048, 768).to(device)\n",
    "                attended_image = self.image_projection(attended_image)\n",
    "                combined_features = torch.cat((attended_text, attended_image), dim=-1)\n",
    "\n",
    "                logits = self.classifier(combined_features)\n",
    "\n",
    "                return logits\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "568e1b32-8505-4c2e-9b44-89563ebb7724",
   "metadata": {},
   "outputs": [],
   "source": [
    "        import urllib3, socket\n",
    "        from urllib3.connection import HTTPConnection\n",
    "    \n",
    "        HTTPConnection.default_socket_options = ( \n",
    "            HTTPConnection.default_socket_options + [\n",
    "            (socket.SOL_SOCKET, socket.SO_SNDBUF, 1000000), #1MB in byte\n",
    "            (socket.SOL_SOCKET, socket.SO_RCVBUF, 1000000)\n",
    "        ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "09306857-c737-4380-a377-a6f71b0ef0ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to run inference\n",
    "def run_inference(model, dataloader, device):\n",
    "    model.eval()\n",
    "    predictions = []\n",
    "\n",
    "    with torch.no_grad():\n",
    "        for batch in dataloader:\n",
    "            input_ids = batch['input_ids'].to(device)\n",
    "            attention_mask = batch['attention_mask'].to(device)\n",
    "            images = batch['image'].to(device)\n",
    "            \n",
    "            outputs = model(input_ids=input_ids, attention_mask=attention_mask, image=images)\n",
    "            \n",
    "            preds = torch.argmax(outputs, dim=1)\n",
    "            predictions.extend(preds.cpu().numpy())\n",
    "    \n",
    "    return predictions\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e67750b4-b917-4d26-8cfe-19913dc21496",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize an empty model\n",
    "loaded_model = MultiModalModel(num_labels=2)\n",
    "\n",
    "# Load the state dictionary\n",
    "model_load_path = \"./model.pth\"\n",
    "\n",
    "# Check device\n",
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "\n",
    "# Load the model weights\n",
    "if device == torch.device('cpu'):\n",
    "    loaded_model.load_state_dict(torch.load(model_load_path, map_location=device), strict=False)\n",
    "else:\n",
    "    loaded_model.load_state_dict(torch.load(model_load_path), strict=False)\n",
    "\n",
    "# Move model to evaluation mode and to the device\n",
    "loaded_model.eval()\n",
    "loaded_model = loaded_model.to(device)\n",
    "\n",
    "start_time = time.time()\n",
    "\n",
    "# Get the predictions\n",
    "predictions = run_inference(loaded_model, inference_dataloader, device)\n",
    "\n",
    "end_time = time.time()\n",
    "\n",
    "elapsed_time = end_time - start_time\n",
    "print(f\"Inference took {elapsed_time:.2f} seconds\")\n",
    "\n",
    "\n",
    "# Add predictions to the dataframe\n",
    "df['predicted_label'] = predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "296a17d9-f8bd-4e15-bb65-f0a2900a529d",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}