Commit 8a00d0d
Parent(s): initial commit
Files changed:
- .gitignore +1 -0
- __pycache__/text_encoder.cpython-311.pyc +0 -0
- __pycache__/train.cpython-311.pyc +0 -0
- __pycache__/vision_encoder.cpython-311.pyc +0 -0
- _dataset/__pycache__/preprocess_images.cpython-311.pyc +0 -0
- _dataset/preprocess_captions.ipynb +188 -0
- _dataset/preprocess_images.py +79 -0
- demo.ipynb +240 -0
- text_encoder.py +27 -0
- train.py +204 -0
- vision_encoder.py +56 -0
.gitignore
ADDED
@@ -0,0 +1 @@
checkpoints
__pycache__/text_encoder.cpython-311.pyc
ADDED
Binary file (1.82 kB)
__pycache__/train.cpython-311.pyc
ADDED
Binary file (11.5 kB)
__pycache__/vision_encoder.cpython-311.pyc
ADDED
Binary file (3.05 kB)
_dataset/__pycache__/preprocess_images.cpython-311.pyc
ADDED
Binary file (5.8 kB)
_dataset/preprocess_captions.ipynb
ADDED
@@ -0,0 +1,188 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "from transformers import AutoTokenizer\n",
    "from tqdm import tqdm\n",
    "import json\n",
    "\n",
    "def load_and_process_token_file(input_path, tokenizer_name=\"answerdotai/ModernBERT-base\"):\n",
    "    captions_dict = defaultdict(list)\n",
    "    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)\n",
    "    max_length = 0  # Track the longest unpadded sequence\n",
    "\n",
    "    # Read and tokenize the caption (.token) file\n",
    "    with open(input_path, 'r') as file:\n",
    "        for line in tqdm(file, desc=\"Processing Captions\"):\n",
    "            image_id, caption = line.strip().split('\\t')\n",
    "            jpg_number = image_id.split('.')[0]\n",
    "\n",
    "            # Tokenize without padding or truncation to measure the true length\n",
    "            tokens = tokenizer(caption, return_tensors=\"pt\", padding=False, truncation=False)\n",
    "            token_ids = tokens['input_ids'].squeeze(0).tolist()\n",
    "\n",
    "            # Update max_length based on this tokenized sequence length\n",
    "            max_length = max(max_length, len(token_ids))\n",
    "\n",
    "            # Tokenize with padding and attention mask (padded to max_length=128)\n",
    "            tokens_padded = tokenizer(caption, return_tensors=\"pt\", padding=\"max_length\", truncation=True, max_length=2**7)  # longest caption is 93 tokens < 2**7\n",
    "            token_ids_padded = tokens_padded['input_ids'].squeeze(0).tolist()\n",
    "            attention_mask = tokens_padded['attention_mask'].squeeze(0).tolist()\n",
    "\n",
    "            # Save the raw caption, the tokenized version, and the attention mask\n",
    "            captions_dict[jpg_number].append({\n",
    "                \"text\": caption,\n",
    "                \"tokenized\": token_ids_padded,\n",
    "                \"attention_mask\": attention_mask\n",
    "            })\n",
    "\n",
    "    print(f\"Maximum sequence length (before padding): {max_length}\")\n",
    "    return captions_dict, max_length\n",
    "\n",
    "# Define the input path and process the file\n",
    "input_path = '/mnt/nvme/shared_A/datasets/flickr30k/data/results_20130124.token'\n",
    "captions_dict, max_length = load_and_process_token_file(input_path)\n",
    "\n",
    "# Save the dictionary with tokenized captions and attention masks to a JSON file\n",
    "output_path = '/mnt/nvme/shared_A/datasets/flickr30k/data/captions_tokenized.json'\n",
    "with open(output_path, 'w') as json_file:\n",
    "    json.dump(captions_dict, json_file)\n",
    "\n",
    "# Display the maximum token length\n",
    "print(f\"Final maximum token length across dataset: {max_length}\")\n",
    "\n",
    "# Display the first few entries to verify the content\n",
    "for jpg, captions in list(captions_dict.items())[:5]:\n",
    "    print(f\"{jpg}: {captions}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the dictionary to a JSON file\n",
    "output_path = '/mnt/nvme/shared_A/datasets/flickr30k/data/captions_dict.json'\n",
    "with open(output_path, 'w') as json_file:\n",
    "    json.dump(captions_dict, json_file)\n",
    "\n",
    "print(f\"Captions dictionary saved to {output_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "import os\n",
    "import json\n",
    "import numpy as np\n",
    "import random\n",
    "\n",
    "\n",
    "# Vision Caption Dataset\n",
    "class VisionCaptionDataset(torch.utils.data.Dataset):\n",
    "    def __init__(self, captions_path, embeddings_dir, normalize=True):\n",
    "        with open(captions_path, 'r') as f:\n",
    "            self.captions_dict = json.load(f)\n",
    "\n",
    "        self.embeddings_dir = embeddings_dir\n",
    "        self.image_ids = list(self.captions_dict.keys())\n",
    "        self.normalize = normalize\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.image_ids)\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        image_id = self.image_ids[idx]\n",
    "\n",
    "        # Randomly select a caption and load the tokenized version\n",
    "        caption_entry = random.choice(self.captions_dict[image_id])\n",
    "        tokenized_caption = caption_entry[\"tokenized\"]\n",
    "        attention_mask = caption_entry[\"attention_mask\"]\n",
    "\n",
    "        # Load vision embedding\n",
    "        embedding_path = os.path.join(self.embeddings_dir, f\"{image_id}.npy\")\n",
    "        embedding = np.load(embedding_path)\n",
    "\n",
    "        # Convert vision embedding and tokenized caption to tensors\n",
    "        embedding = torch.tensor(embedding, dtype=torch.float32)\n",
    "        tokenized_caption = torch.tensor(tokenized_caption, dtype=torch.long)\n",
    "        attention_mask = torch.tensor(attention_mask, dtype=torch.long)\n",
    "\n",
    "        return embedding, tokenized_caption, attention_mask\n",
    "\n",
    "# Example usage\n",
    "# Paths for dataset\n",
    "captions_path = '/mnt/nvme/shared_A/datasets/flickr30k/data/captions_tokenized.json'\n",
    "embeddings_dir = '/mnt/nvme/shared_A/datasets/flickr30k/data/reduced_vision_embeddings'\n",
    "\n",
    "# Initialize the dataset\n",
    "full_dataset = VisionCaptionDataset(captions_path, embeddings_dir)\n",
    "\n",
    "# Initialize the DataLoader with `num_workers` and `pin_memory`\n",
    "train_dataloader = DataLoader(full_dataset, batch_size=16, shuffle=True, num_workers=8, pin_memory=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verify a batch\n",
    "for batch in train_dataloader:\n",
    "    embeddings, captions, attn_mask = batch\n",
    "    print(embeddings.shape, len(captions))\n",
    "\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "hf-env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
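For reference, a quick way to inspect the captions_tokenized.json file written by the first cell (a sketch, not part of the commit; the path and field names follow the notebook above, and the sample entry is simply whichever key comes first):

import json

captions_path = '/mnt/nvme/shared_A/datasets/flickr30k/data/captions_tokenized.json'
with open(captions_path) as f:
    captions = json.load(f)

# Keys are Flickr30k image ids (the .jpg filename without extension); each value is a list of
# records holding the raw caption, the padded token ids, and the matching attention mask.
image_id, records = next(iter(captions.items()))
first = records[0]
print(image_id, len(records))
print(first["text"])
print(len(first["tokenized"]), sum(first["attention_mask"]))  # padded length (128) vs. real token count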
_dataset/preprocess_images.py
ADDED
@@ -0,0 +1,79 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os
import shutil
from PIL import Image
from transformers.image_utils import load_image
import sys
sys.path.append('..')
from vision_encoder import ideficsV3
from tqdm import tqdm

class VisionPreprocessor:
    def __init__(self, device=None, param_dtype=torch.float32):
        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        self.param_dtype = param_dtype

        # Initialize and freeze the vision encoder
        self.vision_encoder = ideficsV3("HuggingFaceTB/SmolVLM-Instruct").eval().to(self.device)
        for param in self.vision_encoder.parameters():
            param.requires_grad = False

    def load_image(self, image_path):
        """Load an image and convert it to pixel values with the SmolVLM image processor."""
        image = load_image(image_path)
        # No resizing or augmentation beyond what the image processor itself applies
        inputs = self.vision_encoder.image_processor(images=[image], return_tensors="pt")
        pixel_values = inputs.pixel_values.to(self.param_dtype).to(self.device)
        return pixel_values

    def extract_embedding(self, image_tensor):
        """Extract the raw vision embedding."""
        with torch.no_grad():
            vision_output = self.vision_encoder(image_tensor)

        # Average over the image patches -> one [729, 1152] embedding per image
        vision_output = vision_output.mean(axis=0)

        return vision_output

    def save_embedding(self, vision_output, file_path):
        """Save the vision output to a numpy file."""
        np.save(file_path, vision_output.cpu().numpy())

    def process_directory(self, image_paths, output_dir):
        """Process all images in a directory with a progress bar and save the embeddings."""
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)
            print(f"Existing directory cleared: {output_dir}")
        os.makedirs(output_dir, exist_ok=True)

        for image_path in tqdm(image_paths, desc="Processing Images", unit="image"):
            # Load the image and extract its features
            image_tensor = self.load_image(image_path)
            vision_output = self.extract_embedding(image_tensor)

            # Save the output with the same filename but as a .npy
            output_file_path = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(image_path))[0]}.npy")
            self.save_embedding(vision_output, output_file_path)


if __name__ == "__main__":
    torch.manual_seed(42)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    param_dtype = torch.float32

    # Instantiate the pipeline
    pipeline = VisionPreprocessor(device, param_dtype)

    # Specify input and output directories
    input_directory = "/mnt/nvme/shared_A/datasets/flickr30k/data/flickr30k-images"
    output_directory = "/mnt/nvme/shared_A/datasets/flickr30k/data/vision_embeddings_reduced2"

    image_paths = [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

    # Process all images in the input directory
    pipeline.process_directory(image_paths, output_directory)
    print("Processing complete!")
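A small sanity check of what process_directory() writes (a sketch, not part of the commit; the directory matches the script above, and the expected [729, 1152] shape assumes the SmolVLM vision tower in vision_encoder.py):

import os
import numpy as np

emb_dir = "/mnt/nvme/shared_A/datasets/flickr30k/data/vision_embeddings_reduced2"
first_file = sorted(f for f in os.listdir(emb_dir) if f.endswith(".npy"))[0]

# extract_embedding() averages over the image patches, so one [729, 1152] array is saved per image.
emb = np.load(os.path.join(emb_dir, first_file))
print(first_file, emb.shape, emb.dtype)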
demo.ipynb
ADDED
@@ -0,0 +1,240 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Image search with modernBERT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "from _dataset.preprocess_images import *\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "pipeline = VisionPreprocessor(device, param_dtype=torch.float32)\n",
    "\n",
    "num_images = 25\n",
    "input_directory = \"/mnt/nvme/shared_A/datasets/coco-image-caption/versions/1/val2017/val2017\"\n",
    "image_paths = [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]\n",
    "\n",
    "# Optionally shuffle, then take the first num_images images\n",
    "# random.shuffle(image_paths)\n",
    "image_paths = image_paths[:num_images]\n",
    "\n",
    "# Print the selected image paths\n",
    "print(\"Selected Image Paths:\")\n",
    "for path in image_paths:\n",
    "    print(path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "\n",
    "# Specify the output directory\n",
    "output_directory = \"/mnt/nvme/shared_A/datasets/coco-image-caption/versions/1/val2017/vision_embeddings\"\n",
    "\n",
    "# Clear the vision embeddings directory if it exists, then (re)create it\n",
    "if os.path.exists(output_directory):\n",
    "    shutil.rmtree(output_directory)\n",
    "    print(f\"Existing directory cleared: {output_directory}\")\n",
    "os.makedirs(output_directory, exist_ok=True)\n",
    "\n",
    "# Process all images in the input directory\n",
    "pipeline.process_directory(image_paths, output_directory)\n",
    "print(\"Image embeddings saved!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from train import JointNetwork\n",
    "\n",
    "def load_checkpoint_and_prepare_model(checkpoint_path, device=\"cuda\"):\n",
    "    \"\"\"Load a trained JointNetwork from a checkpoint.\"\"\"\n",
    "    device = torch.device(device)\n",
    "    model = JointNetwork()\n",
    "    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)\n",
    "    model.load_state_dict(checkpoint['model_state_dict'])\n",
    "    model.to(device)\n",
    "    model.eval()\n",
    "    model.device = device\n",
    "    print(f\"Model loaded successfully from {checkpoint_path}.\")\n",
    "    return model\n",
    "\n",
    "def get_text_embedding(model, text_prompt):\n",
    "    \"\"\"Encode a text prompt into an embedding using the modernBERT encoder.\"\"\"\n",
    "    tokenized_text = model.text_encoder.tokenizer(text_prompt, return_tensors=\"pt\").to(model.device)\n",
    "    with torch.no_grad():\n",
    "        text_features = model.text_encoder(tokenized_text)\n",
    "        text_features = model.text_projector(text_features.mean(dim=1))\n",
    "        text_features = F.normalize(text_features, dim=1)\n",
    "    return text_features\n",
    "\n",
    "def load_image_embeddings(model, embeddings_dir):\n",
    "    \"\"\"Load all precomputed image embeddings from the specified directory.\"\"\"\n",
    "    vision_embeddings = []\n",
    "    for file in sorted(os.listdir(embeddings_dir)):\n",
    "        if file.endswith(\".npy\"):\n",
    "            image_encoding = torch.tensor(np.load(os.path.join(embeddings_dir, file)), dtype=torch.float32).to(model.device)\n",
    "            vision_pooled = image_encoding.mean(dim=0).unsqueeze(0)\n",
    "            vision_embedded = model.vision_projector(vision_pooled)\n",
    "            vision_embedded = F.normalize(vision_embedded, dim=1)\n",
    "            vision_embeddings.append(vision_embedded)\n",
    "\n",
    "    if len(vision_embeddings) == 0:\n",
    "        raise ValueError(\"No vision embeddings found in the specified directory.\")\n",
    "    print(f\"Vision embeddings loaded successfully from {embeddings_dir}.\")\n",
    "    return torch.stack(vision_embeddings).squeeze(1)\n",
    "\n",
    "def compare_text_to_images(text_embedding, vision_embeddings):\n",
    "    \"\"\"Compare a text embedding against a batch of image embeddings using cosine similarity.\"\"\"\n",
    "    cosine_similarities = torch.matmul(text_embedding, vision_embeddings.T).squeeze(0)\n",
    "    similarity_scores = cosine_similarities.cpu().detach().numpy()\n",
    "    ranked_indices = similarity_scores.argsort()[::-1]  # Sort in descending order\n",
    "    return ranked_indices, similarity_scores\n",
    "\n",
    "\n",
    "# Paths and settings\n",
    "checkpoint_path = \"/home/nolan4/projects/hf-contest/checkpoints/model_checkpoint_20250109_102039.pth\"\n",
    "embeddings_dir = \"/mnt/nvme/shared_A/datasets/coco-image-caption/versions/1/val2017/vision_embeddings\"\n",
    "\n",
    "# Load the model and precomputed vision embeddings\n",
    "model = load_checkpoint_and_prepare_model(checkpoint_path)\n",
    "vision_embeddings = load_image_embeddings(model, embeddings_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "from PIL import Image\n",
    "\n",
    "def display_images_from_paths(image_paths, num_images=5):\n",
    "    num_images = min(num_images, len(image_paths))\n",
    "    if num_images == 0:\n",
    "        print(\"No images found in the directory.\")\n",
    "        return\n",
    "\n",
    "    plt.figure(figsize=(12, 8))\n",
    "    for i, image_path in enumerate(image_paths[:num_images]):\n",
    "        img = Image.open(image_path)\n",
    "        plt.subplot(1, num_images, i + 1)\n",
    "        plt.imshow(img)\n",
    "        plt.axis('off')\n",
    "        plt.title(f\"{os.path.basename(image_path).split('.')[0]}\")\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "# Example usage\n",
    "# random.shuffle(image_paths)\n",
    "display_images_from_paths(image_paths, num_images=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Query text\n",
    "text_prompt = \"cars driving down the road\"\n",
    "# text_prompt = \"stuffed brown teddy bear\"\n",
    "\n",
    "# Encode the text prompt\n",
    "text_embedding = get_text_embedding(model, text_prompt)\n",
    "\n",
    "# Perform comparison and display results\n",
    "ranked_indices, similarity_scores = compare_text_to_images(text_embedding, vision_embeddings)\n",
    "print(\"\\nTop 5 Most Similar Images:\")\n",
    "for idx in ranked_indices[:5]:\n",
    "    print(f\"Image Index: {idx}, Similarity Score: {similarity_scores[idx]:.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gather the paths of the top-ranked images\n",
    "selected_image_paths = [image_paths[idx] for idx in ranked_indices[:10]]\n",
    "\n",
    "# Display the top N ranked images\n",
    "display_images_from_paths(selected_image_paths, num_images=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "hf-env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
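The retrieval step above reduces to a dot product between L2-normalized vectors; a self-contained sketch with random stand-in embeddings (the 512-d size follows the projection heads in train.py):

import torch
import torch.nn.functional as F

# Stand-in embeddings: 1 text query and 100 images, both projected to 512-d as in JointNetwork
text_embedding = F.normalize(torch.randn(1, 512), dim=1)
vision_embeddings = F.normalize(torch.randn(100, 512), dim=1)

# With unit-norm vectors, the matmul yields cosine similarities in [-1, 1]
scores = (text_embedding @ vision_embeddings.T).squeeze(0)
ranked = scores.argsort(descending=True)
print(ranked[:5], scores[ranked[:5]])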
text_encoder.py
ADDED
@@ -0,0 +1,27 @@
from transformers import AutoTokenizer, ModernBertModel
import torch
import torch.nn as nn
import torch.optim as optim
import pdb

class modernBERT(nn.Module):
    def __init__(self, model_name="answerdotai/ModernBERT-base"):
        super(modernBERT, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.bert = ModernBertModel.from_pretrained(model_name)

    def forward(self, inputs):
        # inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        outputs = self.bert(**inputs)

        return outputs.last_hidden_state  # [batch_size, seq_len, 768] hidden states

# Quick smoke test
if __name__ == "__main__":
    model = modernBERT("answerdotai/ModernBERT-base")

    texts = ["Potato's no name for a dog"]
    text_inputs = model.tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    output = model(text_inputs)

    print(output[0].shape)
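For context, train.py calls this encoder with pre-tokenized, padded batches (input ids plus attention mask) rather than raw strings; a minimal sketch of that calling convention (the 128-token padding mirrors the caption preprocessing, and 768 is ModernBERT-base's hidden size):

import torch
from text_encoder import modernBERT

model = modernBERT("answerdotai/ModernBERT-base").eval()

# Mimic the dataset output: padded token ids plus an attention mask
batch = model.tokenizer(["two dogs running", "a man rides a horse"],
                        return_tensors="pt", padding="max_length", truncation=True, max_length=128)
with torch.no_grad():
    hidden = model({"input_ids": batch["input_ids"], "attention_mask": batch["attention_mask"]})
print(hidden.shape)  # expected [2, 128, 768]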
train.py
ADDED
@@ -0,0 +1,204 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from text_encoder import *
from vision_encoder import *
import os
import json
import numpy as np
import random
from tqdm import tqdm
import datetime

# Vision Caption Dataset
class VisionCaptionDataset(torch.utils.data.Dataset):
    def __init__(self, captions_path, embeddings_dir, normalize=True):
        with open(captions_path, 'r') as f:
            self.captions_dict = json.load(f)

        self.embeddings_dir = embeddings_dir
        self.image_ids = list(self.captions_dict.keys())
        self.normalize = normalize

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        image_id = self.image_ids[idx]

        caption_entry = random.choice(self.captions_dict[image_id])
        tokenized_caption = caption_entry["tokenized"]
        attention_mask = caption_entry["attention_mask"]

        embedding_path = os.path.join(self.embeddings_dir, f"{image_id}.npy")
        embedding = np.load(embedding_path)

        embedding = torch.tensor(embedding, dtype=torch.float32)
        tokenized_caption = torch.tensor(tokenized_caption, dtype=torch.long)
        attention_mask = torch.tensor(attention_mask, dtype=torch.long)

        return embedding, tokenized_caption, attention_mask


class JointNetwork(nn.Module):
    def __init__(self):
        super(JointNetwork, self).__init__()

        self.text_encoder = modernBERT("answerdotai/ModernBERT-base")

        for param in self.text_encoder.parameters():
            param.requires_grad = True

        self.vision_projector = nn.Linear(1152, 512)
        self.text_projector = nn.Linear(768, 512)

    def forward(self, tokenized_text, image_encoding):
        vision_patch_pooled = image_encoding.mean(dim=1)
        text_output = self.text_encoder(tokenized_text)
        text_pooled = text_output.mean(dim=1)

        vision_embedded = self.vision_projector(vision_patch_pooled)
        text_embedded = self.text_projector(text_pooled)

        vision_embedded = F.normalize(vision_embedded, dim=1)
        text_embedded = F.normalize(text_embedded, dim=1)

        return text_embedded, vision_embedded


def infoNCE_loss(text_features, vision_features, temperature=0.07):
    text_features = F.normalize(text_features, p=2, dim=-1)
    vision_features = F.normalize(vision_features, p=2, dim=-1)

    similarity_matrix = torch.matmul(text_features, vision_features.T) / temperature
    batch_size = vision_features.size(0)
    labels = torch.arange(batch_size, device=vision_features.device)

    loss_text_to_image = F.cross_entropy(similarity_matrix, labels)
    loss_image_to_text = F.cross_entropy(similarity_matrix.T, labels)

    return (loss_text_to_image + loss_image_to_text) / 2


def train_model(model, train_loader, val_loader, optimizer, scheduler, num_epochs=5, freeze_text_encoder=True, checkpoint_path=None):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    best_val_loss = float('inf')  # Initialize with a very high value

    # Freeze text encoder if specified
    if freeze_text_encoder:
        for param in model.text_encoder.parameters():
            param.requires_grad = False

    # Ensure new layers are trainable
    for param in model.vision_projector.parameters():
        param.requires_grad = True
    for param in model.text_projector.parameters():
        param.requires_grad = True

    model.to(device)

    for epoch in range(num_epochs):

        # Train loop
        model.train()
        total_loss = 0.0

        print(f"\nEpoch {epoch + 1}/{num_epochs} - Training:")
        train_progress = tqdm(train_loader, desc="Training", leave=True)

        for image_embeddings, tokenized_captions, attention_masks in train_progress:
            text_inputs = {"input_ids": tokenized_captions.to(device), "attention_mask": attention_masks.to(device)}
            image_embeddings = image_embeddings.to(device)

            optimizer.zero_grad()
            text_features, vision_features = model(text_inputs, image_embeddings)
            loss = infoNCE_loss(text_features, vision_features)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            train_progress.set_postfix(loss=loss.item())

        scheduler.step()

        # Validation loop
        model.eval()
        val_loss = 0.0

        print(f"\nEpoch {epoch + 1}/{num_epochs} - Validation:")
        val_progress = tqdm(val_loader, desc="Validation", leave=True)

        with torch.no_grad():
            for image_embeddings, tokenized_captions, attention_masks in val_progress:
                text_inputs = {"input_ids": tokenized_captions.to(device), "attention_mask": attention_masks.to(device)}
                image_embeddings = image_embeddings.to(device)

                text_features, vision_features = model(text_inputs, image_embeddings)
                loss = infoNCE_loss(text_features, vision_features)
                val_loss += loss.item()
                val_progress.set_postfix(loss=loss.item())

        avg_train_loss = total_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        print(f"\nEpoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        # Save best model
        if checkpoint_path is not None:
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                torch.save({
                    'epoch': epoch + 1,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'val_loss': best_val_loss
                }, checkpoint_path)
                print(f"New Best Model Saved at: {checkpoint_path} (Val Loss: {best_val_loss:.4f})")

    print("Training completed!")


if __name__ == "__main__":
    # Set random seed for reproducibility
    # torch.manual_seed(42)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Paths for dataset
    captions_path = '/mnt/nvme/shared_A/datasets/flickr30k/data/captions_tokenized.json'
    # embeddings_dir = '/mnt/nvme/shared_A/datasets/flickr30k/data/reduced_vision_embeddings'
    embeddings_dir = '/mnt/nvme/shared_A/datasets/flickr30k/data/vision_embeddings_reduced2'

    # Initialize datasets and loaders
    full_dataset = VisionCaptionDataset(captions_path, embeddings_dir)
    train_size = int(0.85 * len(full_dataset))
    val_size = len(full_dataset) - train_size
    train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=8, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=8, pin_memory=True)

    # Initialize model, optimizer, and scheduler
    model = JointNetwork().to(device)

    checkpoint_path = f"./checkpoints/model_checkpoint_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.pth"

    # **Phase 1 Configuration: Training new layers only**
    initial_lr = 1e-4
    min_lr = 1e-6
    num_epochs = 16
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=initial_lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=min_lr)

    # **Phase 1: Train new layers only, freeze text encoder**
    print("\n### Phase 1: Training new layers only (Text Encoder Frozen) ###")
    train_model(model, train_loader, val_loader, optimizer, scheduler, num_epochs=num_epochs, freeze_text_encoder=True, checkpoint_path=checkpoint_path)

    # # **Phase 2 Configuration: Fine-tuning with adjusted learning rate**
    # initial_lr = 1e-4
    # min_lr = 1e-6
    # num_epochs = 3
    # optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=initial_lr)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=min_lr)

    # print("\n### Phase 2: Fine-tuning text encoder and new layers ###")
    # train_model(model, train_loader, val_loader, optimizer, scheduler, num_epochs=num_epochs, freeze_text_encoder=False, checkpoint_path=checkpoint_path)
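infoNCE_loss above is the symmetric CLIP-style contrastive objective; a small standalone check of its behavior (a sketch, assuming train.py imports cleanly in the current environment; untrained random features score near log(batch_size), while perfectly aligned features score much lower):

import math
import torch
from train import infoNCE_loss

torch.manual_seed(0)
batch_size, dim = 8, 512

# Random, unrelated text/vision features: the loss should sit near log(batch_size)
text = torch.randn(batch_size, dim)
vision = torch.randn(batch_size, dim)
print(infoNCE_loss(text, vision).item(), math.log(batch_size))

# Identical features: the diagonal of the similarity matrix dominates, so the loss drops sharply
print(infoNCE_loss(text, text).item())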
vision_encoder.py
ADDED
@@ -0,0 +1,56 @@
import torch
import torch.nn as nn
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image


class ideficsV3(nn.Module):
    def __init__(self, model_name="HuggingFaceTB/SmolVLM-Instruct"):
        super().__init__()

        # Load the SmolVLM model from Hugging Face
        self.image_processor = AutoProcessor.from_pretrained(model_name).image_processor
        smolVLM = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=torch.float32)

        # Keep only the vision tower
        self.vision_model = smolVLM.model.vision_model

    def forward(self, pixel_values):
        # The image processor returns pixel_values of shape [batch_size, num_patches, 3, 384, 384],
        # but the vision transformer expects a 4D [batch, channels, height, width] tensor
        # (passing the 5D tensor raises "ValueError: too many values to unpack (expected 4)").
        # Flatten the patch dimension into the batch dimension before the forward pass.
        batch_size, num_patches, channels, height, width = pixel_values.shape
        pixel_values = pixel_values.view(batch_size * num_patches, channels, height, width)

        # Run images through the vision transformer
        vision_outputs = self.vision_model(pixel_values)
        x = vision_outputs.last_hidden_state  # shape := [batch_size * num_patches, 729, 1152]

        return x


if __name__ == "__main__":
    # Instantiate the truncated model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_dtype = torch.float32
    truncated_model = ideficsV3().to(device).eval()

    image1 = load_image("https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg")
    image2 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")

    inputs1 = truncated_model.image_processor(images=[image1, image2], return_tensors="pt")
    pixel_values = inputs1.pixel_values.to(model_dtype).to(device)

    # Pass pixel_values through the truncated model
    with torch.no_grad():
        outputs = truncated_model(pixel_values)

    print(outputs.shape)  # [batch_size * num_patches, 729, 1152] patch features from the vision tower
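To tie the pieces together, a short shape walk-through from this encoder to what train.py consumes (a sketch with random tensors; num_patches and the 729x1152 token grid are the values noted in the comments above):

import torch

# Hypothetical single-image output of ideficsV3: num_patches sub-images, 729 tokens, 1152 dims
num_patches = 13
patch_features = torch.randn(num_patches, 729, 1152)

# _dataset/preprocess_images.py averages over the patch dimension before saving to .npy
per_image = patch_features.mean(dim=0)           # -> [729, 1152]

# train.JointNetwork then pools over the 729 tokens and projects 1152 -> 512
batch = per_image.unsqueeze(0)                   # -> [1, 729, 1152] after DataLoader batching
pooled = batch.mean(dim=1)                       # -> [1, 1152]
projected = torch.nn.Linear(1152, 512)(pooled)   # -> [1, 512] (random weights, shapes only)
print(per_image.shape, pooled.shape, projected.shape)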