{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/ritesh.thawkar/Ritesh/nutrigenics/nutrigenics-chatbot/chatbot-env/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n",
      " warnings.warn(\n",
      "/Users/ritesh.thawkar/Ritesh/nutrigenics/nutrigenics-chatbot/chatbot-env/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "from PIL import Image\n",
    "import numpy as np\n",
    "import gradio as gr "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "import torch\n",
    "import torch.nn.functional as F\n",
    "\n",
    "from src.data.embs import ImageDataset\n",
    "from src.model.blip_embs import blip_embs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from src.data.transforms import transform_test\n",
    "# 384 is the same value get_blip_config() stores under config[\"image_size\"].\n",
    "transform = transform_test(384)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import numpy as np\n",
    "from PIL import Image\n",
    "import torch.nn.functional as F\n",
    "import torch\n",
    "from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer\n",
    "\n",
    "\n",
    "class StoppingCriteriaSub(StoppingCriteria):\n",
    "    \"\"\"Stop generation once the generated ids end with one of the stop sequences.\n",
    "\n",
    "    Args:\n",
    "        stops: list of 1-D token-id sequences; None (the default) means no stop sequence.\n",
    "        encounters: accepted for backward compatibility; it is not used.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, stops=None, encounters=1):\n",
    "        super().__init__()\n",
    "        # None instead of a `[]` default: a literal list default is shared by every instance.\n",
    "        self.stops = [] if stops is None else stops\n",
    "\n",
    "    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):\n",
    "        \"\"\"Return True when every row of `input_ids` ends with one stop sequence.\n",
    "\n",
    "        Extra keyword arguments are accepted and ignored.\n",
    "        \"\"\"\n",
    "        for stop in self.stops:\n",
    "            # Compare on the device of the generated ids so a stop built elsewhere cannot raise.\n",
    "            stop = torch.as_tensor(stop, device=input_ids.device).reshape(-1)\n",
    "            width = stop.numel()\n",
    "            # An empty stop, or one longer than the sequence so far, cannot match yet.\n",
    "            if width == 0 or input_ids.shape[-1] < width:\n",
    "                continue\n",
    "            if torch.all(input_ids[:, -width:] == stop).item():\n",
    "                return True\n",
    "\n",
    "        return False\n",
    "\n",
    "\n",
    "class Chat:\n",
    "    \"\"\"Match a query image to its highest-scoring recipe and answer keyword questions about it.\n",
    "\n",
    "    Args:\n",
    "        model: object exposing the `visual_encoder` and `vision_proj` callables used by `encode_image`.\n",
    "        transform: callable applied to the RGB PIL image; its result is unsqueezed into a batch of one.\n",
    "        dataframe: recipe table read positionally with `.iloc`; `ask` reads the columns\n",
    "            `recipe_nutrients`, `recipe_instructions`, `recipe_ingredients` and `tags`.\n",
    "        tar_img_feats: candidate feature matrix; row i is scored against the query and maps to\n",
    "            dataframe row i (presumably normalised like the query feature - TODO confirm).\n",
    "        device: device the query image tensor is moved to before encoding.\n",
    "        stopping_criteria: optional criteria list; by default generation stops on token id 2.\n",
    "    \"\"\"\n",
    "\n",
    "    # Keyword groups are tried in order; the first group found in the message picks the column.\n",
    "    _TOPIC_COLUMNS = (\n",
    "        ((\"nutrition\", \"nutrient\"), \"recipe_nutrients\"),\n",
    "        ((\"instruction\",), \"recipe_instructions\"),\n",
    "        ((\"ingredient\",), \"recipe_ingredients\"),\n",
    "        ((\"tag\", \"class\"), \"tags\"),\n",
    "    )\n",
    "\n",
    "    def __init__(self, model, transform, dataframe, tar_img_feats, device='cuda:0', stopping_criteria=None):\n",
    "        self.device = device\n",
    "        self.model = model\n",
    "        self.transform = transform\n",
    "        self.df = dataframe\n",
    "        self.tar_img_feats = tar_img_feats\n",
    "        self.img_feats = None  # feature of the last encoded image\n",
    "        self.target_recipe = None  # dataframe row selected by get_target()\n",
    "        self.messages = []\n",
    "\n",
    "        if stopping_criteria is not None:\n",
    "            self.stopping_criteria = stopping_criteria\n",
    "        else:\n",
    "            stop_words_ids = [torch.tensor([2]).to(self.device)]\n",
    "            self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])\n",
    "\n",
    "    def encode_image(self, image_path):\n",
    "        \"\"\"Embed the query image, store the feature and select the best-scoring recipe.\n",
    "\n",
    "        Args:\n",
    "            image_path: an image array accepted by `PIL.Image.fromarray` (the original input\n",
    "                type), a PIL image, or a path / open file readable by `PIL.Image.open`.\n",
    "\n",
    "        Raises:\n",
    "            ValueError: if `image_path` is None.\n",
    "        \"\"\"\n",
    "        if image_path is None:\n",
    "            raise ValueError(\"encode_image() needs an image, got None\")\n",
    "\n",
    "        if isinstance(image_path, Image.Image):\n",
    "            img = image_path\n",
    "        elif isinstance(image_path, (str, bytes)) or hasattr(image_path, \"__fspath__\") or hasattr(image_path, \"read\"):\n",
    "            img = Image.open(image_path)\n",
    "        else:\n",
    "            img = Image.fromarray(image_path)\n",
    "        img = self.transform(img.convert(\"RGB\")).unsqueeze(0).to(self.device)\n",
    "\n",
    "        # Inference only: without no_grad() the encoder would record an autograd graph.\n",
    "        with torch.no_grad():\n",
    "            img_embs = self.model.visual_encoder(img)\n",
    "            # Project the first token of the encoder output and L2-normalise it.\n",
    "            img_feats = F.normalize(self.model.vision_proj(img_embs[:, 0, :]), dim=-1).cpu()\n",
    "\n",
    "        self.img_feats = img_feats\n",
    "\n",
    "        self.get_target(self.img_feats, self.tar_img_feats)\n",
    "\n",
    "    def get_target(self, img_feats, tar_img_feats):\n",
    "        \"\"\"Set `self.target_recipe` to the dataframe row with the highest score.\n",
    "\n",
    "        The score of row i is the dot product of `img_feats` with row i of `tar_img_feats`.\n",
    "        \"\"\"\n",
    "        # The query feature is on the CPU after encode_image(); score on that same device.\n",
    "        scores = (img_feats @ tar_img_feats.to(img_feats.device).t()).squeeze(0).detach()\n",
    "        # Only the best match is needed; argmax avoids sorting every score.\n",
    "        best = int(torch.argmax(scores))\n",
    "        self.target_recipe = self.df.iloc[best]\n",
    "\n",
    "    def ask(self, msg):\n",
    "        \"\"\"Answer `msg` with one field of the matched recipe as indented JSON text.\n",
    "\n",
    "        Keywords are matched case-insensitively as substrings: nutrition/nutrient,\n",
    "        instruction, ingredient, tag/class (checked in that order). A message without any\n",
    "        keyword gets a fixed notice; a keyword asked before any recipe was matched gets a\n",
    "        request to upload an image.\n",
    "        \"\"\"\n",
    "        query = (msg or \"\").lower()\n",
    "        for keywords, column in self._TOPIC_COLUMNS:\n",
    "            if any(word in query for word in keywords):\n",
    "                if self.target_recipe is None:\n",
    "                    return \"Please upload an image first so a recipe can be matched.\"\n",
    "                # default=str keeps NumPy/pandas scalars from breaking serialisation.\n",
    "                return json.dumps(self.target_recipe[column], indent=4, default=str)\n",
    "        return \"Conversational capabilities will be included later.\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_blip_config(model=\"base\"):\n",
    "    \"\"\"Build the BLIP configuration dictionary for one ViT size.\n",
    "\n",
    "    Args:\n",
    "        model: \"base\" or \"large\".\n",
    "\n",
    "    Returns:\n",
    "        A new dict with the variant-specific entries (checkpoint URL, ViT size, batch sizes,\n",
    "        gradient-checkpointing options, initial learning rate) followed by the entries that\n",
    "        are shared by both variants.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `model` is neither \"base\" nor \"large\".\n",
    "    \"\"\"\n",
    "    variants = {\n",
    "        \"base\": {\n",
    "            # No trailing space: the URL is used verbatim to load the checkpoint.\n",
    "            \"pretrained\": \"https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth\",\n",
    "            \"vit\": \"base\",\n",
    "            \"batch_size_train\": 32,\n",
    "            \"batch_size_test\": 16,\n",
    "            \"vit_grad_ckpt\": True,\n",
    "            \"vit_ckpt_layer\": 4,\n",
    "            \"init_lr\": 1e-5,\n",
    "        },\n",
    "        \"large\": {\n",
    "            \"pretrained\": \"https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth\",\n",
    "            \"vit\": \"large\",\n",
    "            \"batch_size_train\": 16,\n",
    "            \"batch_size_test\": 32,\n",
    "            \"vit_grad_ckpt\": True,\n",
    "            \"vit_ckpt_layer\": 12,\n",
    "            \"init_lr\": 5e-6,\n",
    "        },\n",
    "    }\n",
    "    # Fail here rather than with a KeyError later on a half-filled config.\n",
    "    if model not in variants:\n",
    "        raise ValueError(f\"Unknown BLIP variant {model!r}; expected one of {sorted(variants)}\")\n",
    "\n",
    "    config = dict(variants[model])\n",
    "    config.update(\n",
    "        image_size=384,\n",
    "        queue_size=57600,\n",
    "        alpha=0.4,\n",
    "        k_test=256,\n",
    "        negative_all_rank=True,\n",
    "    )\n",
    "    return config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Creating model\n",
      "load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth\n",
      "missing keys:\n",
      "[]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "BLIPEmbs(\n",
       " (visual_encoder): VisionTransformer(\n",
       " (patch_embed): PatchEmbed(\n",
       " (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))\n",
       " (norm): Identity()\n",
       " )\n",
       " (pos_drop): Dropout(p=0.0, inplace=False)\n",
       " (blocks): ModuleList(\n",
       " (0): 
Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (1): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.004)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (2): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.009)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): 
GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (3): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.013)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (4): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.017)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (5): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): 
DropPath(drop_prob=0.022)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (6): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.026)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (7): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.030)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (8): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): 
Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.035)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (9): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.039)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (10): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.043)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (11): Block(\n", " 
(norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.048)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (12): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.052)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (13): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.057)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): 
GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (14): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.061)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (15): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.065)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (16): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): 
DropPath(drop_prob=0.070)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (17): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.074)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (18): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.078)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (19): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): 
Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.083)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (20): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.087)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (21): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.091)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (22): Block(\n", " 
(norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.096)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (23): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (drop_path): DropPath(drop_prob=0.100)\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " )\n", " (norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " )\n", " (text_encoder): BertModel(\n", " (embeddings): BertEmbeddings(\n", " (word_embeddings): Embedding(30524, 768, padding_idx=0)\n", " (position_embeddings): Embedding(512, 768)\n", " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (encoder): BertEncoder(\n", " (layer): ModuleList(\n", " (0-11): 12 x BertLayer(\n", " (attention): BertAttention(\n", " (self): BertSelfAttention(\n", " (query): 
Linear(in_features=768, out_features=768, bias=True)\n",
       " (key): Linear(in_features=768, out_features=768, bias=True)\n",
       " (value): Linear(in_features=768, out_features=768, bias=True)\n",
       " (dropout): Dropout(p=0.1, inplace=False)\n",
       " )\n",
       " (output): BertSelfOutput(\n",
       " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       " (dropout): Dropout(p=0.1, inplace=False)\n",
       " )\n",
       " )\n",
       " (crossattention): BertAttention(\n",
       " (self): BertSelfAttention(\n",
       " (query): Linear(in_features=768, out_features=768, bias=True)\n",
       " (key): Linear(in_features=1024, out_features=768, bias=True)\n",
       " (value): Linear(in_features=1024, out_features=768, bias=True)\n",
       " (dropout): Dropout(p=0.1, inplace=False)\n",
       " )\n",
       " (output): BertSelfOutput(\n",
       " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       " (dropout): Dropout(p=0.1, inplace=False)\n",
       " )\n",
       " )\n",
       " (intermediate): BertIntermediate(\n",
       " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       " (intermediate_act_fn): GELUActivation()\n",
       " )\n",
       " (output): BertOutput(\n",
       " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       " (dropout): Dropout(p=0.1, inplace=False)\n",
       " )\n",
       " )\n",
       " )\n",
       " )\n",
       " )\n",
       " (vision_proj): Linear(in_features=1024, out_features=256, bias=True)\n",
       " (text_proj): Linear(in_features=768, out_features=256, bias=True)\n",
       ")"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"Creating model\")\n",
    "config = get_blip_config(\"large\")\n",
    "\n",
    "# Each blip_embs() keyword has the same name as the config entry that supplies it.\n",
    "model = blip_embs(\n",
    "    **{\n",
    "        name: config[name]\n",
    "        for name in (\n",
    "            \"pretrained\",\n",
    "            \"image_size\",\n",
    "            \"vit\",\n",
    "            \"vit_grad_ckpt\",\n",
    "            \"vit_ckpt_layer\",\n",
    "            \"queue_size\",\n",
    "            \"negative_all_rank\",\n",
    "        )\n",
    "    }\n",
    ")\n",
    "\n",
    "# Move the weights to the selected device, then switch to evaluation mode.\n",
    "# model.eval() is the cell's last expression, so the module summary is displayed.\n",
    "model = model.to(device)\n",
    "model.eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_json(\"datasets/sidechef/my_recipes.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | recipe_name | \n", "recipe_time | \n", "recipe_yields | \n", "recipe_ingredients | \n", "recipe_instructions | \n", "recipe_image | \n", "blogger | \n", "recipe_nutrients | \n", "tags | \n", "id_ | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "Asian Potato Salad with Seven Minute Egg | \n", "0 | \n", "4 servings | \n", "[2 1/2 cup Multi-Colored Fingerling Potato, 3/... | \n", "Fill a large stock pot with water.\\nAdd the Mu... | \n", "https://www.sidechef.com/recipe/eeeeeceb-493e-... | \n", "sidechef.com | \n", "{'calories': '80 calories', 'proteinContent': ... | \n", "[Salad, Lunch, Brunch, Appetizers, Side Dish, ... | \n", "1 | \n", "
1 | \n", "Everything Breakfast Bombs | \n", "0 | \n", "8 servings | \n", "[5 tablespoon Butter, 12 ounce Turkey Breakfas... | \n", "First, preheat the oven to 375 degrees F (190 ... | \n", "https://www.sidechef.com/recipe/525f6843-4337-... | \n", "sidechef.com | \n", "{'calories': '56 calories', 'proteinContent': ... | \n", "[Breakfast, Brunch, Low-Carb, Eggs, American, ... | \n", "2 | \n", "
2 | \n", "Bacon Swiss Deviled Eggs | \n", "0 | \n", "6 servings | \n", "[6 Egg, 1/4 cup Mayonnaise, 1/4 cup Avocado, 1... | \n", "Cut each hard boiled Egg (6) in half lengthwis... | \n", "https://www.sidechef.com/recipe/2075e8cf-4fa9-... | \n", "sidechef.com | \n", "{'calories': '38 calories', 'proteinContent': ... | \n", "[Breakfast, Brunch, Low-Carb, Eggs, American, ... | \n", "3 | \n", "
3 | \n", "Farmers Market Breakfast Pizza | \n", "0 | \n", "2 servings | \n", "[1/2 Pizza Dough, 1/2 cup Kale, 1/2 cup Onion,... | \n", "For homemade pizza sauce, finely chop the Swee... | \n", "https://www.sidechef.com/recipe/1cd15944-9411-... | \n", "sidechef.com | \n", "{'calories': '315 calories', 'proteinContent':... | \n", "[Breakfast, Brunch, Main Dish, Budget-Friendly... | \n", "4 | \n", "
4 | \n", "Scrambled Eggs | \n", "0 | \n", "2 servings | \n", "[3 Egg, 2 tablespoon Heavy Cream, 2 tablespoon... | \n", "Crack Egg (3) into a bowl.\\nPour in Heavy Crea... | \n", "https://www.sidechef.com/recipe/08d39a01-c030-... | \n", "sidechef.com | \n", "{'calories': '127 calories', 'proteinContent':... | \n", "[Breakfast, Brunch, Vegetarian, Low-Carb, Pesc... | \n", "5 | \n", "
5 | \n", "Fettuccini Carbonara | \n", "0 | \n", "2 servings | \n", "[2 Shallot, 1 clove Garlic, 2 Egg, 6 slice Bac... | \n", "Put a generously salted pot of water on to boi... | \n", "https://www.sidechef.com/recipe/9e5df75f-bf1a-... | \n", "sidechef.com | \n", "{'calories': '495 calories', 'proteinContent':... | \n", "[Pasta, Dinner, Side Dish, Main Dish, Pork, Eg... | \n", "6 | \n", "
6 | \n", "Sausage Egg Muffins | \n", "0 | \n", "6 servings | \n", "[1 pound Ground Pork, 1 1/2 teaspoon Fresh Par... | \n", "Preheat your oven to 350 degrees F (175 degree... | \n", "https://www.sidechef.com/recipe/49d5e5a3-4d16-... | \n", "sidechef.com | \n", "{'calories': '44 calories', 'proteinContent': ... | \n", "[Keto, Breakfast, Brunch, Budget-Friendly, Low... | \n", "7 | \n", "
7 | \n", "Shakshuka | \n", "0 | \n", "4 servings | \n", "[1 tablespoon Oil, 3 Tomato, 1 Green Chili Pep... | \n", "Preheat oven to 180 degrees C (350 degrees F) ... | \n", "https://www.sidechef.com/recipe/de00577b-38d4-... | \n", "sidechef.com | \n", "{'calories': '99 calories', 'fatContent': '2.5... | \n", "[Breakfast, Brunch, Main Dish, Vegetarian, Pes... | \n", "8 | \n", "
8 | \n", "Huevos Rancheros | \n", "0 | \n", "1 serving | \n", "[2 Yellow Corn Tortilla, 2 tablespoon Pinto Be... | \n", "In a small frying pan, spray a little Nonstick... | \n", "https://www.sidechef.com/recipe/5284bc88-1305-... | \n", "sidechef.com | \n", "{'calories': '290 calories', 'proteinContent':... | \n", "[Breakfast, Brunch, Eggs, Quick, Mexican, Shel... | \n", "9 | \n", "
9 | \n", "Homemade Pasta | \n", "0 | \n", "4 servings | \n", "[1 cup All-Purpose Flour, 1 teaspoon Salt, 1 Egg] | \n", "Mix All-Purpose Flour (1 cup) and Salt (1 teas... | \n", "https://www.sidechef.com/recipe/8528a7af-b6d8-... | \n", "sidechef.com | \n", "{'calories': '33 calories', 'proteinContent': ... | \n", "[Pasta, Budget-Friendly, Vegetarian, Pescatari... | \n", "10 | \n", "