|
import torch |
|
import torch.nn as nn |
|
import torch.nn.functional as F |
|
import numpy as np |
|
import os |
|
import shutil |
|
from PIL import Image |
|
from transformers.image_utils import load_image |
|
import sys |
|
sys.path.append('..') |
|
from vision_encoder import ideficsV3 |
|
from tqdm import tqdm |
|
|
|
class VisionPreprocessor: |
|
def __init__(self, device=None, param_dtype=torch.float32): |
|
self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu") |
|
self.param_dtype = param_dtype |
|
|
|
|
|
self.vision_encoder = ideficsV3("HuggingFaceTB/SmolVLM-Instruct").eval().to(self.device) |
|
for param in self.vision_encoder.parameters(): |
|
param.requires_grad = False |
|
|
|
def load_image(self, image_path): |
|
"""Load an image using PIL without preprocessing.""" |
|
image = load_image(image_path) |
|
|
|
inputs = self.vision_encoder.image_processor(images=[image], return_tensors="pt") |
|
pixel_values = inputs.pixel_values.to(self.param_dtype).to(self.device) |
|
return pixel_values |
|
|
|
def extract_embedding(self, image_tensor): |
|
"""Extract raw vision embedding.""" |
|
with torch.no_grad(): |
|
vision_output = self.vision_encoder(image_tensor) |
|
|
|
vision_output = vision_output.mean(axis=0) |
|
|
|
return vision_output |
|
|
|
def save_embedding(self, vision_output, file_path): |
|
"""Save the vision output to a numpy file.""" |
|
np.save(file_path, vision_output.cpu().numpy()) |
|
|
|
def process_directory(self, image_paths, output_dir): |
|
"""Process all images in a directory with a progress bar and save the embeddings.""" |
|
if os.path.exists(output_dir): |
|
shutil.rmtree(output_dir) |
|
print(f"Existing directory cleared: {output_dir}") |
|
os.makedirs(output_dir, exist_ok=True) |
|
|
|
|
|
for image_path in tqdm(image_paths, desc="Processing Images", unit="image"): |
|
|
|
|
|
image_tensor = self.load_image(image_path) |
|
vision_output = self.extract_embedding(image_tensor) |
|
|
|
|
|
output_file_path = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(image_path))[0]}.npy") |
|
self.save_embedding(vision_output, output_file_path) |
|
|
|
|
|
if __name__ == "__main__": |
|
torch.manual_seed(42) |
|
device = "cuda" if torch.cuda.is_available() else "cpu" |
|
param_dtype = torch.float32 |
|
|
|
|
|
pipeline = VisionPreprocessor(device, param_dtype) |
|
|
|
|
|
input_directory = "/mnt/nvme/shared_A/datasets/flickr30k/data/flickr30k-images" |
|
output_directory = "/mnt/nvme/shared_A/datasets/flickr30k/data/vision_embeddings_reduced2" |
|
|
|
image_paths = [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.lower().endswith(('.png', '.jpg', '.jpeg'))] |
|
|
|
pipeline.process_directory(image_paths, output_directory) |
|
print("Processing complete!") |
|
|