modernBERT-base-CLIP / _dataset /preprocess_images.py

initial commit

8a00d0d about 2 months ago

3.23 kB

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import numpy as np
	import os
	import shutil
	from PIL import Image
	from transformers.image_utils import load_image
	import sys
	sys.path.append('..')
	from vision_encoder import ideficsV3
	from tqdm import tqdm

	class VisionPreprocessor:
	    """Turn image files into pooled vision-encoder embeddings stored as .npy files.

	    A frozen ``ideficsV3`` encoder is loaded once in the constructor. Each image
	    is passed through the encoder's own ``image_processor``, encoded, averaged
	    over the first axis of the encoder output, and written to disk.
	    """

	    def __init__(self, device=None, param_dtype=torch.float32):
	        """Load the vision encoder, put it in eval mode and freeze its weights.

	        Args:
	            device: Torch device string. When falsy, "cuda" is used if it is
	                available and "cpu" otherwise.
	            param_dtype: dtype the pixel values are cast to before encoding.
	        """
	        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
	        self.param_dtype = param_dtype

	        # Initialize and freeze the vision encoder.
	        # NOTE(review): the encoder is moved to the device but never cast to
	        # param_dtype, while load_image() casts the inputs to it. Confirm that
	        # ideficsV3 copes with the input dtype before using anything other
	        # than float32.
	        self.vision_encoder = ideficsV3("HuggingFaceTB/SmolVLM-Instruct").eval().to(self.device)
	        for param in self.vision_encoder.parameters():
	            param.requires_grad = False

	    def load_image(self, image_path):
	        """Read one image and return its pixel values on the target device.

	        Args:
	            image_path: Location of the image, passed to the module-level
	                ``load_image`` helper.

	        Returns:
	            The ``pixel_values`` tensor produced by the encoder's
	            ``image_processor``, cast to ``param_dtype`` and moved to
	            ``device``.
	        """
	        # The bare name resolves to the module-level import from
	        # transformers.image_utils, not to this method of the same name.
	        image = load_image(image_path)
	        # Whatever preprocessing the encoder's image_processor applies happens
	        # here; nothing else is done to the image.
	        inputs = self.vision_encoder.image_processor(images=[image], return_tensors="pt")
	        pixel_values = inputs.pixel_values.to(self.param_dtype).to(self.device)
	        return pixel_values

	    def extract_embedding(self, image_tensor):
	        """Encode ``image_tensor`` and average the result over its first axis.

	        Args:
	            image_tensor: Input accepted by the vision encoder, e.g. the
	                output of :meth:`load_image`.

	        Returns:
	            The encoder output reduced with ``mean(axis=0)``.
	        """
	        # The encoder is frozen, so no autograd graph is needed.
	        with torch.no_grad():
	            vision_output = self.vision_encoder(image_tensor)

	        # presumably axis 0 indexes the image tiles/crops produced by the
	        # image processor -- TODO confirm against ideficsV3's output layout.
	        vision_output = vision_output.mean(axis=0)

	        return vision_output

	    def save_embedding(self, vision_output, file_path):
	        """Write ``vision_output`` to ``file_path`` in NumPy ``.npy`` format.

	        Args:
	            vision_output: Tensor to store; it may live on any device.
	            file_path: Destination handed to ``np.save``.
	        """
	        host_tensor = vision_output.detach().cpu()
	        # NumPy has no bfloat16 dtype, so Tensor.numpy() raises TypeError for
	        # it. Upcasting to float32 is lossless; every other dtype is saved
	        # unchanged.
	        if host_tensor.dtype == torch.bfloat16:
	            host_tensor = host_tensor.to(torch.float32)
	        np.save(file_path, host_tensor.numpy())

	    def process_directory(self, image_paths, output_dir):
	        """Embed every image in ``image_paths`` and save the results in ``output_dir``.

	        WARNING: if ``output_dir`` already exists it is deleted, together with
	        everything in it, before processing starts.

	        Args:
	            image_paths: Iterable of image file paths to process.
	            output_dir: Directory that receives one ``<image stem>.npy`` file
	                per input image.
	        """
	        # Start from an empty directory so stale embeddings never mix with
	        # the new ones.
	        if os.path.exists(output_dir):
	            shutil.rmtree(output_dir)
	            print(f"Existing directory cleared: {output_dir}")
	        os.makedirs(output_dir, exist_ok=True)

	        # tqdm provides the progress bar.
	        for image_path in tqdm(image_paths, desc="Processing Images", unit="image"):

	            # Load the pixel values and run them through the encoder.
	            image_tensor = self.load_image(image_path)
	            vision_output = self.extract_embedding(image_tensor)

	            # Save under the image's file name with the extension replaced
	            # by .npy. Inputs that share a stem (a.jpg / a.png, or equal
	            # names in different folders) overwrite each other.
	            output_file_path = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(image_path))[0]}.npy")
	            self.save_embedding(vision_output, output_file_path)


	if __name__ == "__main__":
	    # Fixed seed so repeated runs behave the same way.
	    torch.manual_seed(42)

	    # Prefer the GPU when one is present; embeddings are computed in float32.
	    if torch.cuda.is_available():
	        device = "cuda"
	    else:
	        device = "cpu"
	    param_dtype = torch.float32

	    # Build the embedding pipeline (loads the vision encoder).
	    pipeline = VisionPreprocessor(device, param_dtype)

	    # Where the images are read from and where the .npy files are written.
	    input_directory = "/mnt/nvme/shared_A/datasets/flickr30k/data/flickr30k-images"
	    output_directory = "/mnt/nvme/shared_A/datasets/flickr30k/data/vision_embeddings_reduced2"

	    # Collect every file whose extension (case-insensitive) marks it as an image.
	    image_suffixes = ('.png', '.jpg', '.jpeg')
	    image_paths = []
	    for entry_name in os.listdir(input_directory):
	        if entry_name.lower().endswith(image_suffixes):
	            image_paths.append(os.path.join(input_directory, entry_name))

	    # Embed all collected images and report completion.
	    pipeline.process_directory(image_paths, output_directory)
	    print("Processing complete!")