Image embedding example
#1
by
RASMUS
- opened
If I were to precalculate hundreds of image embeddings with this and then do a text search,
could you add an example of how to do that?
Something along these lines:
from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
import torch

# Load the model and its preprocessors once, before any embedding work.
model = AutoModel.from_pretrained(
    "visheratin/mexma-siglip2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    optimized=True,
).to("cuda")
tokenizer = AutoTokenizer.from_pretrained("visheratin/mexma-siglip2")
processor = AutoImageProcessor.from_pretrained("visheratin/mexma-siglip2")

images = [
    ...
]

batch_size = 200
# Collect per-batch tensors and concatenate ONCE at the end.
# cat-ing inside the loop re-copies the whole accumulated tensor every
# iteration (quadratic in the number of batches).
batch_embeddings = []
with torch.inference_mode():
    for start in range(0, len(images), batch_size):
        batch = images[start:start + batch_size]
        pixel_values = processor(images=batch, return_tensors="pt")["pixel_values"]
        # Match the model's dtype/device before encoding.
        pixel_values = pixel_values.to("cuda", dtype=torch.bfloat16)
        batch_embeddings.append(model.encode_images(pixel_values))
    image_embeddings = torch.cat(batch_embeddings, dim=0)

    # Text encoding also belongs under inference_mode: no autograd state needed.
    text = tokenizer(["your search query"], return_tensors="pt", padding=True).to("cuda")
    text_embeddings = model.encode_texts(text["input_ids"], text["attention_mask"])

# Dot-product similarity between every image and the query.
# NOTE(review): this is cosine similarity only if encode_images/encode_texts
# return L2-normalized vectors — confirm for this model before ranking by it.
similarity = image_embeddings @ text_embeddings.T
print(similarity)
RASMUS
changed discussion status to
closed