#!/usr/bin/env python3
from transformers.utils import logging
logging.set_verbosity_error()

from transformers import AutoProcessor, BlipForImageTextRetrieval
from PIL import Image
import random
import time

import torch

# Multi-modal model: accepts both image and text inputs
# (BLIP image-text matching head, fine-tuned on COCO).
print("loading model ...")
model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")

print("loading image ...")
raw_image = Image.open('./assets/pot-o-gold-my-little-pony-Derpy.jpeg').convert('RGB')

print("processing ...")
statements = [
    "an image of a horse",
    "a horse and a rainbow",
    "a pony and a rainbow",
    "a unicorn and a rainbow",
    "a pony in a forest",
    "a rainbow over a lake",
    "a horse running through the forest",
    "two eyes that do not match",
    "equine joy",
    "a stallion and gold coins",
    "a mare and gold coins",
]

while True:
    # Pick a random caption and score how well it matches the image.
    text = random.choice(statements)
    inputs = processor(images=raw_image, text=text, return_tensors="pt")  # PyTorch tensors

    with torch.no_grad():  # inference only; no gradients needed
        itm_scores = model(**inputs)[0]

    # Softmax over the two ITM classes: index 1 is the probability
    # that the text matches the image.
    itm_score = torch.nn.functional.softmax(itm_scores, dim=1)
    print(f"'{text}' => {itm_score[0][1]:.2f}")
    time.sleep(1)  # throttle the loop so the output stays readable
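
# --- Alternative scoring (sketch, not reached by the loop above) ---
# Besides the ITM classification head used above, the BLIP retrieval model
# can also return a raw image-text cosine similarity. The snippet below is a
# hedged sketch: it assumes BlipForImageTextRetrieval.forward accepts a
# `use_itm_head` flag (as in recent transformers releases). It is left
# commented out because the while-loop above never exits.
#
# inputs = processor(images=raw_image, text="a pony and a rainbow", return_tensors="pt")
# with torch.no_grad():
#     cosine_score = model(**inputs, use_itm_head=False)[0]
# print(f"cosine similarity => {cosine_score[0][0]:.2f}")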