import torch
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
import spaces

# Dictionary of available CLIP models with their image sizes
CLIP_MODELS = {
    "ViT-B/32": ("openai/clip-vit-base-patch32", 224),
    "ViT-B/16": ("openai/clip-vit-base-patch16", 224),
    "ViT-L/14": ("openai/clip-vit-large-patch14", 224),
    "ViT-L/14@336px": ("openai/clip-vit-large-patch14-336", 336),
}

# Initialize models and processors
models = {}
processors = {}
for model_name, (model_path, _) in CLIP_MODELS.items():
    models[model_name] = CLIPModel.from_pretrained(model_path).to("cuda")
    processors[model_name] = CLIPProcessor.from_pretrained(model_path)


@spaces.GPU
def calculate_score(image, text, model_name):
    # Split the semicolon-separated descriptions into individual labels
    labels = text.split(";")
    labels = [l.strip() for l in labels]
    labels = list(filter(None, labels))
    if len(labels) == 0:
        return dict()

    model = models[model_name]
    processor = processors[model_name]

    # Get the correct image size for the model (the processor resizes automatically)
    _, image_size = CLIP_MODELS[model_name]

    # Preprocess the image and text
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

    # Calculate scores
    with torch.no_grad():
        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image.cpu().numpy()

    # The logits are cosine similarities scaled by the model's logit scale (~100),
    # so dividing by 100 roughly recovers the underlying similarity
    results_dict = {
        label: score / 100.0
        for label, score in zip(labels, logits_per_image[0])
    }
    return results_dict


with gr.Blocks() as demo:
    gr.Markdown("# Multi-Model CLIP Score")
    gr.Markdown(
        "Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image "
        "and text using different CLIP model variants"
    )

    with gr.Row():
        image_input = gr.Image(type="pil")
        output_label = gr.Label()

    with gr.Row():
        text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
        model_dropdown = gr.Dropdown(
            choices=list(CLIP_MODELS.keys()), label="CLIP Model", value="ViT-B/16"
        )

    def process_inputs(image, text, model_name):
        if image is None or text.strip() == "":
            return None
        return calculate_score(image, text, model_name)

    image_input.change(
        fn=process_inputs,
        inputs=[image_input, text_input, model_dropdown],
        outputs=output_label,
    )
    text_input.submit(
        fn=process_inputs,
        inputs=[image_input, text_input, model_dropdown],
        outputs=output_label,
    )
    model_dropdown.change(
        fn=process_inputs,
        inputs=[image_input, text_input, model_dropdown],
        outputs=output_label,
    )

    gr.Examples(
        examples=[
            [
                "cat.jpg",
                "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; "
                "a cat is entering the matrix; a cat is entering the void",
                "ViT-B/16",
            ]
        ],
        fn=process_inputs,
        inputs=[image_input, text_input, model_dropdown],
        outputs=output_label,
    )

demo.launch()