# CLIPScore / app.py
import torch
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
import spaces
# Dictionary of available CLIP models with their image sizes
CLIP_MODELS = {
    "ViT-B/32": ("openai/clip-vit-base-patch32", 224),
    "ViT-B/16": ("openai/clip-vit-base-patch16", 224),
    "ViT-L/14": ("openai/clip-vit-large-patch14", 224),
    "ViT-L/14@336px": ("openai/clip-vit-large-patch14-336", 336),
}
# Initialize models and processors
models = {}
processors = {}
for model_name, (model_path, _) in CLIP_MODELS.items():
    models[model_name] = CLIPModel.from_pretrained(model_path).to("cuda")
    processors[model_name] = CLIPProcessor.from_pretrained(model_path)
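# Hedged variant (assumption, not part of the original Space): the startup loop
# above assumes CUDA is always available, which holds on the GPU Space but makes
# the script crash locally on CPU-only machines. A device fallback like the
# commented sketch below would keep it runnable everywhere (calculate_score's
# .to("cuda") calls would need the same substitution):
#
# device = "cuda" if torch.cuda.is_available() else "cpu"
# for model_name, (model_path, _) in CLIP_MODELS.items():
#     models[model_name] = CLIPModel.from_pretrained(model_path).to(device)
#     processors[model_name] = CLIPProcessor.from_pretrained(model_path)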
@spaces.GPU
def calculate_score(image, text, model_name):
    # Split the semicolon-separated descriptions into a clean list of labels
    labels = [label.strip() for label in text.split(";")]
    labels = list(filter(None, labels))
    if len(labels) == 0:
        return dict()
    model = models[model_name]
    processor = processors[model_name]
    # The processor resizes and normalizes the image to the model's expected
    # input size (224px, or 336px for ViT-L/14@336px), so no manual resize is needed
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    # logits_per_image holds the image-text similarities: cosine similarity
    # multiplied by CLIP's learned logit scale (~100 for the OpenAI checkpoints)
    with torch.no_grad():
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image.cpu().numpy()
    # Dividing by 100 approximately recovers the raw cosine similarity per label
    results_dict = {label: score / 100.0 for label, score in zip(labels, logits_per_image[0])}
    return results_dict
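# Hedged sketch (not part of the original app): calculate_score returns per-label
# similarities that are independent of one another. CLIP's standard zero-shot
# classification recipe instead applies a softmax over logits_per_image, so the
# label scores form a probability distribution summing to 1. The function name
# and its use of the globals above are assumptions for illustration.
@spaces.GPU
def calculate_probabilities(image, text, model_name):
    labels = [label.strip() for label in text.split(";") if label.strip()]
    if not labels:
        return dict()
    model, processor = models[model_name], processors[model_name]
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    with torch.no_grad():
        # softmax over the label dimension turns similarities into probabilities
        probs = model(**inputs).logits_per_image.softmax(dim=-1).cpu().numpy()
    return {label: float(p) for label, p in zip(labels, probs[0])}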
with gr.Blocks() as demo:
    gr.Markdown("# Multi-Model CLIP Score")
    gr.Markdown(
        "Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image "
        "and text using different CLIP model variants"
    )
    with gr.Row():
        image_input = gr.Image(type="pil")
        output_label = gr.Label()
    with gr.Row():
        text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
        model_dropdown = gr.Dropdown(
            choices=list(CLIP_MODELS.keys()), label="CLIP Model", value="ViT-B/16"
        )
    def process_inputs(image, text, model_name):
        # Skip scoring until both an image and at least one description are provided
        if image is None or text.strip() == "":
            return None
        return calculate_score(image, text, model_name)
    image_input.change(
        fn=process_inputs,
        inputs=[image_input, text_input, model_dropdown],
        outputs=output_label,
    )
    text_input.submit(
        fn=process_inputs,
        inputs=[image_input, text_input, model_dropdown],
        outputs=output_label,
    )
    model_dropdown.change(
        fn=process_inputs,
        inputs=[image_input, text_input, model_dropdown],
        outputs=output_label,
    )
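    # Equivalent, more compact wiring (assumption: a Gradio 4.x environment, where
    # gr.on attaches one callback to several event triggers at once). Left
    # commented out so the three explicit listeners above remain the live code:
    #
    # gr.on(
    #     triggers=[image_input.change, text_input.submit, model_dropdown.change],
    #     fn=process_inputs,
    #     inputs=[image_input, text_input, model_dropdown],
    #     outputs=output_label,
    # )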
    gr.Examples(
        examples=[
            [
                "cat.jpg",
                "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
                "ViT-B/16",
            ]
        ],
        fn=process_inputs,
        inputs=[image_input, text_input, model_dropdown],
        outputs=output_label,
    )
demo.launch()
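# Note (assumption, not in the original file): on Spaces backed by shared GPUs it
# is common to enable Gradio's request queue so concurrent users are served in
# order, e.g. demo.queue().launch() in place of the plain launch above.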