import torch
import torch.nn.functional as F
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
import spaces

# Supported CLIP variants: display name -> (Hugging Face checkpoint id, image size).
CLIP_MODELS = {
    "ViT-B/32": ("openai/clip-vit-base-patch32", 224),
    "ViT-B/16": ("openai/clip-vit-base-patch16", 224),
    "ViT-L/14": ("openai/clip-vit-large-patch14", 224),
    "ViT-L/14@336px": ("openai/clip-vit-large-patch14-336", 336),
}

# Every variant is loaded once at import time and kept for the lifetime of
# the process; both dicts are keyed by the display names above.
models = {}
processors = {}

for variant, spec in CLIP_MODELS.items():
    # Only the checkpoint id is needed here; the image size is not used
    # when loading.
    checkpoint = spec[0]
    models[variant] = CLIPModel.from_pretrained(checkpoint).to("cuda")
    processors[variant] = CLIPProcessor.from_pretrained(checkpoint)

@spaces.GPU
def calculate_score(image, text, model_name):
    """Score how well each description matches the image using a CLIP model.

    Args:
        image: Image handed to the CLIP processor (the UI supplies a PIL
            image).
        text: Descriptions separated by semicolons. Surrounding whitespace
            is stripped and empty entries are dropped.
        model_name: Key into the module-level ``models`` / ``processors``
            dicts selecting which CLIP variant to use.

    Returns:
        A dict mapping each description to the cosine similarity between its
        text embedding and the image embedding, rescaled from [-1, 1] to
        [0, 100]. An empty dict when no non-empty description is given.

    Raises:
        KeyError: If ``model_name`` is not one of the loaded models.
    """
    # ``text or ""`` tolerates a missing (None) text value.
    labels = [part.strip() for part in (text or "").split(";")]
    labels = [label for label in labels if label]
    if not labels:
        return {}

    model = models[model_name]
    processor = processors[model_name]

    # truncation=True clips over-long descriptions to the tokenizer's maximum
    # length; without it such a description yields more tokens than the text
    # encoder accepts and the forward pass fails.
    inputs = processor(
        text=labels,
        images=[image],
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # L2-normalise so the dot product below is a cosine similarity.
    image_embeds = F.normalize(outputs.image_embeds, p=2, dim=1)
    text_embeds = F.normalize(outputs.text_embeds, p=2, dim=1)

    # One image was passed, so the product has a single column; squeeze it
    # away to get one similarity per label.
    cosine_similarities = torch.mm(text_embeds, image_embeds.t()).squeeze(1)

    # Map cosine similarity from [-1, 1] onto a 0-100 scale.
    # NOTE(review): the result feeds a gr.Label; confirm that component
    # renders 0-100 values as intended rather than expecting 0-1.
    percentages = ((cosine_similarities + 1) / 2 * 100).cpu().numpy()

    return {label: float(score) for label, score in zip(labels, percentages)}

# Build the Gradio UI: an image and a label output on the first row, the
# description textbox and model selector on the second.
with gr.Blocks() as demo:
    gr.Markdown("# Multi-Model CLIP Score")
    gr.Markdown("Calculate the CLIP score (cosine similarity) between the given image and text descriptions using different CLIP model variants")

    with gr.Row():
        image_input = gr.Image(type="pil")
        output_label = gr.Label()

    with gr.Row():
        text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
        model_dropdown = gr.Dropdown(choices=list(CLIP_MODELS.keys()), label="CLIP Model", value="ViT-B/16")

    def process_inputs(image, text, model_name):
        """Run ``calculate_score`` only when both an image and text are present.

        Returns ``None`` when the image is missing or the text is empty or
        whitespace-only; otherwise returns the result of ``calculate_score``.
        """
        # Check truthiness before stripping so a missing (None) text value is
        # treated like empty text instead of raising AttributeError.
        if image is None or not text or not text.strip():
            return None
        return calculate_score(image, text, model_name)

    inputs = [image_input, text_input, model_dropdown]
    outputs = output_label

    # Recompute when the image changes, when the text is submitted, or when
    # another model is selected.
    image_input.change(fn=process_inputs, inputs=inputs, outputs=outputs)
    text_input.submit(fn=process_inputs, inputs=inputs, outputs=outputs)
    model_dropdown.change(fn=process_inputs, inputs=inputs, outputs=outputs)

    gr.Examples(
        examples=[
            [
                "cat.jpg",
                "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
                "ViT-B/16"
            ]
        ],
        fn=process_inputs,
        inputs=inputs,
        outputs=outputs,
    )

demo.launch()