import torch
from transformers import pipeline, AutoTokenizer
import gradio as gr

# Load the "bert-base-uncased" tokenizer at import time.
# NOTE(review): `tokenizer` is not referenced anywhere else in the visible
# code, and the CLIP pipeline below is created from `clip_checkpoint` without
# being passed this object — confirm whether this load is still needed.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer.clean_up_tokenization_spaces = False  # Explicitly set the parameter if needed

# Load CLIP model for zero-shot classification
# Checkpoint identifier handed to `pipeline` as the model to load.
clip_checkpoint = "openai/clip-vit-base-patch16"
# Created once at import time; `infer` calls this object with an image and a
# `candidate_labels` list for every request.
clip_detector = pipeline(model=clip_checkpoint, task="zero-shot-image-classification")

# Postprocess the output from CLIP
def postprocess(output):
    """Convert detector output into a ``{label: score}`` mapping.

    Args:
        output: Iterable of mappings, each with a ``"label"`` and a
            ``"score"`` entry.

    Returns:
        A dict keyed by label whose values are the scores coerced to
        ``float``. If a label occurs more than once, the last score wins.
    """
    scores_by_label = {}
    for entry in output:
        name = entry["label"]
        scores_by_label[name] = float(entry["score"])
    return scores_by_label

# Inference function for CLIP
def infer(image, candidate_labels):
    """Classify ``image`` against a comma-separated list of labels with CLIP.

    Args:
        image: Image forwarded unchanged to ``clip_detector``.
        candidate_labels: Comma-separated label text, e.g. ``"cat, dog"``.
            ``None`` is treated like an empty string.

    Returns:
        The mapping built by ``postprocess`` from the detector output, or an
        empty dict when no usable label was supplied (the model is not called
        in that case).
    """
    # Strip whitespace on BOTH sides: input such as "cat , dog" would otherwise
    # produce the label "cat " with a trailing space.
    stripped = (label.strip() for label in (candidate_labels or "").split(","))
    # Drop empty entries left by blank input or stray/trailing commas, and
    # de-duplicate while keeping first-seen order; repeated labels would
    # collapse into a single key in the result dict anyway.
    labels = list(dict.fromkeys(label for label in stripped if label))
    if not labels:
        # Nothing to classify against: skip the model call entirely.
        return {}
    clip_out = clip_detector(image, candidate_labels=labels)
    return postprocess(clip_out)

# Gradio interface
# Two-column layout: inputs (image, label text, run button) on the left and
# the classification result on the right. Components are created inside the
# nested context managers, so their order here defines the page layout.
with gr.Blocks() as app:
    gr.Markdown("# Custom Classification")
    with gr.Row():
        with gr.Column():
            # type="pil": the image is requested as a PIL image, which is what
            # `infer` forwards to the CLIP pipeline.
            image_input = gr.Image(type="pil")
            # Free-text field; `infer` splits its value on commas.
            text_input = gr.Textbox(label="Input a list of labels")
            run_button = gr.Button("Run")

        with gr.Column():
            # Receives the {label: score} dict returned by `infer`.
            clip_output = gr.Label(label="Output", num_top_classes=3)
    
    # One example row: [image path, label text], matching the order of `inputs`.
    # NOTE(review): the path is relative, so "image_8.webp" presumably has to
    # exist in the working directory — confirm. With cache_examples=True the
    # example output is cached, which likely means `infer` runs on it at
    # startup — verify against the Gradio version in use.
    examples = [["image_8.webp", "girl, boy, lgbtq"]]
    gr.Examples(
        examples=examples, 
        inputs=[image_input, text_input],
        outputs=[clip_output],
        fn=infer,
        cache_examples=True
    )
    
    # Clicking "Run" calls `infer` with the current image and label text and
    # writes the result to the output label.
    run_button.click(fn=infer,
                     inputs=[image_input, text_input],
                     outputs=[clip_output])

# Start the Gradio app. This runs unconditionally at module level, i.e. also
# when the file is imported.
app.launch()