# Spaces: Running
import gradio as gr | |
from transformers import CLIPProcessor, CLIPModel | |
from PIL import Image | |
import torch | |
# Hub checkpoint shared by the model weights and their matching preprocessor.
MODEL_ID = "geolocal/StreetCLIP"

# Both objects are created once at import time and used by classify_image.
processor = CLIPProcessor.from_pretrained(MODEL_ID)
model = CLIPModel.from_pretrained(MODEL_ID)
# Candidate captions scored against the image when the caller supplies none.
_DEFAULT_LABELS = (
    "a photo of a cat",
    "a photo of a dog",
    "a photo of a car",
    "a photo of a tree",
)


def classify_image(image, labels=None):
    """Score an image against text labels using the module-level CLIP model.

    Args:
        image: The image to classify (the Gradio interface in this file passes
            a PIL image), or None when nothing was uploaded.
        labels: Optional candidate captions, either a single string or an
            iterable of strings. Defaults to the built-in example captions.

    Returns:
        A dict mapping each label to its softmax probability. The dict is
        empty when there is no image or no labels to score.
    """
    # Gradio passes None when the user submits without uploading an image;
    # return an empty result instead of crashing inside the processor.
    if image is None:
        return {}

    if labels is None:
        candidates = list(_DEFAULT_LABELS)
    elif isinstance(labels, str):
        # A bare string would otherwise be split into single characters.
        candidates = [labels]
    else:
        candidates = list(labels)
    if not candidates:
        return {}

    # Tokenize the captions and preprocess the image into PyTorch tensors.
    inputs = processor(
        text=candidates, images=image, return_tensors="pt", padding=True
    )

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    # logits_per_image holds the image-text similarity scores; softmax over
    # the label axis turns them into probabilities.
    probs = outputs.logits_per_image.softmax(dim=1)

    # A single image was passed, so row 0 carries its per-label probabilities.
    return dict(zip(candidates, probs[0].tolist()))
# Build the web UI: one PIL image input wired to classify_image, with the
# returned label/probability dict rendered by the "label" output.
image_input = gr.Image(type="pil")
iface = gr.Interface(
    title="Geolocal StreetCLIP Classification",
    description="Upload an image to classify using Geolocal StreetCLIP",
    fn=classify_image,
    inputs=image_input,
    outputs="label",
)

# Start serving the app.
iface.launch()