import gradio as gr
import torch
from transformers import CLIPProcessor, CLIPModel

# Load the model and processor
model = CLIPModel.from_pretrained("geolocal/StreetCLIP")
processor = CLIPProcessor.from_pretrained("geolocal/StreetCLIP")

# Example candidate labels for zero-shot geolocation; replace with your own set
labels = ["France", "Germany", "Japan", "Brazil", "United States", "South Africa"]

def classify_image(image):
    # Preprocess the image together with the candidate text labels
    # (CLIP needs both to compute image-text similarity)
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)

    # Perform the inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Postprocess the outputs
    logits_per_image = outputs.logits_per_image  # image-text similarity scores
    probs = logits_per_image.softmax(dim=1)[0]   # softmax over the candidate labels

    # Map each label to its probability for the gr.Label component
    return {label: float(prob) for label, prob in zip(labels, probs)}

# Define the Gradio interface
iface = gr.Interface(
    fn=classify_image,
    inputs=gr.Image(type="pil"),
    outputs=gr.Label(num_top_classes=5),
    title="Geolocal StreetCLIP Classification",
    description="Upload an image to classify using Geolocal StreetCLIP",
)

# Launch the interface
iface.launch()
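# Optional quick check without the web UI -- a minimal sketch that assumes a
# local image file named "street.jpg" (hypothetical path) exists next to this
# script:
#
#   from PIL import Image
#   print(classify_image(Image.open("street.jpg")))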