import gradio as gr
from transformers import AutoProcessor, CLIPModel


# Load the CLIP model and its matching processor from the same pretrained
# checkpoint, so both are always resolved from a single identifier.
_CHECKPOINT = "patrickjohncyh/fashion-clip"

model = CLIPModel.from_pretrained(_CHECKPOINT)
processor = AutoProcessor.from_pretrained(_CHECKPOINT)

def classify_image_with_text(text, image):
    """Classify an image against a fixed pair of text prompts.

    The image is scored against each candidate prompt with the module-level
    CLIP ``model`` / ``processor`` and the best-matching prompt is returned.

    Args:
        text: Prompt entered in the UI. It is accepted so the signature
            matches the interface inputs, but it is currently not used:
            the candidate prompts are fixed inside this function.
        image: The image to classify, in a format accepted by ``processor``.

    Returns:
        The candidate prompt (a string) with the highest probability.

    Raises:
        ValueError: If ``image`` is ``None`` (no image was provided).
    """
    # Fail early with a clear message instead of an opaque model error.
    if image is None:
        raise ValueError("An image is required for classification.")

    candidate_labels = ["a photo of a man", "a photo of woman"]
    inputs = processor(
        text=candidate_labels, images=image, return_tensors="pt", padding=True
    )
    outputs = model(**inputs)
    # logits_per_image is the image-text similarity score; softmax over the
    # text axis turns it into one probability per candidate prompt.
    probs = outputs.logits_per_image.softmax(dim=1)
    predicted_class_index = probs.argmax(dim=1).item()
    # The index refers to a position in candidate_labels, NOT to a tokenizer
    # vocabulary id, so look the label up in the list directly.
    return candidate_labels[predicted_class_index]
    
# Build the Gradio interface: a one-line prompt textbox and an image as
# inputs, a textbox as output. gr.Interface is constructed and launched
# directly; it does not need to be wrapped in a `with` block.
iface = gr.Interface(
    fn=classify_image_with_text,
    inputs=[gr.Textbox(lines=1, label="Prompt"), gr.Image(label="Image")],
    outputs=gr.Textbox(label='Sortie de l\'API'),
    title="SD Models",
)
iface.launch()