import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration

# Initialize the BLIP processor and model once, at startup
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

def image_caption(image, text_prompt=None):
    if text_prompt:
        # Conditional captioning: the text prompt seeds the generated caption
        inputs = processor(image, text_prompt, return_tensors="pt")
    else:
        # Unconditional captioning
        inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)

# Define the Gradio interface
image_input = gr.Image(type="pil", label="Upload an Image")
text_input = gr.Textbox(lines=1, placeholder="Optional: Enter text prompt", label="Text Prompt")
output = gr.Textbox(label="Generated Caption")

gr.Interface(
    fn=image_caption,
    inputs=[image_input, text_input],
    outputs=output,
    title="Image Captioning with BLIP",
    description="Upload an image and get a generated caption. Optionally, provide a text prompt for conditional captioning.",
).launch()
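
# --- Optional sanity check: a minimal sketch, not part of the original app ---
# gr.Interface(...).launch() above blocks, so to exercise image_caption()
# directly, comment out the .launch() call and run the lines below. The image
# URL is a hypothetical placeholder; substitute any publicly reachable image.
import requests
from PIL import Image

img_url = "https://example.com/sample.jpg"  # hypothetical placeholder URL
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")

print(image_caption(raw_image))                     # unconditional caption
print(image_caption(raw_image, "a photograph of"))  # prompt-seeded caption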