import gradio as gr | |
from PIL import Image | |
import spaces | |
import torch | |
from transformers import BlipProcessor, BlipForConditionalGeneration | |
# Hugging Face Hub checkpoint shared by the processor and the captioning model.
_CHECKPOINT = "noamrot/FuseCap_Image_Captioning"

# Use the GPU when one is visible at import time, otherwise fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the preprocessing pipeline and the model weights once, at module load,
# then place the model on the selected device.
processor = BlipProcessor.from_pretrained(_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_CHECKPOINT)
model = model.to(device)
def inference(raw_image):
    """Generate a caption for a single image.

    The image is encoded together with the fixed prompt ``"a picture of "``,
    passed through the module-level captioning model, and the first generated
    sequence is decoded back to text.

    Args:
        raw_image: The image handed over by the Gradio image input; it is
            forwarded unchanged to the module-level ``processor``.

    Returns:
        The decoded caption string with special tokens stripped.
    """
    # Re-evaluate the device on every call so the model and its inputs are
    # guaranteed to live on the same device.
    cur_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Do NOT rebind `model` here: assigning to it would make the name local to
    # this function and the read on the right-hand side would raise
    # UnboundLocalError. `nn.Module.to` moves the parameters in place, so the
    # return value can be discarded.
    model.to(cur_device)

    # NOTE(review): `spaces` is imported by this file but never used in the
    # visible code; if this runs on ZeroGPU hardware a `@spaces.GPU`
    # decorator may be expected on this function -- confirm.

    text = "a picture of "
    inputs = processor(raw_image, text, return_tensors="pt").to(cur_device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption
# Component objects kept at module level. Note that the Interface below is
# built from the "image" / "text" shortcut strings, not from these objects.
inputs = [gr.Image(type='pil', interactive=False)]
outputs = gr.Textbox(label="Caption")

# Text shown under the title of the demo page.
description = (
    "Gradio demo for FuseCap: Leveraging Large Language Models for Enriched Fused Image Captions. "
    "This demo features a BLIP-based model, trained using FuseCap."
)

# Example rows: one image file name per row.
examples = [[file_name] for file_name in ("surfer.jpg", "bike.jpg")]

# HTML footer linking to the paper.
article = (
    "<p style='text-align: center'>"
    "<a href='https://arxiv.org/abs/2305.17718' target='_blank'>"
    "FuseCap: Leveraging Large Language Models for Enriched Fused Image Captions"
    "</a>"
)

# Collect the Interface options in one place, then build the app from them.
_interface_options = {
    "fn": inference,
    "inputs": "image",
    "outputs": "text",
    "title": "FuseCap",
    "description": description,
    "article": article,
    "examples": examples,
}
iface = gr.Interface(**_interface_options)

# Queueing is switched on through queue() rather than an Interface keyword.
iface.queue()
iface.launch()