import gradio as gr
from PIL import Image
import spaces
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

# Use the GPU when one is visible; on ZeroGPU Spaces the GPU is attached
# only while a @spaces.GPU-decorated function is running.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the FuseCap BLIP captioning model and its processor once at startup.
processor = BlipProcessor.from_pretrained("noamrot/FuseCap_Image_Captioning")
model = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap_Image_Captioning").to(device)

@spaces.GPU(duration=15)
def inference(raw_image):
    # Re-check the device inside the call: under ZeroGPU, CUDA becomes
    # available only while this decorated function runs.
    cur_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(cur_device)  # nn.Module.to moves in place; no rebinding of the global needed
    text = "a picture of "
    inputs = processor(raw_image, text, return_tensors="pt").to(cur_device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption
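
# Quick local check, assuming an image file such as "surfer.jpg" is present
# (one of the example files referenced below):
#   img = Image.open("surfer.jpg").convert("RGB")
#   print(inference(img))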

# Gradio components used by the Interface below.
image_input = gr.Image(type='pil', label="Image")
caption_output = gr.Textbox(label="Caption")
description = "Gradio demo for FuseCap: Leveraging Large Language Models for Enriched Fused Image Captions. This demo features a BLIP-based model, trained using FuseCap."
examples = [["surfer.jpg"], ["bike.jpg"]]
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2305.17718' target='_blank'>FuseCap: Leveraging Large Language Models for Enriched Fused Image Captions</a>"
iface = gr.Interface(fn=inference,
                     inputs=image_input,
                     outputs=caption_output,
                     title="FuseCap",
                     description=description,
                     article=article,
                     examples=examples)
iface.queue()
iface.launch()
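
# A minimal sketch of calling the hosted demo programmatically, assuming a
# recent gradio_client package and that the Space id is "noamrot/FuseCap"
# (the id is an assumption for illustration):
#   from gradio_client import Client, handle_file
#   client = Client("noamrot/FuseCap")
#   print(client.predict(handle_file("surfer.jpg"), api_name="/predict"))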