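# Gradio demo for FuseCap image captioning (paper: https://arxiv.org/abs/2305.17718).
# Loads a BLIP-based checkpoint fine-tuned with FuseCap and serves it via gr.Interface.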
import gradio as gr
from PIL import Image
# import spaces  # needed only when running on a Hugging Face ZeroGPU Space
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

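# Load the FuseCap checkpoint (a BLIP captioning model) from the Hugging Face Hub
# and move it to the GPU when one is available; it also runs on CPU, just more slowly.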
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
processor = BlipProcessor.from_pretrained("noamrot/FuseCap_Image_Captioning")
model = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap_Image_Captioning").to(device)

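# Caption a single PIL image.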
# @spaces.GPU(duration=15)  # on ZeroGPU, reserves a GPU for up to 15 s per call
def inference(raw_image):
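    # BLIP-style conditional captioning: generation continues from this prompt prefix.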
    text = "a picture of "
    inputs = processor(raw_image, text, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption

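# UI components. interactive=False restricts the image input to the bundled examples;
# remove that flag to let users upload their own images.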
inputs = gr.Image(type="pil", interactive=False)
outputs = gr.Textbox(label="Caption")

description = "Gradio demo for FuseCap: Leveraging Large Language Models for Enriched Fused Image Captions. This demo features a BLIP-based model trained using FuseCap."
examples = [["surfer.jpg"], ["bike.jpg"]]
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2305.17718' target='_blank'>FuseCap: Leveraging Large Language Models for Enriched Fused Image Captions</a></p>"

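# Assemble the demo page; the example paths assume surfer.jpg and bike.jpg sit
# alongside this script.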
iface = gr.Interface(fn=inference, 
                     inputs=inputs,
                     outputs=outputs, 
                     title="FuseCap",
                     description=description,
                     article=article,
                     examples=examples)

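# queue() serializes requests so concurrent users don't contend for the model;
# launch() serves the app (http://127.0.0.1:7860 by default).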
iface.queue()
iface.launch()