import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
import torch

from PIL import Image

# import spaces  # only needed for the @spaces.GPU decorator on Hugging Face Spaces (disabled below)

# flash-attn only helps on GPU; this demo runs on CPU, so the install is disabled.
# import subprocess
# subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

# Cap PyTorch's CPU threads to keep the shared host responsive.
torch.set_num_threads(4)

# Load the Florence-2 DocVQA checkpoint for CPU-only inference.
model = AutoModelForCausalLM.from_pretrained('HuggingFaceM4/Florence-2-DocVQA', trust_remote_code=True).to("cpu").eval()
processor = AutoProcessor.from_pretrained('HuggingFaceM4/Florence-2-DocVQA', trust_remote_code=True)

# Quantize all Linear layers to int8 to cut memory use and speed up CPU matmuls.
model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
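
# Dynamic quantization swaps each nn.Linear's fp32 weights for int8 and
# quantizes activations on the fly during matmul, usually shrinking the model
# and speeding up CPU inference at a small accuracy cost. A minimal sanity
# check (a sketch: `count_dynamic_linears` is a hypothetical helper, and on
# newer PyTorch the class lives at torch.ao.nn.quantized.dynamic.Linear):
#
#   def count_dynamic_linears(m):
#       return sum(isinstance(mod, torch.nn.quantized.dynamic.Linear)
#                  for mod in m.modules())
#
#   print(f"{count_dynamic_linears(model)} Linear layers quantized to int8")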

TITLE = "# Florence-2-DocVQA Demo"


# Palette for drawing detection boxes; unused in this DocVQA-only demo.
colormap = ['blue','orange','green','purple','brown','pink','gray','olive','cyan','red',
            'lime','indigo','violet','aqua','magenta','coral','gold','tan','skyblue']

# @spaces.GPU  # re-enable (together with `import spaces`) when running on a GPU Space
def run_example(task_prompt, image, text_input=None):
    # Florence-2 prompts are a task token, optionally followed by the question.
    prompt = task_prompt if text_input is None else task_prompt + text_input
    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cpu")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=64,
        do_sample=False,  # greedy decoding; early_stopping only applies to beam search
        num_beams=1,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height)
    )
    return parsed_answer
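
# Example usage (a sketch: 'document.png' is a hypothetical file, and the
# exact contents of the output dict depend on the checkpoint's post-processing):
#
#   img = Image.open("document.png")
#   answer = run_example('<DocVQA>', img, 'What is the invoice total?')
#   # -> {'<DocVQA>': '... answer text ...'}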

def process_image(image, text_input=None):
    image = Image.fromarray(image)  # Convert NumPy array to PIL Image
    task_prompt = '<DocVQA>'
    results = run_example(task_prompt, image, text_input)[task_prompt].replace("<pad>", "")
    return results


css = """
  #output {
    height: 500px; 
    overflow: auto; 
    border: 1px solid #ccc; 
  }
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(TITLE)
    with gr.Tab(label="Florence-2 DocVQA"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                text_input = gr.Textbox(label="Text Input (optional)")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        submit_btn.click(process_image, [input_img, text_input], [output_text])
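
        # Optionally also run the handler when the user presses Enter in the
        # textbox (a sketch; Textbox.submit takes the same fn/inputs/outputs):
        # text_input.submit(process_image, [input_img, text_input], [output_text])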

demo.launch(debug=True)