import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
import spaces
import torch
from PIL import Image
import subprocess
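# The flash-attn install below is left disabled; a likely reason (not stated in
# the file) is that this Space runs the model on CPU, where flash-attn is unused.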
# subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
# Limit PyTorch to 4 CPU threads
torch.set_num_threads(4)

# Load the Florence-2 DocVQA checkpoint and its processor on CPU in eval mode
model = AutoModelForCausalLM.from_pretrained('HuggingFaceM4/Florence-2-DocVQA', trust_remote_code=True).to("cpu").eval()
processor = AutoProcessor.from_pretrained('HuggingFaceM4/Florence-2-DocVQA', trust_remote_code=True)
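# Dynamic quantization: nn.Linear weights are converted to int8 and dequantized
# on the fly at inference time, which shrinks the model and typically speeds up
# CPU matmuls at a small accuracy cost.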
model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
TITLE = "# [Florence-2-DocVQA Demo](https://huggingface.co/HuggingFaceM4/Florence-2-DocVQA)"
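# Note: colormap is not referenced anywhere in this demo; it presumably remains
# from a fuller Florence-2 app that draws bounding boxes for detection-style tasks.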
colormap = ['blue','orange','green','purple','brown','pink','gray','olive','cyan','red',
'lime','indigo','violet','aqua','magenta','coral','gold','tan','skyblue']
# @spaces.GPU
def run_example(task_prompt, image, text_input=None):
    # Build the prompt: the task token, optionally followed by the user's question
    if text_input is None:
        prompt = task_prompt
    else:
        prompt = task_prompt + text_input
    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cpu")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=64,
        do_sample=False,  # greedy decoding
        num_beams=1,  # early_stopping dropped: it only applies to beam search and warns when num_beams=1
    )
    # Keep special tokens: post_process_generation parses the task token from the raw output
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height)
    )
    return parsed_answer
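# Illustrative usage (hypothetical values): post_process_generation returns a dict
# keyed by the task token, so for DocVQA the answer is read back out as
#   run_example('<DocVQA>', img, 'What is the total?')['<DocVQA>']
# which may still contain <pad> tokens; these are stripped in process_image below.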
def process_image(image, text_input=None):
    image = Image.fromarray(image)  # Gradio passes the image in as a NumPy array
    task_prompt = '<DocVQA>'
    results = run_example(task_prompt, image, text_input)[task_prompt].replace("<pad>", "")
    return results
css = """
#output {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
}
"""
with gr.Blocks(css=css) as demo:
    gr.Markdown(TITLE)
    with gr.Tab(label="Florence-2 DocVQA"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                text_input = gr.Textbox(label="Text Input (optional)")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                # elem_id="output" hooks this box up to the #output CSS rule above
                output_text = gr.Textbox(label="Output Text", elem_id="output")

        submit_btn.click(process_image, [input_img, text_input], [output_text])

demo.launch(debug=True)