Florence-2-DEMO

Sleeping

App Files Community

yuting89830 committed on Oct 22, 2024

Commit

f213fc3

verified ·

1 Parent(s): a8f49dd

Upload 3 files

Browse files

Files changed (2) hide show

app.py +13 -28
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,40 +1,40 @@
 import gradio as gr
 from transformers import AutoProcessor, AutoModelForCausalLM
 import spaces
 from PIL import Image
 import subprocess
-subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
-model = AutoModelForCausalLM.from_pretrained('HuggingFaceM4/Florence-2-DocVQA', trust_remote_code=True).to("cuda").eval()
-processor = AutoProcessor.from_pretrained('HuggingFaceM4/Florence-2-DocVQA', trust_remote_code=True)
-TITLE = "# [Florence-2-DocVQA Demo](https://huggingface.co/HuggingFaceM4/Florence-2-DocVQA)"
-DESCRIPTION = "The demo for Florence-2 fine-tuned on DocVQA dataset. You can find the notebook [here](https://colab.research.google.com/drive/1hKDrJ5AH_o7I95PtZ9__VlCTNAo1Gjpf?usp=sharing). Read more about Florence-2 fine-tuning [here](finetune-florence2)."
 colormap = ['blue','orange','green','purple','brown','pink','gray','olive','cyan','red',
             'lime','indigo','violet','aqua','magenta','coral','gold','tan','skyblue']
-@spaces.GPU
 def run_example(task_prompt, image, text_input=None):
     if text_input is None:
         prompt = task_prompt
     else:
         prompt = task_prompt + text_input
-    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
     generated_ids = model.generate(
         input_ids=inputs["input_ids"],
         pixel_values=inputs["pixel_values"],
-        max_new_tokens=1024,
-        early_stopping=False,
         do_sample=False,
-        num_beams=3,
     )
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
     parsed_answer = processor.post_process_generation(
@@ -61,7 +61,6 @@ css = """
 with gr.Blocks(css=css) as demo:
     gr.Markdown(TITLE)
-    gr.Markdown(DESCRIPTION)
     with gr.Tab(label="Florence-2 Image Captioning"):
         with gr.Row():
             with gr.Column():
@@ -71,20 +70,6 @@ with gr.Blocks(css=css) as demo:
             with gr.Column():
                 output_text = gr.Textbox(label="Output Text")
-        gr.Examples(
-            examples=[
-                ["hunt.jpg", 'What is this image?'],
-                ["idefics2_architecture.png", 'How many tokens per image does it use?'],
-                ["idefics2_architecture.png", "What type of encoder does the model use?"],
-                ["image.jpg", "What's the share of Industry Switchers Gained?"]
-            ],
-            inputs=[input_img, text_input],
-            outputs=[output_text],
-            fn=process_image,
-            cache_examples=True,
-            label='Try the examples below'
-        )
         submit_btn.click(process_image, [input_img, text_input], [output_text])
-demo.launch(debug=True)

 import gradio as gr
 from transformers import AutoProcessor, AutoModelForCausalLM
 import spaces
+import torch
 from PIL import Image
 import subprocess
+# subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+torch.set_num_threads(4)
+model = AutoModelForCausalLM.from_pretrained('HuggingFaceM4/Florence-2-DocVQA', trust_remote_code=True).to("cpu").eval()
+processor = AutoProcessor.from_pretrained('HuggingFaceM4/Florence-2-DocVQA', trust_remote_code=True)
+model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
+TITLE = "# [Florence-2-DocVQA Demo]"
 colormap = ['blue','orange','green','purple','brown','pink','gray','olive','cyan','red',
             'lime','indigo','violet','aqua','magenta','coral','gold','tan','skyblue']
+# @spaces.GPU
 def run_example(task_prompt, image, text_input=None):
     if text_input is None:
         prompt = task_prompt
     else:
         prompt = task_prompt + text_input
+    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cpu")
     generated_ids = model.generate(
         input_ids=inputs["input_ids"],
         pixel_values=inputs["pixel_values"],
+        max_new_tokens=64,
+        early_stopping=True,
         do_sample=False,
+        num_beams=1,
     )
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
     parsed_answer = processor.post_process_generation(
 with gr.Blocks(css=css) as demo:
     gr.Markdown(TITLE)
     with gr.Tab(label="Florence-2 Image Captioning"):
         with gr.Row():
             with gr.Column():
             with gr.Column():
                 output_text = gr.Textbox(label="Output Text")
         submit_btn.click(process_image, [input_img, text_input], [output_text])
+demo.launch(debug=True)

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 spaces
 transformers
-timm

 spaces
 transformers
+timm
+einops