Spaces: Running on Zero

switch to qwen2.5 vl

app.py CHANGED
@@ -1,14 +1,15 @@
-
-
-# subprocess.run(
-#     "pip install flash-attn --no-build-isolation",
-#     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-#     shell=True,
-# )

 import spaces
 import gradio as gr
-
 import torch
 import os
 import json
@@ -17,19 +18,15 @@ from typing import Tuple

 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

-
-model =
-
-
-
-    device_map=
-)
-processor = AutoProcessor.from_pretrained(
-    'allenai/Molmo-7B-D-0924',
-    trust_remote_code=True,
-    torch_dtype='auto',
-    device_map='auto'
 )

 class GeneralRetrievalQuery(BaseModel):
     broad_topical_query: str
@@ -39,6 +36,7 @@ class GeneralRetrievalQuery(BaseModel):
     visual_element_query: str
     visual_element_explanation: str

 def get_retrieval_prompt(prompt_name: str) -> Tuple[str, GeneralRetrievalQuery]:
     if prompt_name != "general":
         raise ValueError("Only 'general' prompt is available in this version")
@@ -72,46 +70,77 @@ Format your response as a JSON object with the following structure:
 If there are no relevant visual elements, replace the third query with another specific detail query.

 Here is the document image to analyze:

-Generate the queries based on this image and provide the response in the specified JSON format.
-Only return JSON. Don't return any extra explanation text. """

     return prompt, GeneralRetrievalQuery

 prompt, pydantic_model = get_retrieval_prompt("general")

 def _prep_data_for_input(image):
-
-
-
     )

-
 def generate_response(image):
     inputs = _prep_data_for_input(image)
-    inputs =
-
-
-
-
     )
-    generated_tokens = output[0, inputs['input_ids'].size(1):]
-    output_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
-
     try:
-        return
     except Exception:
         gr.Warning("Failed to parse JSON from output")
-        return

-title = "ColPali
 description = """[ColPali](https://huggingface.co/papers/2407.01449) is a very exciting new approach to multimodal document retrieval which aims to replace existing document retrievers which often rely on an OCR step with an end-to-end multimodal approach.

 To train or fine-tune a ColPali model, we need a dataset of image-text pairs which represent the document images and the relevant text queries which those documents should match.
 To make the ColPali models work even better we might want a dataset of query/image document pairs related to our domain or task.

 One way in which we might go about generating such a dataset is to use a VLM to generate synthetic queries for us.
-This space uses the [

 **Note** there is a lot of scope for improving the prompts and the quality of the generated queries! If you have any suggestions for improvements please [open a Discussion](https://huggingface.co/spaces/davanstrien/ColPali-Query-Generator/discussions/new)!

@@ -128,7 +157,7 @@ examples = [
 demo = gr.Interface(
     fn=generate_response,
     inputs=gr.Image(type="pil"),
-    outputs=gr.
     title=title,
     description=description,
     examples=examples,
app.py (new version):

+import subprocess  # 🥲

+subprocess.run(
+    "pip install flash-attn --no-build-isolation",
+    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+    shell=True,
+)
 import spaces
 import gradio as gr
+
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
 import torch
 import os
 import json

 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

+
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    "Qwen/Qwen2.5-VL-7B-Instruct",
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+    device_map="auto",
 )
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
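Aside (not part of app.py): the load above hard-codes attn_implementation="flash_attention_2", which assumes the runtime flash-attn install at the top of the file succeeded. A minimal sketch of a more defensive load that falls back to PyTorch's SDPA attention when the wheel is missing:

import importlib.util

import torch
from transformers import Qwen2_5_VLForConditionalGeneration

# Use flash-attn only if the package actually imported; otherwise fall back to SDPA.
attn_impl = "flash_attention_2" if importlib.util.find_spec("flash_attn") else "sdpa"

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",
    torch_dtype=torch.bfloat16,
    attn_implementation=attn_impl,
    device_map="auto",
)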
+

 class GeneralRetrievalQuery(BaseModel):
     broad_topical_query: str
     visual_element_query: str
     visual_element_explanation: str

+
 def get_retrieval_prompt(prompt_name: str) -> Tuple[str, GeneralRetrievalQuery]:
     if prompt_name != "general":
         raise ValueError("Only 'general' prompt is available in this version")
 If there are no relevant visual elements, replace the third query with another specific detail query.

 Here is the document image to analyze:
+<image>

+Generate the queries based on this image and provide the response in the specified JSON format."""

     return prompt, GeneralRetrievalQuery

+
+# defined like this so we can later add more prompting options
 prompt, pydantic_model = get_retrieval_prompt("general")

+
 def _prep_data_for_input(image):
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": image,
+                },
+                {"type": "text", "text": prompt},
+            ],
+        }
+    ]
+
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    image_inputs, video_inputs = process_vision_info(messages)
+
+    return processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
     )
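Aside (not part of app.py): _prep_data_for_input hands the processor's batch straight to the model. A quick way to inspect what the Qwen2.5-VL processor produces for a single image; the file name is illustrative and the exact key set may vary with the transformers version:

from PIL import Image

batch = _prep_data_for_input(Image.open("example_doc.png"))  # illustrative local file
print({k: tuple(v.shape) for k, v in batch.items()})
# Typically yields input_ids, attention_mask, pixel_values and image_grid_thw tensors.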

+
+@spaces.GPU
 def generate_response(image):
     inputs = _prep_data_for_input(image)
+    inputs = inputs.to("cuda")
+
+    generated_ids = model.generate(**inputs, max_new_tokens=200)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :]
+        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+
+    output_text = processor.batch_decode(
+        generated_ids_trimmed,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False,
     )
     try:
+        return json.loads(output_text[0])
     except Exception:
         gr.Warning("Failed to parse JSON from output")
+        return {}
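Aside (not part of app.py): get_retrieval_prompt also returns the GeneralRetrievalQuery pydantic model, which generate_response does not use yet. A minimal sketch of validating the parsed output against that schema, assuming pydantic v2's model_validate/model_dump API:

from pydantic import ValidationError

def validate_queries(raw: dict) -> dict:
    """Return the queries as a plain dict if they match GeneralRetrievalQuery, else {}."""
    try:
        return pydantic_model.model_validate(raw).model_dump()
    except ValidationError:
        gr.Warning("Model output did not match the expected query schema")
        return {}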
+

+title = "ColPali Query Generator using Qwen2.5-VL"
 description = """[ColPali](https://huggingface.co/papers/2407.01449) is a very exciting new approach to multimodal document retrieval which aims to replace existing document retrievers which often rely on an OCR step with an end-to-end multimodal approach.

 To train or fine-tune a ColPali model, we need a dataset of image-text pairs which represent the document images and the relevant text queries which those documents should match.
 To make the ColPali models work even better we might want a dataset of query/image document pairs related to our domain or task.

 One way in which we might go about generating such a dataset is to use a VLM to generate synthetic queries for us.
+This space uses the [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) VLM model to generate queries for a document, based on an input document image.

 **Note** there is a lot of scope for improving the prompts and the quality of the generated queries! If you have any suggestions for improvements please [open a Discussion](https://huggingface.co/spaces/davanstrien/ColPali-Query-Generator/discussions/new)!

 demo = gr.Interface(
     fn=generate_response,
     inputs=gr.Image(type="pil"),
+    outputs=gr.Json(),
     title=title,
     description=description,
     examples=examples,
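The hunk ends before the gr.Interface(...) call is closed; presumably the closing parenthesis and a launch call follow as before. A hedged sketch of exercising the updated Space locally (the image path is illustrative):

from PIL import Image

if __name__ == "__main__":
    # Call the handler directly on a local document image...
    print(generate_response(Image.open("example_doc.png")))
    # ...or serve the Gradio UI.
    demo.launch()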