Qwen2-VL-2B

Runtime error

App Files Files Community

vykanand commited on 19 days ago

Commit

bd6f71c

verified ·

1 Parent(s): 73d58c2

Create new-app.py

Browse files

Files changed (1) hide show

new-app.py +128 -0

new-app.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import gradio as gr
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+import torch
+import uuid
+import io
+from PIL import Image
+from threading import Thread
+# Define model options (for the OCR model specifically)
+MODEL_OPTIONS = {
+    "Latex OCR": "prithivMLmods/Qwen2-VL-OCR-2B-Instruct",
+}
+# Preload models and processors into CUDA
+models = {}
+processors = {}
+for name, model_id in MODEL_OPTIONS.items():
+    print(f"Loading {name}...")
+    models[name] = Qwen2VLForConditionalGeneration.from_pretrained(
+        model_id,
+        trust_remote_code=True,
+        torch_dtype=torch.float16
+    ).to("cuda").eval()
+    processors[name] = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+image_extensions = Image.registered_extensions()
+def identify_and_save_blob(blob_path):
+    """Identifies if the blob is an image and saves it."""
+    try:
+        with open(blob_path, 'rb') as file:
+            blob_content = file.read()
+            try:
+                Image.open(io.BytesIO(blob_content)).verify()  # Check if it's a valid image
+                extension = ".png"  # Default to PNG for saving
+                media_type = "image"
+            except (IOError, SyntaxError):
+                raise ValueError("Unsupported media type. Please upload a valid image.")
+            filename = f"temp_{uuid.uuid4()}_media{extension}"
+            with open(filename, "wb") as f:
+                f.write(blob_content)
+            return filename, media_type
+    except FileNotFoundError:
+        raise ValueError(f"The file {blob_path} was not found.")
+    except Exception as e:
+        raise ValueError(f"An error occurred while processing the file: {e}")
+def qwen_inference(model_name, media_input, text_input=None):
+    """Handles inference for the selected model."""
+    model = models[model_name]
+    processor = processors[model_name]
+    if isinstance(media_input, str):
+        media_path = media_input
+        if media_path.endswith(tuple([i for i in image_extensions.keys()])):
+            media_type = "image"
+        else:
+            try:
+                media_path, media_type = identify_and_save_blob(media_input)
+            except Exception as e:
+                raise ValueError("Unsupported media type. Please upload a valid image.")
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": media_type,
+                    media_type: media_path
+                },
+                {"type": "text", "text": text_input},
+            ],
+        }
+    ]
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, _ = process_vision_info(messages)
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        padding=True,
+        return_tensors="pt",
+    ).to("cuda")
+    streamer = TextIteratorStreamer(
+        processor.tokenizer, skip_prompt=True, skip_special_tokens=True
+    )
+    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        # Remove <|im_end|> or similar tokens from the output
+        buffer = buffer.replace("<|im_end|>", "")
+        yield buffer
+def ocr_endpoint(image, question):
+    """This function will be exposed to the /ocr endpoint for OCR processing."""
+    return qwen_inference("Latex OCR", image, question)
+# Gradio app setup for OCR endpoint
+with gr.Blocks() as demo:
+    gr.Markdown("# Qwen2VL OCR Model - Latex OCR")
+    with gr.Row():
+        with gr.Column():
+            input_media = gr.File(label="Upload Image", type="filepath")
+            text_input = gr.Textbox(label="Question", placeholder="Ask a question about the image...")
+            submit_btn = gr.Button(value="Submit")
+        with gr.Column():
+            output_text = gr.Textbox(label="Output Text", lines=10)
+    submit_btn.click(
+        ocr_endpoint, [input_media, text_input], [output_text]
+    )
+# Launch the app on the /ocr endpoint
+demo.queue().launch(server_name="0.0.0.0", server_port=7860, share=True)