File size: 1,403 Bytes

512e1cc
 
3d6b1e3
512e1cc
 
3d6b1e3
512e1cc
 
 
3d6b1e3
 
 
512e1cc
 
3d6b1e3
 
512e1cc
 
 
 
3d6b1e3
512e1cc
 
 
3d6b1e3
 
 
 
 
512e1cc
 
3d6b1e3
512e1cc
3d6b1e3
 
512e1cc

import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
from PIL import Image

# Hub repository that provides both the processor and the model weights.
# Keeping the id in one place guarantees the two loads stay in sync.
MODEL_ID = "guneetsk99/finance_qwen_VL_7B"

# Both objects are created once at import time and shared by every request.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)

def predict(input_img, text_prompt):
    """Generate a text response for an image and an accompanying prompt.

    Args:
        input_img: Image forwarded to the processor as ``images``.
        text_prompt: Prompt string forwarded to the processor as ``text``.

    Returns:
        A ``(input_img, generated_text)`` tuple: the image exactly as it was
        received, and the decoded model output with special tokens removed.

    Raises:
        gr.Error: If no image was supplied.
    """
    if input_img is None:
        # An empty image input arrives as None; surface a readable message in
        # the UI instead of an opaque failure deep inside the processor.
        raise gr.Error("Please upload an image before submitting.")

    # NOTE(review): Qwen-VL style processors usually expect the prompt to be
    # built with `processor.apply_chat_template(...)` so that image
    # placeholder tokens are present - confirm against the model card.
    inputs = processor(images=input_img, text=text_prompt, return_tensors="pt").to(model.device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50)

    generated_ids = outputs[0]
    is_encoder_decoder = getattr(model.config, "is_encoder_decoder", False)
    if not is_encoder_decoder and "input_ids" in inputs:
        # Decoder-only models return the prompt tokens followed by the newly
        # generated ones; drop the prompt so the response does not echo it.
        prompt_length = inputs["input_ids"].shape[-1]
        generated_ids = generated_ids[prompt_length:]

    generated_text = processor.decode(generated_ids, skip_special_tokens=True)

    return input_img, generated_text

# Create the Gradio interface: an image plus a prompt go in, the same image
# and the model's text response come out.
gradio_app = gr.Interface(
    fn=predict,
    inputs=[
        # Gradio 4.0 replaced the singular `source=` keyword with `sources=`
        # (a list); passing the old keyword raises TypeError on Gradio >= 4.
        gr.Image(label="Upload Image", sources=["upload"], type="pil"),
        gr.Textbox(label="Text Prompt", placeholder="Enter a text prompt, e.g., 'Describe this image.'"),
    ],
    outputs=[
        gr.Image(label="Uploaded Image"),
        gr.Textbox(label="Generated Response"),
    ],
    title="Finance Image-to-Text Model",
    description="Upload a financial document image and provide a text prompt for the model to process the image and generate a text response.",
)

# Start the web server only when the file is run as a script, not on import.
if __name__ == "__main__":
    gradio_app.launch()