import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText

# Load the fine-tuned Qwen-VL processor and model from the Hugging Face Hub.
processor = AutoProcessor.from_pretrained("guneetsk99/finance_qwen_VL_7B")
model = AutoModelForImageTextToText.from_pretrained(
    "guneetsk99/finance_qwen_VL_7B",
    torch_dtype="auto",   # load weights in the checkpoint's precision instead of fp32
    device_map="auto",    # place the 7B model on the available GPU(s); requires `accelerate`
)

|
def predict(input_img, text_prompt):
    # Build a chat-style message so the processor inserts the image placeholder
    # tokens that Qwen-VL checkpoints expect alongside the pixel values.
    messages = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text_prompt}]}
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(images=input_img, text=prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50)

    # Decode only the newly generated tokens, skipping the echoed prompt.
    generated_text = processor.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )
    return input_img, generated_text


gradio_app = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(label="Upload Image", sources=["upload"], type="pil"),
        gr.Textbox(label="Text Prompt", placeholder="Enter a text prompt, e.g., 'Describe this image.'"),
    ],
    outputs=[
        gr.Image(label="Uploaded Image"),
        gr.Textbox(label="Generated Response"),
    ],
    title="Finance Image-to-Text Model",
    description="Upload a financial document image and provide a text prompt; the model will read the image and generate a text response.",
)


if __name__ == "__main__":
    gradio_app.launch()
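

# Once the app is running, it can also be queried programmatically. This is a
# minimal sketch using the `gradio_client` package; the server address, the
# sample file name, and the prompt below are illustrative assumptions only.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860")
#   image, answer = client.predict(
#       handle_file("balance_sheet.png"),       # hypothetical sample document
#       "What is the total revenue reported?",  # hypothetical prompt
#       api_name="/predict",
#   )
#   print(answer)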