|
import torch |
|
import transformers |
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
from PIL import Image |
|
import requests |
|
import gradio as gr |
|
import spaces |
|
|
|
|
|
# Hugging Face Hub repository id of the checkpoint; used for both the model
# weights and the tokenizer.
model_name = "scb10x/llama-3-typhoon-v1.5-8b-instruct-vision-preview"
|
|
|
@spaces.GPU(duration=120)
def load_model():
    """Load the causal-LM checkpoint identified by ``model_name``.

    The weights are requested as ``torch.float16`` with
    ``device_map='auto'``; ``trust_remote_code=True`` permits the
    repository's own modelling code to run.

    Returns:
        The object returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    # NOTE(review): this loader is wrapped in ``spaces.GPU`` and is called
    # once at import time -- confirm that this is the intended ZeroGPU usage
    # (the decorator is also applied to the inference function).
    load_options = {
        'torch_dtype': torch.float16,
        'device_map': 'auto',
        'trust_remote_code': True,
    }
    return AutoModelForCausalLM.from_pretrained(model_name, **load_options)
|
|
|
# Instantiate the model and its tokenizer once at import time; the request
# handlers below read these module-level objects.
model = load_model()
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
|
|
|
def prepare_inputs(text, image):
    """Build the tokenised chat prompt for one question about an image.

    Args:
        text: The user's question. It is placed on the line after an
            ``<image>`` placeholder in the user message.
        image: Accepted for call-site compatibility but not used here; the
            caller passes the picture to ``model.generate`` separately.

    Returns:
        The tensor produced by ``tokenizer.apply_chat_template`` with
        ``return_tensors="pt"``, moved to ``model.device``.
    """
    messages = [
        {"role": "system", "content": "You are a helpful vision-capable assistant who eagerly converses with the user in their language."},
        {"role": "user", "content": f"<image>\n{text}"},
    ]
    # NOTE(review): whether the tokenizer maps the literal "<image>" text to
    # the placeholder id the model's remote code expects is not visible from
    # this file -- confirm against the model card.
    #
    # add_generation_prompt=True appends the tokens that open the assistant
    # turn, so generation produces a reply instead of continuing the user's
    # message.
    prompt_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    return prompt_ids.to(model.device)
|
|
|
def _fetch_image(url, timeout=15):
    """Download ``url`` and return its contents as an RGB ``PIL.Image``."""
    # Imported locally so this block does not depend on a change to the
    # file's import header.
    from io import BytesIO

    # NOTE(security): ``url`` comes straight from the UI, so the server will
    # fetch whatever address the user supplies. Restrict the allowed hosts if
    # the app is exposed to untrusted users.
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of handing an HTML error page to PIL.
    response.raise_for_status()
    # ``response.content`` is the fully downloaded, content-decoded body;
    # ``response.raw`` would bypass gzip/deflate decoding and keep the
    # connection open.
    return Image.open(BytesIO(response.content)).convert('RGB')


@spaces.GPU(duration=60)
def predict(prompt, img_url):
    """Answer ``prompt`` about the image found at ``img_url``.

    Args:
        prompt: The question to ask about the image.
        img_url: HTTP(S) address of the image to analyse.

    Returns:
        The decoded model reply as a string. If anything fails (download,
        image decoding, generation) the exception message is returned
        instead, so the UI shows the problem rather than crashing.
    """
    if not img_url or not img_url.strip():
        return "Please enter an image URL."

    try:
        image = _fetch_image(img_url.strip())
        side = model.config.image_size
        image = image.resize((side, side))
        image_tensor = model.preprocess_images([image]).to(model.device)

        inputs = prepare_inputs(prompt, image)

        output_ids = model.generate(
            inputs,
            images=image_tensor,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.2,
            top_p=0.2,
            repetition_penalty=1.0,
        )[0]

        # Drop the first ``inputs.shape[1]`` ids; this assumes generate()
        # returns the prompt tokens followed by the newly generated ones.
        prompt_length = inputs.shape[1]
        reply_ids = output_ids[prompt_length:]
        return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()
    except Exception as e:
        # Deliberately broad: this is the UI boundary, and the message is
        # shown to the user in the output box.
        return str(e)
|
|
|
|
|
# Two free-text fields: the question to ask and the address of the image.
inputs = [
    gr.Textbox(placeholder="Ask about the food in the image", label="Prompt"),
    gr.Textbox(placeholder="Enter an image URL", label="Image URL"),
]
# Single text field that displays whatever ``predict`` returns.
outputs = gr.Textbox(label="Generated Output")

# Wire ``predict`` to the components above and start the web app.
demo = gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs=outputs,
    title="Food Image AI Assistant",
    description="This model can analyze food images and answer questions about them.",
)
demo.launch()