Spaces:

hgdgng
/

HG_Llama3.2

Runtime error

File size: 1,607 Bytes

d989353
 
 
903f1a6
d0df95e
903f1a6
d989353
08c738e
903f1a6
 
d989353
903f1a6
 
d989353
903f1a6
 
d989353
d0df95e
903f1a6
d989353
903f1a6
 
d989353
d0df95e
903f1a6
d989353
 
903f1a6
 
d989353
 
903f1a6
 
d989353
903f1a6
 
d989353
4bc7f77
903f1a6

"""Describe a sample image with Llama 3.2 Vision Instruct.

The script loads the model and its processor, downloads one image, asks the
model for a one-sentence description, and prints only the newly generated
tokens (the prompt is stripped from the output).
"""

from io import BytesIO

import requests
import torch
from PIL import Image
# Llama 3.2 Vision uses the "Mllama" architecture; transformers has no
# `LlamaForConditionalGeneration` class, so importing it fails at startup.
from transformers import AutoProcessor, MllamaForConditionalGeneration

# Define the model ID, replace with the correct ID if needed
# NOTE(review): this is a gated repository -- presumably the runtime needs an
# authenticated Hugging Face token with the license accepted; confirm.
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Load the model in bfloat16 or float16 if needed
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # Change to torch.float16 if hardware doesn't support bfloat16
    device_map="auto",  # Automatically selects the appropriate device
)

# Load the processor
processor = AutoProcessor.from_pretrained(model_id)

# Define an image URL
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"

# Fetch the image using requests. A timeout keeps the script from hanging on
# a stalled connection, and raise_for_status() surfaces HTTP errors here
# instead of as an obscure "cannot identify image file" error from PIL.
response = requests.get(url, timeout=30)
response.raise_for_status()
image = Image.open(BytesIO(response.content))

# Define the messages in a format the model understands (adjust as needed)
messages = [
    {"role": "user", "content": [
        {"type": "image"},  # This indicates that the input contains an image
        {"type": "text", "text": "Can you please describe this image in one sentence?"}
    ]}
]

# Generate input text with the processor
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

# Process the image and input text, prepare them for the model.
# The chat template already emitted the special tokens, so the tokenizer must
# not add them a second time (that would duplicate the begin-of-text token).
inputs = processor(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
).to(model.device)

# Run the model to generate a response; no gradients are needed for inference
with torch.inference_mode():
    output = model.generate(**inputs, max_new_tokens=70)

# Decode and print the output, skipping the prompt tokens at the front
prompt_length = inputs["input_ids"].shape[-1]
print(processor.decode(output[0][prompt_length:]))