ONNX Demo code?
I'm trying to use the ONNX models, and I've getting bad output. Is there any working demo code available?
This is the code I'm using:
import onnxruntime as ort
from PIL import Image
import requests
import numpy as np
from transformers import AutoTokenizer, AutoProcessor
# Load the tokenizer and processor
tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
# Download and load the image
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
image = Image.open(requests.get(url, stream=True).raw)
# Preprocess the image
image = image.resize((384, 384)) # Adjust size as needed
image_array = np.array(image).astype(np.float32) / 255.0
image_array = np.transpose(image_array, (2, 0, 1)) # CHW format
image_array = np.expand_dims(image_array, axis=0) # Add batch dimension
# Load ONNX model
vision_encoder_session = ort.InferenceSession("/Users/jameskelly/Downloads/vision_encoder.onnx")
decoder_session = ort.InferenceSession("/Users/jameskelly/Downloads/decoder_model_merged.onnx")
embed_tokens_session = ort.InferenceSession("/Users/jameskelly/Downloads/embed_tokens.onnx")
# Run vision encoder
vision_input_name = vision_encoder_session.get_inputs()[0].name
vision_output_name = vision_encoder_session.get_outputs()[0].name
vision_features = vision_encoder_session.run([vision_output_name], {vision_input_name: image_array})[0]
# Prepare text input
conversation = [
{
"role": "system",
"content": "You are a helpful assistant that answers questions about images."
},
{
"role": "user",
"content": [
{"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
{"type": "image"},
],
},
]
# Apply chat template
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="np", padding=True, truncation=True)
input_ids = inputs.input_ids
attention_mask = inputs.attention_mask
# Prepare inputs
sequence_length = input_ids.shape[1]
batch_size = 1
num_layers = 24
head_dim = 64
num_heads = 16
past_sequence_length = 0 # Set to 0 for the initial pass
# Attention mask
attention_mask = np.ones((batch_size, past_sequence_length + sequence_length), dtype=np.int64)
# Position IDs
position_ids = np.arange(sequence_length, dtype=np.int64).reshape(1, -1)
# Past Key Values
past_key_values = {
f"past_key_values.{i}.key": np.zeros((batch_size, num_heads, past_sequence_length, head_dim), dtype=np.float32)
for i in range(num_layers)
}
past_key_values.update({
f"past_key_values.{i}.value": np.zeros((batch_size, num_heads, past_sequence_length, head_dim), dtype=np.float32)
for i in range(num_layers)
})
# Run embed tokens
embed_input_name = embed_tokens_session.get_inputs()[0].name
embed_output_name = embed_tokens_session.get_outputs()[0].name
token_embeddings = embed_tokens_session.run([embed_output_name], {embed_input_name: input_ids})[0]
# Combine token embeddings and vision features
combined_embeddings = np.concatenate([token_embeddings, vision_features], axis=1)
# Update attention_mask and position_ids
combined_length = combined_embeddings.shape[1]
attention_mask = np.ones((batch_size, combined_length), dtype=np.int64)
position_ids = np.arange(combined_length, dtype=np.int64).reshape(1, -1)
# Combine all inputs
decoder_inputs = {
"attention_mask": attention_mask,
"position_ids": position_ids,
"inputs_embeds": combined_embeddings,
**past_key_values
}
# Print input shapes
for name, value in decoder_inputs.items():
print(f"{name} shape: {value.shape}")
# Run the decoder
decoder_input_names = [input.name for input in decoder_session.get_inputs()]
decoder_output_name = decoder_session.get_outputs()[0].name
outputs = decoder_session.run([decoder_output_name], {name: decoder_inputs[name] for name in decoder_input_names if name in decoder_inputs})[0]
# ... (previous code remains the same until after the decoder run)
print(f"Outputs shape: {outputs.shape}")
print(f"Outputs type: {outputs.dtype}")
# Print input token IDs
print(f"Input token IDs: {input_ids[0].tolist()}")
# Process outputs (decode tokens to text)
generated_tokens = []
eos_token_id = tokenizer.eos_token_id
max_new_tokens = 50
for i in range(max_new_tokens):
logits = outputs[0, i]
token_id = np.argmax(logits)
if token_id == eos_token_id:
break
generated_tokens.append(token_id)
# Print top 5 probable tokens for each step
top_tokens = np.argsort(logits)[-5:][::-1]
print(f"Step {i+1}: Top 5 tokens: {[(t, tokenizer.decode([t]), logits[t]) for t in top_tokens]}")
# Prepare input for next token generation
new_input_embeds = embed_tokens_session.run([embed_output_name], {embed_input_name: np.array([[token_id]])})[0]
combined_embeddings = np.concatenate([combined_embeddings, new_input_embeds], axis=1)
attention_mask = np.ones((1, combined_embeddings.shape[1]), dtype=np.int64)
position_ids = np.arange(combined_embeddings.shape[1], dtype=np.int64).reshape(1, -1)
decoder_inputs = {
"attention_mask": attention_mask,
"position_ids": position_ids,
"inputs_embeds": combined_embeddings,
**past_key_values
}
outputs = decoder_session.run([decoder_output_name], {name: decoder_inputs[name] for name in decoder_input_names if name in decoder_inputs})[0]
# Convert to list of integers
token_ids = [int(token) for token in generated_tokens]
print(f"Generated token IDs: {token_ids}")
# Decode tokens one by one
decoded_tokens = [tokenizer.decode([token]) for token in token_ids]
print(f"Decoded tokens: {decoded_tokens}")
# Full decoded output
decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
print(f"Full decoded output: {decoded_output}")
The final output contains Chinese characters "可以通过 plea".
I assume I'm doing something wrong, any help would be greatly appreciated.
cc @Xenova who added ONNX converted weights
Bro How To Use ONNX Can You Plz Explain Or Provide A code
@james-passio looks like the past key values aren't being updated - maybe something to look into.
Thanks @Xenova ,do you have a working example of how to use these ONNX models? That would be really useful
All usage I've worked on is via Transformers.js (i.e., in JavaScript). Here is a demo space I created for it: https://huggingface.co/spaces/llava-hf/llava-webgpu
and the code snippet will be very similar to https://huggingface.co/onnx-community/nanoLLaVA-1.5 (see README).
Feel free to look at the Transformers.js source code and adapt it into a python version: http://github.com/xenova/transformers.js :) The above looks very close, you just need to:
(1) Update the PKVs after generation
(2) Update the attention mask and position IDs