---
license: apache-2.0
base_model:
- Qwen/Qwen2-VL-2B-Instruct
---
## Requirements

This model is compatible with any ONNX runtime.
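As a quick sanity check (a minimal sketch, assuming the `onnxruntime` Python package is installed), you can list the execution providers your local runtime exposes before loading the model:

```python
import onnxruntime as ort

# Execution providers available in this onnxruntime build (CPU, CUDA, CoreML, ...).
print(ort.get_available_providers())

# A provider can be selected explicitly when creating a session, e.g.:
# session = ort.InferenceSession("QwenVL_E_q4f16.onnx", providers=["CPUExecutionProvider"])
```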
## Running this model

### Javascript
See https://huggingface.co/spaces/pdufour/Qwen2-VL-2B-Instruct-ONNX-Q4-F16 for a demo.
### Python

```python
import os
import sys
import time
import torch
import numpy as np
import requests
import onnxruntime as ort
from PIL import Image
from io import BytesIO
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer
# Constants
DEBUG = True
PRINT = print
# Try importing config, set defaults if not found
try:
    from export_config import (
        INPUT_IMAGE_SIZE,
        IMAGE_RESIZE,
        MAX_SEQ_LENGTH,
        HEIGHT_FACTOR,
        WIDTH_FACTOR
    )
except ImportError:
    INPUT_IMAGE_SIZE = [960, 960]
    HEIGHT_FACTOR = 10
    WIDTH_FACTOR = 10
    IMAGE_RESIZE = [HEIGHT_FACTOR * 28, WIDTH_FACTOR * 28]
    MAX_SEQ_LENGTH = 1024
# Command line arguments: path to the Hugging Face model and directory containing the ONNX files
model_path = sys.argv[1]
onnx_path = sys.argv[2]
# ONNX model paths
model_paths = {
    'A': os.path.join(onnx_path, 'QwenVL_A_q4f16.onnx'),
    'B': os.path.join(onnx_path, 'QwenVL_B_q4f16.onnx'),
    'C': os.path.join(onnx_path, 'QwenVL_C_q4f16.onnx'),
    'D': os.path.join(onnx_path, 'QwenVL_D_q4f16.onnx'),
    'E': os.path.join(onnx_path, 'QwenVL_E_q4f16.onnx')
}
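# Rough division of labour between the five sub-models, inferred from how each
# session is called below (the export itself does not document the names):
#   A - consumes the preprocessed image and produces image features
#   B - maps the token id buffer to hidden states (embeddings)
#   C - takes the scalar batch value and returns the form expected by the decoder
#   D - merges the text hidden states with the image features
#   E - runs one decoder step, returning the next token id and updated KV caches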
PRINT('\n[PATHS] ONNX model paths:')
for key, path in model_paths.items():
    PRINT(f"  Model {key}: {path}")
# Test image and prompt
TEST_IMAGE_URL = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg'
TEST_PROMPT = 'Describe this image.'
# Initialize model and tokenizer
with torch.inference_mode():
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.float32,
        device_map='mps',  # assumes Apple Silicon; use 'cpu' or 'cuda' on other hardware
        low_cpu_mem_usage=DEBUG
    )
max_length = MAX_SEQ_LENGTH
num_attention_heads = model.config.num_attention_heads
num_key_value_heads = model.config.num_key_value_heads
head_dim = model.config.hidden_size // num_attention_heads
num_layers = model.config.num_hidden_layers
hidden_size = model.config.hidden_size
MAX_ITERATIONS = 12
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=DEBUG)
# ONNX session options
session_options = ort.SessionOptions()
session_options.log_severity_level = 3
session_options.inter_op_num_threads = 0
session_options.intra_op_num_threads = 0
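# A value of 0 lets onnxruntime pick the thread counts automatically.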
session_options.enable_cpu_mem_arena = DEBUG
session_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
session_options.add_session_config_entry('session.intra_op.allow_spinning', '1')
session_options.add_session_config_entry('session.inter_op.allow_spinning', '1')
# Initialize ONNX sessions
sessions = {
    'A': ort.InferenceSession(model_paths['A'], sess_options=session_options),
    'B': ort.InferenceSession(model_paths['B'], sess_options=session_options),
    'C': ort.InferenceSession(model_paths['C'], sess_options=session_options),
    'D': ort.InferenceSession(model_paths['D'], sess_options=session_options),
    'E': ort.InferenceSession(model_paths['E'], sess_options=session_options)
}
# Get input/output names for each session
inputs = {
    'A': {
        'input': sessions['A'].get_inputs()[0].name,
        'output': sessions['A'].get_outputs()[0].name
    },
    'B': {
        'input_ids': sessions['B'].get_inputs()[0].name,
        'input_lengths': sessions['B'].get_inputs()[1].name,
        'output': sessions['B'].get_outputs()[0].name
    },
    'C': {
        'input': sessions['C'].get_inputs()[0].name,
        'output': sessions['C'].get_outputs()[0].name
    },
    'D': {
        'names': [inp.name for inp in sessions['D'].get_inputs()],
        'outputs': [out.name for out in sessions['D'].get_outputs()]
    },
    'E': {
        'names': [inp.name for inp in sessions['E'].get_inputs()],
        'outputs': [out.name for out in sessions['E'].get_outputs()]
    }
}
# Process image
response = requests.get(TEST_IMAGE_URL)
image = Image.open(BytesIO(response.content))
image = image.resize((INPUT_IMAGE_SIZE[1], INPUT_IMAGE_SIZE[0]))
if image.mode != 'RGB':
    image = image.convert('RGB')
image_array = np.transpose(np.array(image).astype(np.float32), (2, 0, 1))
image_array = np.expand_dims(image_array, axis=0) / 255.
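# image_array is now NCHW float32 with values in [0, 1] and a batch dimension of 1.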
use_images = DEBUG
prompt = f"\n<|im_start|>user\n<|vision_start|><|vision_end|>{TEST_PROMPT}<|im_end|>\n<|im_start|>assistant\n"
eos_token_id = np.array([5], dtype=np.int64)
total_ids = WIDTH_FACTOR * HEIGHT_FACTOR
# Initialize tensors
input_ids = tokenizer(prompt, return_tensors='pt')['input_ids']
input_lengths = np.array([input_ids.shape[1]], dtype=np.int64)
tokens = np.zeros(max_length, dtype=np.int32)
tokens[:input_lengths[0]] = input_ids[0, :]
position = np.zeros(1, dtype=np.int64)
# Initialize cache tensors
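# Pre-allocated decoder caches: one fp16 buffer of shape
# (num_layers, num_key_value_heads, max_length, head_dim) for keys and one for values.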
key_cache = np.zeros((num_layers, num_key_value_heads, max_length, head_dim), dtype=np.float16)
value_cache = key_cache.copy()
logits_mask = np.array([-65504.], dtype=np.float16)
position_mask = np.array([.0], dtype=np.float16)
max_total_tokens = 1 - total_ids + WIDTH_FACTOR
batch_size = np.array(0, dtype=np.int32)
# Process initial inputs
hidden_states = sessions['B'].run(
    [inputs['B']['output']],
    {inputs['B']['input_ids']: tokens, inputs['B']['input_lengths']: input_lengths}
)[0]
batch_size, = sessions['C'].run([inputs['C']['output']], {inputs['C']['input']: batch_size})
if use_images:
    image_features = sessions['A'].run([inputs['A']['output']], {inputs['A']['input']: image_array})[0]
    input_lengths += total_ids
    remaining_tokens = np.array(max_length - input_lengths[0] - total_ids, dtype=np.int32)
    tokens_to_stop = np.array(input_lengths[0] - eos_token_id[0], dtype=np.int32)
    hidden_states, batch_size = sessions['D'].run(
        [inputs['D']['outputs'][0], inputs['D']['outputs'][1]],
        {
            inputs['D']['names'][0]: hidden_states,
            inputs['D']['names'][1]: image_features,
            inputs['D']['names'][2]: input_lengths,
            inputs['D']['names'][3]: tokens_to_stop,
            inputs['D']['names'][4]: remaining_tokens
        }
    )
start_time = time.time()
iterations = 0
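# Autoregressive decoding: each iteration runs decoder model E once, which returns the
# next token id together with updated key/value caches, then re-embeds that token with
# model B for the following step.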
while (iterations < MAX_ITERATIONS) & (position < max_length):
    token, key_cache, value_cache = sessions['E'].run(
        [inputs['E']['outputs'][0], inputs['E']['outputs'][1], inputs['E']['outputs'][2]],
        {
            inputs['E']['names'][0]: hidden_states,
            inputs['E']['names'][1]: logits_mask,
            inputs['E']['names'][2]: key_cache,
            inputs['E']['names'][3]: value_cache,
            inputs['E']['names'][4]: position,
            inputs['E']['names'][5]: input_lengths,
            inputs['E']['names'][6]: batch_size,
            inputs['E']['names'][7]: position_mask
        }
    )
    if (token == 151643) | (token == 151645):  # <|endoftext|> or <|im_end|>: stop generating
        break
    else:
        iterations += 1
        if iterations < 2:
            # After the prefill step, switch to decoding one token at a time
            position += input_lengths[0]
            input_lengths[0] = 1
            logits_mask = np.array([.0], dtype=np.float16)
            if use_images:
                position_mask = np.array(max_total_tokens + input_lengths[0], dtype=np.float16)
            else:
                position_mask = np.array(position[0] + 1, dtype=np.float16)
        else:
            position += 1
            position_mask += 1
        tokens[0] = token
        hidden_states = sessions['B'].run(
            [inputs['B']['output']],
            {inputs['B']['input_ids']: tokens, inputs['B']['input_lengths']: input_lengths}
        )[0]
        decoded_token = tokenizer.decode(token)
        PRINT(f"Decoded token: {decoded_token}")
        PRINT(decoded_token, end='', flush=DEBUG)
total_time = time.time() - start_time
PRINT(f"\n[TIMING] Generated {iterations} tokens in {total_time:.2f} s")
```
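Assuming the script above is saved as, say, `run_qwen2vl_onnx.py` (a hypothetical name), it expects two positional arguments: the path to the original Hugging Face model (the script reads its config and tokenizer) and the directory containing the five `QwenVL_*_q4f16.onnx` files, e.g. `python run_qwen2vl_onnx.py ./Qwen2-VL-2B-Instruct ./onnx`. Note that the example loads the base model with `device_map='mps'`, which assumes Apple Silicon; adjust it for your hardware.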