---
license: apache-2.0
base_model:
- Qwen/Qwen2-VL-2B-Instruct
---

# Requirements

This model works with any ONNX-compatible runtime.

# Running this model

**JavaScript**

See https://huggingface.co/spaces/pdufour/Qwen2-VL-2B-Instruct-ONNX-Q4-F16 for a demo.

**Python**

```
import os
import sys
import time
import torch
import numpy as np
import requests
import onnxruntime as ort
from PIL import Image
from io import BytesIO
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer

# Constants
DEBUG = True
PRINT = print

# Try importing the export config, fall back to defaults if it is not found
try:
    from export_config import (
        INPUT_IMAGE_SIZE,
        IMAGE_RESIZE,
        MAX_SEQ_LENGTH,
        HEIGHT_FACTOR,
        WIDTH_FACTOR
    )
except ImportError:
    INPUT_IMAGE_SIZE = [960, 960]
    HEIGHT_FACTOR = 10
    WIDTH_FACTOR = 10
    IMAGE_RESIZE = [HEIGHT_FACTOR * 28, WIDTH_FACTOR * 28]
    MAX_SEQ_LENGTH = 1024

# Command line arguments: path to the original model (for config and tokenizer) and to the ONNX files
model_path = sys.argv[1]
onnx_path = sys.argv[2]

# ONNX model paths
model_paths = {
    'A': os.path.join(onnx_path, 'QwenVL_A_q4f16.onnx'),
    'B': os.path.join(onnx_path, 'QwenVL_B_q4f16.onnx'),
    'C': os.path.join(onnx_path, 'QwenVL_C_q4f16.onnx'),
    'D': os.path.join(onnx_path, 'QwenVL_D_q4f16.onnx'),
    'E': os.path.join(onnx_path, 'QwenVL_E_q4f16.onnx')
}

PRINT('\n[PATHS] ONNX model paths:')
for key, path in model_paths.items():
    PRINT(f"  Model {key}: {path}")

# Test image and prompt
TEST_IMAGE_URL = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg'
TEST_PROMPT = 'Describe this image.'

# Initialize model and tokenizer (the torch model is only used to read its config)
with torch.inference_mode():
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.float32,
        device_map='mps',  # change to 'cpu' or 'cuda' if you are not on Apple Silicon
        low_cpu_mem_usage=DEBUG
    )
    max_length = MAX_SEQ_LENGTH
    num_attention_heads = model.config.num_attention_heads
    num_key_value_heads = model.config.num_key_value_heads
    head_dim = model.config.hidden_size // num_attention_heads
    num_layers = model.config.num_hidden_layers
    hidden_size = model.config.hidden_size
    MAX_ITERATIONS = 12
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=DEBUG)

# ONNX session options
session_options = ort.SessionOptions()
session_options.log_severity_level = 3
session_options.inter_op_num_threads = 0
session_options.intra_op_num_threads = 0
session_options.enable_cpu_mem_arena = DEBUG
session_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
session_options.add_session_config_entry('session.intra_op.allow_spinning', '1')
session_options.add_session_config_entry('session.inter_op.allow_spinning', '1')

# Initialize ONNX sessions
sessions = {
    'A': ort.InferenceSession(model_paths['A'], sess_options=session_options),
    'B': ort.InferenceSession(model_paths['B'], sess_options=session_options),
    'C': ort.InferenceSession(model_paths['C'], sess_options=session_options),
    'D': ort.InferenceSession(model_paths['D'], sess_options=session_options),
    'E': ort.InferenceSession(model_paths['E'], sess_options=session_options)
}
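
# Note (added for clarity): the roles below are inferred from how the five
# graphs are wired in this script, not from official documentation.
#   A - vision encoder: preprocessed image tensor -> image features
#   B - embedding: token id buffer + current sequence length -> hidden states
#   C - small helper graph whose output is reused as the batch_size input of E
#   D - merges the image features into the text hidden states before decoding
#   E - one decoder step: hidden states + key/value caches + position -> next token id and updated caches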

# Get input/output names for each session
inputs = {
    'A': {
        'input': sessions['A'].get_inputs()[0].name,
        'output': sessions['A'].get_outputs()[0].name
    },
    'B': {
        'input_ids': sessions['B'].get_inputs()[0].name,
        'input_lengths': sessions['B'].get_inputs()[1].name,
        'output': sessions['B'].get_outputs()[0].name
    },
    'C': {
        'input': sessions['C'].get_inputs()[0].name,
        'output': sessions['C'].get_outputs()[0].name
    },
    'D': {
        'names': [inp.name for inp in sessions['D'].get_inputs()],
        'outputs': [out.name for out in sessions['D'].get_outputs()]
    },
    'E': {
        'names': [inp.name for inp in sessions['E'].get_inputs()],
        'outputs': [out.name for out in sessions['E'].get_outputs()]
    }
}

# Process image: download, resize to INPUT_IMAGE_SIZE, convert to CHW float32 in [0, 1]
response = requests.get(TEST_IMAGE_URL)
image = Image.open(BytesIO(response.content))
image = image.resize((INPUT_IMAGE_SIZE[1], INPUT_IMAGE_SIZE[0]))
if image.mode != 'RGB':
    image = image.convert('RGB')
image_array = np.transpose(np.array(image).astype(np.float32), (2, 0, 1))
image_array = np.expand_dims(image_array, axis=0) / 255.

use_images = DEBUG
prompt = f"\n<|im_start|>user\n<|vision_start|><|vision_end|>{TEST_PROMPT}<|im_end|>\n<|im_start|>assistant\n"
eos_token_id = np.array([5], dtype=np.int64)
total_ids = WIDTH_FACTOR * HEIGHT_FACTOR  # number of vision placeholder positions

# Initialize tensors
input_ids = tokenizer(prompt, return_tensors='pt')['input_ids']
input_lengths = np.array([input_ids.shape[1]], dtype=np.int64)
tokens = np.zeros(max_length, dtype=np.int32)
tokens[:input_lengths[0]] = input_ids[0, :]
position = np.zeros(1, dtype=np.int64)

# Initialize key/value cache tensors
key_cache = np.zeros((num_layers, num_key_value_heads, max_length, head_dim), dtype=np.float16)
value_cache = key_cache.copy()
logits_mask = np.array([-65504.], dtype=np.float16)
position_mask = np.array([.0], dtype=np.float16)
max_total_tokens = 1 - total_ids + WIDTH_FACTOR
batch_size = np.array(0, dtype=np.int32)

# Process initial inputs: embed the prompt and, if enabled, merge in the image features
hidden_states = sessions['B'].run(
    [inputs['B']['output']],
    {inputs['B']['input_ids']: tokens, inputs['B']['input_lengths']: input_lengths}
)[0]
batch_size, = sessions['C'].run([inputs['C']['output']], {inputs['C']['input']: batch_size})

if use_images:
    image_features = sessions['A'].run([inputs['A']['output']], {inputs['A']['input']: image_array})[0]
    input_lengths += total_ids
    remaining_tokens = np.array(max_length - input_lengths[0] - total_ids, dtype=np.int32)
    tokens_to_stop = np.array(input_lengths[0] - eos_token_id[0], dtype=np.int32)
    hidden_states, batch_size = sessions['D'].run(
        [inputs['D']['outputs'][0], inputs['D']['outputs'][1]],
        {
            inputs['D']['names'][0]: hidden_states,
            inputs['D']['names'][1]: image_features,
            inputs['D']['names'][2]: input_lengths,
            inputs['D']['names'][3]: tokens_to_stop,
            inputs['D']['names'][4]: remaining_tokens
        }
    )

# Autoregressive decoding loop
start_time = time.time()
iterations = 0
while iterations < MAX_ITERATIONS and position[0] < max_length:
    token, key_cache, value_cache = sessions['E'].run(
        [inputs['E']['outputs'][0], inputs['E']['outputs'][1], inputs['E']['outputs'][2]],
        {
            inputs['E']['names'][0]: hidden_states,
            inputs['E']['names'][1]: logits_mask,
            inputs['E']['names'][2]: key_cache,
            inputs['E']['names'][3]: value_cache,
            inputs['E']['names'][4]: position,
            inputs['E']['names'][5]: input_lengths,
            inputs['E']['names'][6]: batch_size,
            inputs['E']['names'][7]: position_mask
        }
    )

    # 151643 is <|endoftext|> and 151645 is <|im_end|> in the Qwen2 tokenizer
    if (token == 151643) | (token == 151645):
        break

    iterations += 1
    if iterations < 2:
        # After the prefill step, switch to decoding a single token per iteration
        position += input_lengths[0]
        input_lengths[0] = 1
        logits_mask = np.array([.0], dtype=np.float16)
        if use_images:
            position_mask = np.array(max_total_tokens + input_lengths[0], dtype=np.float16)
        else:
            position_mask = np.array(position[0] + 1, dtype=np.float16)
    else:
        position += 1
        position_mask += 1

    # Re-embed the new token and feed it back into the decoder
    tokens[0] = token
    hidden_states = sessions['B'].run(
        [inputs['B']['output']],
        {inputs['B']['input_ids']: tokens, inputs['B']['input_lengths']: input_lengths}
    )[0]
    decoded_token = tokenizer.decode(token)
    PRINT(decoded_token, end='', flush=DEBUG)

total_time = time.time() - start_time
PRINT(f"\nGenerated {iterations} tokens in {total_time:.2f} seconds")
```

# Technical Information

- [EXPORT.md](EXPORT.md)
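
If you want to check each sub-graph's exact input and output signatures in your own environment before wiring them up, onnxruntime can list them directly. The snippet below is a small illustrative helper, not part of this repository; it assumes the five `QwenVL_*_q4f16.onnx` files sit in a directory passed as the first command-line argument.

```
import os
import sys
import onnxruntime as ort

onnx_dir = sys.argv[1]  # directory containing the QwenVL_*_q4f16.onnx files

for key in ('A', 'B', 'C', 'D', 'E'):
    session = ort.InferenceSession(os.path.join(onnx_dir, f'QwenVL_{key}_q4f16.onnx'))
    print(f"Model {key}")
    for tensor in session.get_inputs():
        print(f"  input : {tensor.name} {tensor.shape} {tensor.type}")
    for tensor in session.get_outputs():
        print(f"  output: {tensor.name} {tensor.shape} {tensor.type}")
```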