---
license: apache-2.0
base_model:
- Qwen/Qwen2-VL-2B-Instruct
---
# Requirements
This model export is compatible with any ONNX runtime.
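
As a quick sanity check, any of the exported graphs can be opened directly with `onnxruntime` to confirm it loads and to inspect its tensor names. This is a minimal sketch; it assumes the ONNX files (named as in the full example below) sit in the current directory.

```python
# Load one exported graph and list its inputs/outputs.
# Assumes QwenVL_A_q4f16.onnx has been downloaded to the working directory.
import onnxruntime as ort

session = ort.InferenceSession("QwenVL_A_q4f16.onnx")
for tensor in session.get_inputs():
    print("input: ", tensor.name, tensor.shape, tensor.type)
for tensor in session.get_outputs():
    print("output:", tensor.name, tensor.shape, tensor.type)
```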

# Running this model

**JavaScript**

See https://huggingface.co/spaces/pdufour/Qwen2-VL-2B-Instruct-ONNX-Q4-F16 for a demo.


**Python**

```python
import os
import sys
import time
import torch
import numpy as np
import requests
import onnxruntime as ort
from PIL import Image
from io import BytesIO
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer

# Constants
DEBUG = True
PRINT = print

# Try importing config, set defaults if not found
try:
    from export_config import (
        INPUT_IMAGE_SIZE,
        IMAGE_RESIZE,
        MAX_SEQ_LENGTH,
        HEIGHT_FACTOR,
        WIDTH_FACTOR
    )
except ImportError:
    INPUT_IMAGE_SIZE = [960, 960]
    HEIGHT_FACTOR = 10
    WIDTH_FACTOR = 10
    IMAGE_RESIZE = [HEIGHT_FACTOR * 28, WIDTH_FACTOR * 28]
    MAX_SEQ_LENGTH = 1024

# Command line arguments
model_path = sys.argv[1]
onnx_path = sys.argv[2]

# ONNX model paths
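# (Roles inferred from how each graph is used below: A encodes the image,
#  B embeds token ids, C produces the scalar state consumed by the decoder,
#  D merges the image features into the text hidden states, and E runs one
#  decoder step, returning the next token and updated key/value caches.)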
model_paths = {
    'A': os.path.join(onnx_path, 'QwenVL_A_q4f16.onnx'),
    'B': os.path.join(onnx_path, 'QwenVL_B_q4f16.onnx'),
    'C': os.path.join(onnx_path, 'QwenVL_C_q4f16.onnx'),
    'D': os.path.join(onnx_path, 'QwenVL_D_q4f16.onnx'),
    'E': os.path.join(onnx_path, 'QwenVL_E_q4f16.onnx')
}

PRINT('\n[PATHS] ONNX model paths:')
for key, path in model_paths.items():
    PRINT(f"  Model {key}: {path}")

# Test image and prompt
TEST_IMAGE_URL = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg'
TEST_PROMPT = 'Describe this image.'

# Initialize model and tokenizer
with torch.inference_mode():
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.float32,
        device_map='mps',  # change to 'cpu' or 'cuda' if MPS is not available
        low_cpu_mem_usage=DEBUG
    )

    max_length = MAX_SEQ_LENGTH
    num_attention_heads = model.config.num_attention_heads
    num_key_value_heads = model.config.num_key_value_heads
    head_dim = model.config.hidden_size // num_attention_heads
    num_layers = model.config.num_hidden_layers
    hidden_size = model.config.hidden_size

MAX_ITERATIONS = 12
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=DEBUG)

# ONNX session options
session_options = ort.SessionOptions()
session_options.log_severity_level = 3
session_options.inter_op_num_threads = 0
session_options.intra_op_num_threads = 0
session_options.enable_cpu_mem_arena = DEBUG
session_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
session_options.add_session_config_entry('session.intra_op.allow_spinning', '1')
session_options.add_session_config_entry('session.inter_op.allow_spinning', '1')

# Initialize ONNX sessions
sessions = {
    'A': ort.InferenceSession(model_paths['A'], sess_options=session_options),
    'B': ort.InferenceSession(model_paths['B'], sess_options=session_options),
    'C': ort.InferenceSession(model_paths['C'], sess_options=session_options),
    'D': ort.InferenceSession(model_paths['D'], sess_options=session_options),
    'E': ort.InferenceSession(model_paths['E'], sess_options=session_options)
}

# Get input/output names for each session
inputs = {
    'A': {
        'input': sessions['A'].get_inputs()[0].name,
        'output': sessions['A'].get_outputs()[0].name
    },
    'B': {
        'input_ids': sessions['B'].get_inputs()[0].name,
        'input_lengths': sessions['B'].get_inputs()[1].name,
        'output': sessions['B'].get_outputs()[0].name
    },
    'C': {
        'input': sessions['C'].get_inputs()[0].name,
        'output': sessions['C'].get_outputs()[0].name
    },
    'D': {
        'names': [inp.name for inp in sessions['D'].get_inputs()],
        'outputs': [out.name for out in sessions['D'].get_outputs()]
    },
    'E': {
        'names': [inp.name for inp in sessions['E'].get_inputs()],
        'outputs': [out.name for out in sessions['E'].get_outputs()]
    }
}

# Process image
response = requests.get(TEST_IMAGE_URL)
image = Image.open(BytesIO(response.content))
image = image.resize((INPUT_IMAGE_SIZE[1], INPUT_IMAGE_SIZE[0]))
if image.mode != 'RGB':
    image = image.convert('RGB')

image_array = np.transpose(np.array(image).astype(np.float32), (2, 0, 1))
image_array = np.expand_dims(image_array, axis=0) / 255.

use_images = DEBUG
prompt = f"\n<|im_start|>user\n<|vision_start|><|vision_end|>{TEST_PROMPT}<|im_end|>\n<|im_start|>assistant\n"
eos_token_id = np.array([5], dtype=np.int64)
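# total_ids: number of vision tokens the image adds to the sequence
# (one per 28x28 patch of the resized image, i.e. HEIGHT_FACTOR * WIDTH_FACTOR)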
total_ids = WIDTH_FACTOR * HEIGHT_FACTOR

# Initialize tensors
input_ids = tokenizer(prompt, return_tensors='pt')['input_ids']
input_lengths = np.array([input_ids.shape[1]], dtype=np.int64)
tokens = np.zeros(max_length, dtype=np.int32)
tokens[:input_lengths[0]] = input_ids[0, :]
position = np.zeros(1, dtype=np.int64)

# Initialize cache tensors
key_cache = np.zeros((num_layers, num_key_value_heads, max_length, head_dim), dtype=np.float16)
value_cache = key_cache.copy()
logits_mask = np.array([-65504.], dtype=np.float16)
position_mask = np.array([.0], dtype=np.float16)
max_total_tokens = 1 - total_ids + WIDTH_FACTOR
batch_size = np.array(0, dtype=np.int32)

# Process initial inputs
hidden_states = sessions['B'].run([inputs['B']['output']],
    {inputs['B']['input_ids']: tokens, inputs['B']['input_lengths']: input_lengths})[0]
batch_size, = sessions['C'].run([inputs['C']['output']], {inputs['C']['input']: batch_size})

if use_images:
    image_features = sessions['A'].run([inputs['A']['output']], {inputs['A']['input']: image_array})[0]
    input_lengths += total_ids
    remaining_tokens = np.array(max_length - input_lengths[0] - total_ids, dtype=np.int32)
    tokens_to_stop = np.array(input_lengths[0] - eos_token_id[0], dtype=np.int32)
    hidden_states, batch_size = sessions['D'].run(
        [inputs['D']['outputs'][0], inputs['D']['outputs'][1]],
        {
            inputs['D']['names'][0]: hidden_states,
            inputs['D']['names'][1]: image_features,
            inputs['D']['names'][2]: input_lengths,
            inputs['D']['names'][3]: tokens_to_stop,
            inputs['D']['names'][4]: remaining_tokens
        }
    )

start_time = time.time()
iterations = 0

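# Generation loop: the first pass handles the full prompt (prefill); afterwards
# input_lengths is reset to 1 and only the newly generated token is re-embedded.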
while (iterations < MAX_ITERATIONS) & (position < max_length):
    token, key_cache, value_cache = sessions['E'].run(
        [inputs['E']['outputs'][0], inputs['E']['outputs'][1], inputs['E']['outputs'][2]],
        {
            inputs['E']['names'][0]: hidden_states,
            inputs['E']['names'][1]: logits_mask,
            inputs['E']['names'][2]: key_cache,
            inputs['E']['names'][3]: value_cache,
            inputs['E']['names'][4]: position,
            inputs['E']['names'][5]: input_lengths,
            inputs['E']['names'][6]: batch_size,
            inputs['E']['names'][7]: position_mask
        }
    )

    if (token == 151643) | (token == 151645):
        break
    else:
        iterations += 1
        if iterations < 2:
            position += input_lengths[0]
            input_lengths[0] = 1
            logits_mask = np.array([.0], dtype=np.float16)
            if use_images:
                position_mask = np.array(max_total_tokens + input_lengths[0], dtype=np.float16)
            else:
                position_mask = np.array(position[0] + 1, dtype=np.float16)
        else:
            position += 1
            position_mask += 1

        tokens[0] = token
        hidden_states = sessions['B'].run(
            [inputs['B']['output']],
            {inputs['B']['input_ids']: tokens, inputs['B']['input_lengths']: input_lengths}
        )[0]
        decoded_token = tokenizer.decode(token)
        PRINT(decoded_token, end='', flush=True)

total_time = time.time() - start_time
PRINT(f"\nGenerated {iterations} tokens in {total_time:.2f}s")
```
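
**Usage**

The script above expects two positional arguments: the path to the original Hugging Face checkpoint (used only for its config and tokenizer) and the directory containing the five ONNX files, e.g. `python run_onnx.py /path/to/Qwen2-VL-2B-Instruct /path/to/onnx` (where `run_onnx.py` is whatever file name you saved the snippet as).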

# Technical Information
- [EXPORT.md](EXPORT.md)