Devops-hestabit
committed on
Upload folder using huggingface_hub
Browse files- .gitattributes +2 -0
- ensemble/1/.tmp +0 -0
- ensemble/config.pbtxt +470 -0
- postprocessing/1/__pycache__/model.cpython-310.pyc +0 -0
- postprocessing/1/model.py +231 -0
- postprocessing/1/special_tokens_map.json +5 -0
- postprocessing/1/tokenizer.json +0 -0
- postprocessing/1/tokenizer.model +3 -0
- postprocessing/1/tokenizer_config.json +43 -0
- postprocessing/config.pbtxt +113 -0
- preprocessing/1/__pycache__/model.cpython-310.pyc +0 -0
- preprocessing/1/model.py +373 -0
- preprocessing/1/special_tokens_map.json +5 -0
- preprocessing/1/tokenizer.json +0 -0
- preprocessing/1/tokenizer.model +3 -0
- preprocessing/1/tokenizer_config.json +43 -0
- preprocessing/config.pbtxt +156 -0
- tensorrt_llm/1/.gitkeep +0 -0
- tensorrt_llm/1/config.json +148 -0
- tensorrt_llm/1/model.py +782 -0
- tensorrt_llm/1/rank0.engine +3 -0
- tensorrt_llm/1/rank1.engine +3 -0
- tensorrt_llm/config.pbtxt +537 -0
- tensorrt_llm_bls/1/__pycache__/model.cpython-310.pyc +0 -0
- tensorrt_llm_bls/1/lib/__pycache__/decode.cpython-310.pyc +0 -0
- tensorrt_llm_bls/1/lib/__pycache__/triton_decoder.cpython-310.pyc +0 -0
- tensorrt_llm_bls/1/lib/decode.py +333 -0
- tensorrt_llm_bls/1/lib/triton_decoder.py +440 -0
- tensorrt_llm_bls/1/model.py +131 -0
- tensorrt_llm_bls/config.pbtxt +253 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
tensorrt_llm/1/rank0.engine filter=lfs diff=lfs merge=lfs -text
|
37 |
+
tensorrt_llm/1/rank1.engine filter=lfs diff=lfs merge=lfs -text
|
ensemble/1/.tmp
ADDED
File without changes
|
ensemble/config.pbtxt
ADDED
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
name: "ensemble"
|
28 |
+
platform: "ensemble"
|
29 |
+
max_batch_size: 16
|
30 |
+
input [
|
31 |
+
{
|
32 |
+
name: "text_input"
|
33 |
+
data_type: TYPE_STRING
|
34 |
+
dims: [ -1 ]
|
35 |
+
},
|
36 |
+
{
|
37 |
+
name: "decoder_text_input"
|
38 |
+
data_type: TYPE_STRING
|
39 |
+
dims: [ -1 ]
|
40 |
+
optional: true
|
41 |
+
},
|
42 |
+
{
|
43 |
+
name: "max_tokens"
|
44 |
+
data_type: TYPE_INT32
|
45 |
+
dims: [ -1 ]
|
46 |
+
},
|
47 |
+
{
|
48 |
+
name: "bad_words"
|
49 |
+
data_type: TYPE_STRING
|
50 |
+
dims: [ -1 ]
|
51 |
+
optional: true
|
52 |
+
},
|
53 |
+
{
|
54 |
+
name: "stop_words"
|
55 |
+
data_type: TYPE_STRING
|
56 |
+
dims: [ -1 ]
|
57 |
+
optional: true
|
58 |
+
},
|
59 |
+
{
|
60 |
+
name: "end_id"
|
61 |
+
data_type: TYPE_INT32
|
62 |
+
dims: [ 1 ]
|
63 |
+
optional: true
|
64 |
+
},
|
65 |
+
{
|
66 |
+
name: "pad_id"
|
67 |
+
data_type: TYPE_INT32
|
68 |
+
dims: [ 1 ]
|
69 |
+
optional: true
|
70 |
+
},
|
71 |
+
{
|
72 |
+
name: "top_k"
|
73 |
+
data_type: TYPE_INT32
|
74 |
+
dims: [ 1 ]
|
75 |
+
optional: true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
name: "top_p"
|
79 |
+
data_type: TYPE_FP32
|
80 |
+
dims: [ 1 ]
|
81 |
+
optional: true
|
82 |
+
},
|
83 |
+
{
|
84 |
+
name: "temperature"
|
85 |
+
data_type: TYPE_FP32
|
86 |
+
dims: [ 1 ]
|
87 |
+
optional: true
|
88 |
+
},
|
89 |
+
{
|
90 |
+
name: "length_penalty"
|
91 |
+
data_type: TYPE_FP32
|
92 |
+
dims: [ 1 ]
|
93 |
+
optional: true
|
94 |
+
},
|
95 |
+
{
|
96 |
+
name: "repetition_penalty"
|
97 |
+
data_type: TYPE_FP32
|
98 |
+
dims: [ 1 ]
|
99 |
+
optional: true
|
100 |
+
},
|
101 |
+
{
|
102 |
+
name: "min_length"
|
103 |
+
data_type: TYPE_INT32
|
104 |
+
dims: [ 1 ]
|
105 |
+
optional: true
|
106 |
+
},
|
107 |
+
{
|
108 |
+
name: "presence_penalty"
|
109 |
+
data_type: TYPE_FP32
|
110 |
+
dims: [ 1 ]
|
111 |
+
optional: true
|
112 |
+
},
|
113 |
+
{
|
114 |
+
name: "frequency_penalty"
|
115 |
+
data_type: TYPE_FP32
|
116 |
+
dims: [ 1 ]
|
117 |
+
optional: true
|
118 |
+
},
|
119 |
+
{
|
120 |
+
name: "random_seed"
|
121 |
+
data_type: TYPE_UINT64
|
122 |
+
dims: [ 1 ]
|
123 |
+
optional: true
|
124 |
+
},
|
125 |
+
{
|
126 |
+
name: "return_log_probs"
|
127 |
+
data_type: TYPE_BOOL
|
128 |
+
dims: [ 1 ]
|
129 |
+
optional: true
|
130 |
+
},
|
131 |
+
{
|
132 |
+
name: "return_context_logits"
|
133 |
+
data_type: TYPE_BOOL
|
134 |
+
dims: [ 1 ]
|
135 |
+
optional: true
|
136 |
+
},
|
137 |
+
{
|
138 |
+
name: "return_generation_logits"
|
139 |
+
data_type: TYPE_BOOL
|
140 |
+
dims: [ 1 ]
|
141 |
+
optional: true
|
142 |
+
},
|
143 |
+
{
|
144 |
+
name: "beam_width"
|
145 |
+
data_type: TYPE_INT32
|
146 |
+
dims: [ 1 ]
|
147 |
+
optional: true
|
148 |
+
},
|
149 |
+
{
|
150 |
+
name: "stream"
|
151 |
+
data_type: TYPE_BOOL
|
152 |
+
dims: [ 1 ]
|
153 |
+
optional: true
|
154 |
+
},
|
155 |
+
{
|
156 |
+
name: "prompt_embedding_table"
|
157 |
+
data_type: TYPE_FP16
|
158 |
+
dims: [ -1, -1 ]
|
159 |
+
optional: true
|
160 |
+
},
|
161 |
+
{
|
162 |
+
name: "prompt_vocab_size"
|
163 |
+
data_type: TYPE_INT32
|
164 |
+
dims: [ 1 ]
|
165 |
+
optional: true
|
166 |
+
},
|
167 |
+
{
|
168 |
+
name: "embedding_bias_words"
|
169 |
+
data_type: TYPE_STRING
|
170 |
+
dims: [ -1 ]
|
171 |
+
optional: true
|
172 |
+
},
|
173 |
+
{
|
174 |
+
name: "embedding_bias_weights"
|
175 |
+
data_type: TYPE_FP32
|
176 |
+
dims: [ -1 ]
|
177 |
+
optional: true
|
178 |
+
}
|
179 |
+
]
|
180 |
+
output [
|
181 |
+
{
|
182 |
+
name: "text_output"
|
183 |
+
data_type: TYPE_STRING
|
184 |
+
dims: [ -1 ]
|
185 |
+
},
|
186 |
+
{
|
187 |
+
name: "cum_log_probs"
|
188 |
+
data_type: TYPE_FP32
|
189 |
+
dims: [ -1 ]
|
190 |
+
},
|
191 |
+
{
|
192 |
+
name: "output_log_probs"
|
193 |
+
data_type: TYPE_FP32
|
194 |
+
dims: [ -1, -1 ]
|
195 |
+
},
|
196 |
+
{
|
197 |
+
name: "context_logits"
|
198 |
+
data_type: TYPE_FP32
|
199 |
+
dims: [ -1, -1 ]
|
200 |
+
},
|
201 |
+
{
|
202 |
+
name: "generation_logits"
|
203 |
+
data_type: TYPE_FP32
|
204 |
+
dims: [ -1, -1, -1 ]
|
205 |
+
}
|
206 |
+
]
|
207 |
+
ensemble_scheduling {
|
208 |
+
step [
|
209 |
+
{
|
210 |
+
model_name: "preprocessing"
|
211 |
+
model_version: -1
|
212 |
+
input_map {
|
213 |
+
key: "QUERY"
|
214 |
+
value: "text_input"
|
215 |
+
}
|
216 |
+
input_map {
|
217 |
+
key: "DECODER_QUERY"
|
218 |
+
value: "decoder_text_input"
|
219 |
+
}
|
220 |
+
input_map {
|
221 |
+
key: "REQUEST_OUTPUT_LEN"
|
222 |
+
value: "max_tokens"
|
223 |
+
}
|
224 |
+
input_map {
|
225 |
+
key: "BAD_WORDS_DICT"
|
226 |
+
value: "bad_words"
|
227 |
+
}
|
228 |
+
input_map {
|
229 |
+
key: "STOP_WORDS_DICT"
|
230 |
+
value: "stop_words"
|
231 |
+
}
|
232 |
+
input_map {
|
233 |
+
key: "EMBEDDING_BIAS_WORDS"
|
234 |
+
value: "embedding_bias_words"
|
235 |
+
}
|
236 |
+
input_map {
|
237 |
+
key: "EMBEDDING_BIAS_WEIGHTS"
|
238 |
+
value: "embedding_bias_weights"
|
239 |
+
}
|
240 |
+
input_map {
|
241 |
+
key: "END_ID"
|
242 |
+
value: "end_id"
|
243 |
+
}
|
244 |
+
input_map {
|
245 |
+
key: "PAD_ID"
|
246 |
+
value: "pad_id"
|
247 |
+
}
|
248 |
+
output_map {
|
249 |
+
key: "REQUEST_INPUT_LEN"
|
250 |
+
value: "_REQUEST_INPUT_LEN"
|
251 |
+
}
|
252 |
+
output_map {
|
253 |
+
key: "INPUT_ID"
|
254 |
+
value: "_INPUT_ID"
|
255 |
+
}
|
256 |
+
output_map {
|
257 |
+
key: "REQUEST_DECODER_INPUT_LEN"
|
258 |
+
value: "_REQUEST_DECODER_INPUT_LEN"
|
259 |
+
}
|
260 |
+
output_map {
|
261 |
+
key: "DECODER_INPUT_ID"
|
262 |
+
value: "_DECODER_INPUT_ID"
|
263 |
+
}
|
264 |
+
output_map {
|
265 |
+
key: "REQUEST_OUTPUT_LEN"
|
266 |
+
value: "_REQUEST_OUTPUT_LEN"
|
267 |
+
}
|
268 |
+
output_map {
|
269 |
+
key: "STOP_WORDS_IDS"
|
270 |
+
value: "_STOP_WORDS_IDS"
|
271 |
+
}
|
272 |
+
output_map {
|
273 |
+
key: "BAD_WORDS_IDS"
|
274 |
+
value: "_BAD_WORDS_IDS"
|
275 |
+
}
|
276 |
+
output_map {
|
277 |
+
key: "EMBEDDING_BIAS"
|
278 |
+
value: "_EMBEDDING_BIAS"
|
279 |
+
}
|
280 |
+
output_map {
|
281 |
+
key: "OUT_END_ID"
|
282 |
+
value: "_PREPROCESSOR_END_ID"
|
283 |
+
}
|
284 |
+
output_map {
|
285 |
+
key: "OUT_PAD_ID"
|
286 |
+
value: "_PREPROCESSOR_PAD_ID"
|
287 |
+
}
|
288 |
+
},
|
289 |
+
{
|
290 |
+
model_name: "tensorrt_llm"
|
291 |
+
model_version: -1
|
292 |
+
input_map {
|
293 |
+
key: "input_ids"
|
294 |
+
value: "_INPUT_ID"
|
295 |
+
}
|
296 |
+
input_map {
|
297 |
+
key: "decoder_input_ids"
|
298 |
+
value: "_DECODER_INPUT_ID"
|
299 |
+
}
|
300 |
+
input_map {
|
301 |
+
key: "input_lengths"
|
302 |
+
value: "_REQUEST_INPUT_LEN"
|
303 |
+
}
|
304 |
+
input_map {
|
305 |
+
key: "decoder_input_lengths"
|
306 |
+
value: "_REQUEST_DECODER_INPUT_LEN"
|
307 |
+
}
|
308 |
+
input_map {
|
309 |
+
key: "request_output_len"
|
310 |
+
value: "_REQUEST_OUTPUT_LEN"
|
311 |
+
}
|
312 |
+
input_map {
|
313 |
+
key: "end_id"
|
314 |
+
value: "_PREPROCESSOR_END_ID"
|
315 |
+
}
|
316 |
+
input_map {
|
317 |
+
key: "pad_id"
|
318 |
+
value: "_PREPROCESSOR_PAD_ID"
|
319 |
+
}
|
320 |
+
input_map {
|
321 |
+
key: "embedding_bias"
|
322 |
+
value: "_EMBEDDING_BIAS"
|
323 |
+
}
|
324 |
+
input_map {
|
325 |
+
key: "runtime_top_k"
|
326 |
+
value: "top_k"
|
327 |
+
}
|
328 |
+
input_map {
|
329 |
+
key: "runtime_top_p"
|
330 |
+
value: "top_p"
|
331 |
+
}
|
332 |
+
input_map {
|
333 |
+
key: "temperature"
|
334 |
+
value: "temperature"
|
335 |
+
}
|
336 |
+
input_map {
|
337 |
+
key: "len_penalty"
|
338 |
+
value: "length_penalty"
|
339 |
+
}
|
340 |
+
input_map {
|
341 |
+
key: "repetition_penalty"
|
342 |
+
value: "repetition_penalty"
|
343 |
+
}
|
344 |
+
input_map {
|
345 |
+
key: "min_length"
|
346 |
+
value: "min_length"
|
347 |
+
}
|
348 |
+
input_map {
|
349 |
+
key: "presence_penalty"
|
350 |
+
value: "presence_penalty"
|
351 |
+
}
|
352 |
+
input_map {
|
353 |
+
key: "frequency_penalty"
|
354 |
+
value: "frequency_penalty"
|
355 |
+
}
|
356 |
+
input_map {
|
357 |
+
key: "random_seed"
|
358 |
+
value: "random_seed"
|
359 |
+
}
|
360 |
+
input_map {
|
361 |
+
key: "return_log_probs"
|
362 |
+
value: "return_log_probs"
|
363 |
+
}
|
364 |
+
input_map {
|
365 |
+
key: "return_context_logits"
|
366 |
+
value: "return_context_logits"
|
367 |
+
}
|
368 |
+
input_map {
|
369 |
+
key: "return_generation_logits"
|
370 |
+
value: "return_generation_logits"
|
371 |
+
}
|
372 |
+
input_map {
|
373 |
+
key: "beam_width"
|
374 |
+
value: "beam_width"
|
375 |
+
}
|
376 |
+
input_map {
|
377 |
+
key: "streaming"
|
378 |
+
value: "stream"
|
379 |
+
}
|
380 |
+
input_map {
|
381 |
+
key: "prompt_embedding_table"
|
382 |
+
value: "prompt_embedding_table"
|
383 |
+
}
|
384 |
+
input_map {
|
385 |
+
key: "prompt_vocab_size"
|
386 |
+
value: "prompt_vocab_size"
|
387 |
+
}
|
388 |
+
input_map {
|
389 |
+
key: "stop_words_list"
|
390 |
+
value: "_STOP_WORDS_IDS"
|
391 |
+
}
|
392 |
+
input_map {
|
393 |
+
key: "bad_words_list"
|
394 |
+
value: "_BAD_WORDS_IDS"
|
395 |
+
}
|
396 |
+
output_map {
|
397 |
+
key: "output_ids"
|
398 |
+
value: "_TOKENS_BATCH"
|
399 |
+
}
|
400 |
+
output_map {
|
401 |
+
key: "sequence_length"
|
402 |
+
value: "_SEQUENCE_LENGTH"
|
403 |
+
},
|
404 |
+
output_map {
|
405 |
+
key: "cum_log_probs"
|
406 |
+
value: "_CUM_LOG_PROBS"
|
407 |
+
}
|
408 |
+
output_map {
|
409 |
+
key: "output_log_probs"
|
410 |
+
value: "_OUTPUT_LOG_PROBS"
|
411 |
+
},
|
412 |
+
output_map {
|
413 |
+
key: "context_logits"
|
414 |
+
value: "_CONTEXT_LOGITS"
|
415 |
+
},
|
416 |
+
output_map {
|
417 |
+
key: "generation_logits"
|
418 |
+
value: "_GENERATION_LOGITS"
|
419 |
+
}
|
420 |
+
},
|
421 |
+
{
|
422 |
+
model_name: "postprocessing"
|
423 |
+
model_version: -1
|
424 |
+
input_map {
|
425 |
+
key: "TOKENS_BATCH"
|
426 |
+
value: "_TOKENS_BATCH"
|
427 |
+
}
|
428 |
+
input_map {
|
429 |
+
key: "CUM_LOG_PROBS"
|
430 |
+
value: "_CUM_LOG_PROBS"
|
431 |
+
}
|
432 |
+
input_map {
|
433 |
+
key: "OUTPUT_LOG_PROBS"
|
434 |
+
value: "_OUTPUT_LOG_PROBS"
|
435 |
+
}
|
436 |
+
input_map {
|
437 |
+
key: "CONTEXT_LOGITS"
|
438 |
+
value: "_CONTEXT_LOGITS"
|
439 |
+
}
|
440 |
+
input_map {
|
441 |
+
key: "GENERATION_LOGITS"
|
442 |
+
value: "_GENERATION_LOGITS"
|
443 |
+
}
|
444 |
+
input_map {
|
445 |
+
key: "SEQUENCE_LENGTH"
|
446 |
+
value: "_SEQUENCE_LENGTH"
|
447 |
+
}
|
448 |
+
output_map {
|
449 |
+
key: "OUTPUT"
|
450 |
+
value: "text_output"
|
451 |
+
}
|
452 |
+
output_map {
|
453 |
+
key: "OUT_OUTPUT_LOG_PROBS"
|
454 |
+
value: "output_log_probs"
|
455 |
+
}
|
456 |
+
output_map {
|
457 |
+
key: "OUT_CUM_LOG_PROBS"
|
458 |
+
value: "cum_log_probs"
|
459 |
+
}
|
460 |
+
output_map {
|
461 |
+
key: "OUT_CONTEXT_LOGITS"
|
462 |
+
value: "context_logits"
|
463 |
+
}
|
464 |
+
output_map {
|
465 |
+
key: "OUT_GENERATION_LOGITS"
|
466 |
+
value: "generation_logits"
|
467 |
+
}
|
468 |
+
}
|
469 |
+
]
|
470 |
+
}
|
postprocessing/1/__pycache__/model.cpython-310.pyc
ADDED
Binary file (5.33 kB). View file
|
|
postprocessing/1/model.py
ADDED
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
import json
|
28 |
+
|
29 |
+
import numpy as np
|
30 |
+
import triton_python_backend_utils as pb_utils
|
31 |
+
from transformers import AutoTokenizer
|
32 |
+
|
33 |
+
|
34 |
+
class TritonPythonModel:
    """Triton Python-backend model that turns generated token IDs into text.

    Triton requires every Python model to expose a class with exactly this
    name ("TritonPythonModel").
    """

    # Lower-cased string values accepted for the 'skip_special_tokens'
    # model parameter.
    _TRUE_STRINGS = frozenset({'true', '1', 't', 'y', 'yes'})
    _FALSE_STRINGS = frozenset({'false', '0', 'f', 'n', 'no'})

    # Tensors forwarded unchanged from request to response, as
    # (input name, output name, rank of the zero placeholder emitted when
    # the input is absent from the request).
    _PASSTHROUGH_TENSORS = (
        ('CUM_LOG_PROBS', 'OUT_CUM_LOG_PROBS', 2),
        ('OUTPUT_LOG_PROBS', 'OUT_OUTPUT_LOG_PROBS', 3),
        ('CONTEXT_LOGITS', 'OUT_CONTEXT_LOGITS', 3),
        ('GENERATION_LOGITS', 'OUT_GENERATION_LOGITS', 4),
    )

    def initialize(self, args):
        """Load the tokenizer and cache settings; called once at model load.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        # Parse model configs
        model_config = json.loads(args['model_config'])
        parameters = model_config['parameters']
        tokenizer_dir = parameters['tokenizer_dir']['string_value']

        self.skip_special_tokens = self._parse_skip_special_tokens(
            parameters.get('skip_special_tokens'))

        # NOTE(review): trust_remote_code=True allows Python code shipped
        # alongside the tokenizer files to run at load time; only point
        # 'tokenizer_dir' at a trusted location.
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                                       legacy=False,
                                                       padding_side='left',
                                                       trust_remote_code=True)
        # Fall back to the EOS token when the tokenizer defines no pad token.
        if not self.tokenizer.pad_token:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Parse model output configs
        output_config = pb_utils.get_output_config_by_name(
            model_config, "OUTPUT")

        # Convert Triton types to numpy types
        self.output_dtype = pb_utils.triton_string_to_numpy(
            output_config['data_type'])

    def _parse_skip_special_tokens(self, parameter):
        """Interpret the optional 'skip_special_tokens' model parameter.

        Parameters
        ----------
        parameter : dict or None
          The parameter entry from the model config (its 'string_value' is
          read), or None when the parameter is not configured.

        Returns
        -------
        bool
          The parsed flag. True is used, with a printed warning, when the
          parameter is missing or its value is not a recognised boolean
          string.
        """
        if parameter is None:
            print(
                "[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default."
            )
            return True

        normalized = parameter['string_value'].lower()
        if normalized in self._TRUE_STRINGS:
            return True
        if normalized in self._FALSE_STRINGS:
            return False

        print(
            f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {parameter['string_value']}). Set it as True by default."
        )
        return True

    def execute(self, requests):
        """Decode the generated tokens of every request into text.

        For each request the 'OUTPUT' tensor holds the decoded strings (one
        entry per batch item and beam, flattened). The log-prob and logit
        inputs are forwarded unchanged under their 'OUT_*' names; when one of
        them is absent a single-element float32 zero tensor is emitted in its
        place.

        A request that lacks 'TOKENS_BATCH' or 'SEQUENCE_LENGTH' receives an
        error response instead of aborting the whole batch.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list is
          the same as `requests`
        """
        responses = []

        # Every Python backend must create exactly one
        # pb_utils.InferenceResponse per request.
        for request in requests:
            tokens_tensor = pb_utils.get_input_tensor_by_name(
                request, 'TOKENS_BATCH')
            lengths_tensor = pb_utils.get_input_tensor_by_name(
                request, 'SEQUENCE_LENGTH')

            # get_input_tensor_by_name returns None for a missing input;
            # report that on this request only.
            if tokens_tensor is None or lengths_tensor is None:
                responses.append(
                    pb_utils.InferenceResponse(
                        output_tensors=[],
                        error=pb_utils.TritonError(
                            "postprocessing requires both 'TOKENS_BATCH' "
                            "and 'SEQUENCE_LENGTH' inputs")))
                continue

            # Postprocessing output data.
            decoded_texts = self._postprocessing(tokens_tensor.as_numpy(),
                                                 lengths_tensor.as_numpy())

            # Create output tensors. You need pb_utils.Tensor
            # objects to create pb_utils.InferenceResponse.
            output_tensors = [
                pb_utils.Tensor(
                    'OUTPUT',
                    np.array(decoded_texts).astype(self.output_dtype))
            ]

            for input_name, output_name, placeholder_rank in (
                    self._PASSTHROUGH_TENSORS):
                tensor = pb_utils.get_input_tensor_by_name(
                    request, input_name)
                if tensor is not None:
                    values = tensor.as_numpy()
                else:
                    # Placeholder so every declared output is always present.
                    values = np.zeros((1, ) * placeholder_rank,
                                      dtype=np.float32)
                output_tensors.append(pb_utils.Tensor(output_name, values))

            responses.append(
                pb_utils.InferenceResponse(output_tensors=output_tensors))

        # Length of this list must match the length of `requests` list.
        return responses

    def finalize(self):
        """Log that the model is being unloaded; called once at unload."""
        print('Cleaning up...')

    def _postprocessing(self, tokens_batch, sequence_lengths):
        """Decode every beam of every batch item into UTF-8 bytes.

        Parameters
        ----------
        tokens_batch : array-like
          Token IDs indexed as [batch][beam][position].
        sequence_lengths : array-like
          Number of valid tokens, indexed as [batch][beam]; only the first
          `sequence_lengths[batch][beam]` tokens of each beam are decoded.

        Returns
        -------
        list of bytes
          One UTF-8 encoded string per (batch, beam) pair, in batch-major
          order.
        """
        outputs = []
        for batch_idx, beam_tokens in enumerate(tokens_batch):
            for beam_idx, tokens in enumerate(beam_tokens):
                seq_len = sequence_lengths[batch_idx][beam_idx]
                text = self.tokenizer.decode(
                    tokens[:seq_len],
                    skip_special_tokens=self.skip_special_tokens)
                outputs.append(text.encode('utf8'))
        return outputs
|
postprocessing/1/special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": "<s>",
|
3 |
+
"eos_token": "</s>",
|
4 |
+
"unk_token": "<unk>"
|
5 |
+
}
|
postprocessing/1/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
postprocessing/1/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
|
3 |
+
size 493443
|
postprocessing/1/tokenizer_config.json
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": true,
|
3 |
+
"add_eos_token": false,
|
4 |
+
"added_tokens_decoder": {
|
5 |
+
"0": {
|
6 |
+
"content": "<unk>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": false,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false,
|
11 |
+
"special": true
|
12 |
+
},
|
13 |
+
"1": {
|
14 |
+
"content": "<s>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": false,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false,
|
19 |
+
"special": true
|
20 |
+
},
|
21 |
+
"2": {
|
22 |
+
"content": "</s>",
|
23 |
+
"lstrip": false,
|
24 |
+
"normalized": false,
|
25 |
+
"rstrip": false,
|
26 |
+
"single_word": false,
|
27 |
+
"special": true
|
28 |
+
}
|
29 |
+
},
|
30 |
+
"additional_special_tokens": [],
|
31 |
+
"bos_token": "<s>",
|
32 |
+
"clean_up_tokenization_spaces": false,
|
33 |
+
"eos_token": "</s>",
|
34 |
+
"legacy": true,
|
35 |
+
"model_max_length": 1000000000000000019884624838656,
|
36 |
+
"pad_token": null,
|
37 |
+
"sp_model_kwargs": {},
|
38 |
+
"spaces_between_special_tokens": false,
|
39 |
+
"tokenizer_class": "LlamaTokenizer",
|
40 |
+
"unk_token": "<unk>",
|
41 |
+
"use_default_system_prompt": false,
|
42 |
+
"chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
|
43 |
+
}
|
postprocessing/config.pbtxt
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
name: "postprocessing"
|
28 |
+
backend: "python"
|
29 |
+
max_batch_size: 16
|
30 |
+
input [
|
31 |
+
{
|
32 |
+
name: "TOKENS_BATCH"
|
33 |
+
data_type: TYPE_INT32
|
34 |
+
dims: [ -1, -1 ]
|
35 |
+
},
|
36 |
+
{
|
37 |
+
name: "SEQUENCE_LENGTH"
|
38 |
+
data_type: TYPE_INT32
|
39 |
+
dims: [ -1 ]
|
40 |
+
},
|
41 |
+
{
|
42 |
+
name: "CUM_LOG_PROBS"
|
43 |
+
data_type: TYPE_FP32
|
44 |
+
dims: [ -1 ]
|
45 |
+
optional: true
|
46 |
+
},
|
47 |
+
{
|
48 |
+
name: "OUTPUT_LOG_PROBS"
|
49 |
+
data_type: TYPE_FP32
|
50 |
+
dims: [ -1, -1 ]
|
51 |
+
optional: true
|
52 |
+
},
|
53 |
+
{
|
54 |
+
name: "CONTEXT_LOGITS"
|
55 |
+
data_type: TYPE_FP32
|
56 |
+
dims: [ -1, -1 ]
|
57 |
+
optional: true
|
58 |
+
},
|
59 |
+
{
|
60 |
+
name: "GENERATION_LOGITS"
|
61 |
+
data_type: TYPE_FP32
|
62 |
+
dims: [ -1, -1, -1 ]
|
63 |
+
optional: true
|
64 |
+
}
|
65 |
+
]
|
66 |
+
output [
|
67 |
+
{
|
68 |
+
name: "OUTPUT"
|
69 |
+
data_type: TYPE_STRING
|
70 |
+
dims: [ -1 ]
|
71 |
+
},
|
72 |
+
{
|
73 |
+
name: "OUT_CUM_LOG_PROBS"
|
74 |
+
data_type: TYPE_FP32
|
75 |
+
dims: [ -1 ]
|
76 |
+
},
|
77 |
+
{
|
78 |
+
name: "OUT_OUTPUT_LOG_PROBS"
|
79 |
+
data_type: TYPE_FP32
|
80 |
+
dims: [ -1, -1 ]
|
81 |
+
},
|
82 |
+
{
|
83 |
+
name: "OUT_CONTEXT_LOGITS"
|
84 |
+
data_type: TYPE_FP32
|
85 |
+
dims: [ -1, -1 ]
|
86 |
+
},
|
87 |
+
{
|
88 |
+
name: "OUT_GENERATION_LOGITS"
|
89 |
+
data_type: TYPE_FP32
|
90 |
+
dims: [ -1, -1, -1 ]
|
91 |
+
}
|
92 |
+
]
|
93 |
+
|
94 |
+
parameters {
|
95 |
+
key: "tokenizer_dir"
|
96 |
+
value: {
|
97 |
+
string_value: "/all_models/inflight_batcher_llm/postprocessing/1"
|
98 |
+
}
|
99 |
+
}
|
100 |
+
|
101 |
+
parameters {
|
102 |
+
key: "skip_special_tokens"
|
103 |
+
value: {
|
104 |
+
string_value: "${skip_special_tokens}"
|
105 |
+
}
|
106 |
+
}
|
107 |
+
|
108 |
+
instance_group [
|
109 |
+
{
|
110 |
+
count: 1
|
111 |
+
kind: KIND_CPU
|
112 |
+
}
|
113 |
+
]
|
preprocessing/1/__pycache__/model.cpython-310.pyc
ADDED
Binary file (9.56 kB). View file
|
|
preprocessing/1/model.py
ADDED
@@ -0,0 +1,373 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
import json
|
28 |
+
from typing import List
|
29 |
+
|
30 |
+
import numpy as np
|
31 |
+
import triton_python_backend_utils as pb_utils
|
32 |
+
from transformers import AutoTokenizer, T5Tokenizer
|
33 |
+
|
34 |
+
|
35 |
+
class TritonPythonModel:
|
36 |
+
"""Your Python model must use the same class name. Every Python model
|
37 |
+
that is created must have "TritonPythonModel" as the class name.
|
38 |
+
"""
|
39 |
+
|
40 |
+
def initialize(self, args):
|
41 |
+
"""`initialize` is called only once when the model is being loaded.
|
42 |
+
Implementing `initialize` function is optional. This function allows
|
43 |
+
the model to initialize any state associated with this model.
|
44 |
+
Parameters
|
45 |
+
----------
|
46 |
+
args : dict
|
47 |
+
Both keys and values are strings. The dictionary keys and values are:
|
48 |
+
* model_config: A JSON string containing the model configuration
|
49 |
+
* model_instance_kind: A string containing model instance kind
|
50 |
+
* model_instance_device_id: A string containing model instance device ID
|
51 |
+
* model_repository: Model repository path
|
52 |
+
* model_version: Model version
|
53 |
+
* model_name: Model name
|
54 |
+
"""
|
55 |
+
# Parse model configs
|
56 |
+
model_config = json.loads(args['model_config'])
|
57 |
+
tokenizer_dir = model_config['parameters']['tokenizer_dir'][
|
58 |
+
'string_value']
|
59 |
+
|
60 |
+
add_special_tokens = model_config['parameters'].get(
|
61 |
+
'add_special_tokens')
|
62 |
+
if add_special_tokens is not None:
|
63 |
+
add_special_tokens_str = add_special_tokens['string_value'].lower()
|
64 |
+
if add_special_tokens_str in [
|
65 |
+
'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
|
66 |
+
]:
|
67 |
+
self.add_special_tokens = add_special_tokens_str in [
|
68 |
+
'true', '1', 't', 'y', 'yes'
|
69 |
+
]
|
70 |
+
else:
|
71 |
+
print(
|
72 |
+
f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {add_special_tokens['string_value']}). Set it as True by default."
|
73 |
+
)
|
74 |
+
self.add_special_tokens = True
|
75 |
+
else:
|
76 |
+
print(
|
77 |
+
f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default."
|
78 |
+
)
|
79 |
+
self.add_special_tokens = True
|
80 |
+
|
81 |
+
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
|
82 |
+
legacy=False,
|
83 |
+
padding_side='left',
|
84 |
+
trust_remote_code=True)
|
85 |
+
if isinstance(self.tokenizer, T5Tokenizer):
|
86 |
+
self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id()
|
87 |
+
|
88 |
+
if not self.tokenizer.pad_token:
|
89 |
+
self.tokenizer.pad_token = self.tokenizer.eos_token
|
90 |
+
|
91 |
+
self.tokenizer_end_id = self.tokenizer.encode(
|
92 |
+
self.tokenizer.eos_token, add_special_tokens=False)[0]
|
93 |
+
self.tokenizer_pad_id = self.tokenizer.encode(
|
94 |
+
self.tokenizer.pad_token, add_special_tokens=False)[0]
|
95 |
+
|
96 |
+
# Parse model output configs and convert Triton types to numpy types
|
97 |
+
output_names = [
|
98 |
+
"INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN",
|
99 |
+
"REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS",
|
100 |
+
"OUT_END_ID", "OUT_PAD_ID"
|
101 |
+
]
|
102 |
+
input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"]
|
103 |
+
for input_name in input_names:
|
104 |
+
setattr(
|
105 |
+
self,
|
106 |
+
input_name.lower() + "_dtype",
|
107 |
+
pb_utils.triton_string_to_numpy(
|
108 |
+
pb_utils.get_input_config_by_name(
|
109 |
+
model_config, input_name)['data_type']))
|
110 |
+
|
111 |
+
for output_name in output_names:
|
112 |
+
setattr(
|
113 |
+
self,
|
114 |
+
output_name.lower() + "_dtype",
|
115 |
+
pb_utils.triton_string_to_numpy(
|
116 |
+
pb_utils.get_output_config_by_name(
|
117 |
+
model_config, output_name)['data_type']))
|
118 |
+
|
119 |
+
def execute(self, requests):
|
120 |
+
"""`execute` must be implemented in every Python model. `execute`
|
121 |
+
function receives a list of pb_utils.InferenceRequest as the only
|
122 |
+
argument. This function is called when an inference is requested
|
123 |
+
for this model. Depending on the batching configuration (e.g. Dynamic
|
124 |
+
Batching) used, `requests` may contain multiple requests. Every
|
125 |
+
Python model, must create one pb_utils.InferenceResponse for every
|
126 |
+
pb_utils.InferenceRequest in `requests`. If there is an error, you can
|
127 |
+
set the error argument when creating a pb_utils.InferenceResponse.
|
128 |
+
Parameters
|
129 |
+
----------
|
130 |
+
requests : list
|
131 |
+
A list of pb_utils.InferenceRequest
|
132 |
+
Returns
|
133 |
+
-------
|
134 |
+
list
|
135 |
+
A list of pb_utils.InferenceResponse. The length of this list must
|
136 |
+
be the same as `requests`
|
137 |
+
"""
|
138 |
+
|
139 |
+
responses = []
|
140 |
+
|
141 |
+
# Every Python backend must iterate over everyone of the requests
|
142 |
+
# and create a pb_utils.InferenceResponse for each of them.
|
143 |
+
logger = pb_utils.Logger
|
144 |
+
for idx, request in enumerate(requests):
|
145 |
+
# Get input tensors
|
146 |
+
query = pb_utils.get_input_tensor_by_name(request,
|
147 |
+
'QUERY').as_numpy()
|
148 |
+
decoder_query = pb_utils.get_input_tensor_by_name(
|
149 |
+
request, 'DECODER_QUERY')
|
150 |
+
if decoder_query is not None:
|
151 |
+
decoder_query = decoder_query.as_numpy()
|
152 |
+
|
153 |
+
batch_dim = query.shape[0]
|
154 |
+
if batch_dim != 1:
|
155 |
+
|
156 |
+
err_str = "Inflight batching backend expects requests with batch size of 1."
|
157 |
+
logger.log_error(err_str)
|
158 |
+
responses.append(
|
159 |
+
pb_utils.InferenceResponse(
|
160 |
+
output_tensors=[],
|
161 |
+
error=pb_utils.TritonError(err_str)))
|
162 |
+
continue
|
163 |
+
|
164 |
+
request_output_len = pb_utils.get_input_tensor_by_name(
|
165 |
+
request, 'REQUEST_OUTPUT_LEN').as_numpy()
|
166 |
+
|
167 |
+
bad_words_dict = pb_utils.get_input_tensor_by_name(
|
168 |
+
request, 'BAD_WORDS_DICT')
|
169 |
+
if bad_words_dict is not None:
|
170 |
+
bad_words_dict = bad_words_dict.as_numpy()
|
171 |
+
|
172 |
+
stop_words_dict = pb_utils.get_input_tensor_by_name(
|
173 |
+
request, 'STOP_WORDS_DICT')
|
174 |
+
if stop_words_dict is not None:
|
175 |
+
stop_words_dict = stop_words_dict.as_numpy()
|
176 |
+
|
177 |
+
embedding_bias_words = pb_utils.get_input_tensor_by_name(
|
178 |
+
request, 'EMBEDDING_BIAS_WORDS')
|
179 |
+
if embedding_bias_words is not None:
|
180 |
+
embedding_bias_words = embedding_bias_words.as_numpy()
|
181 |
+
|
182 |
+
embedding_bias_weights = pb_utils.get_input_tensor_by_name(
|
183 |
+
request, 'EMBEDDING_BIAS_WEIGHTS')
|
184 |
+
if embedding_bias_weights is not None:
|
185 |
+
embedding_bias_weights = embedding_bias_weights.as_numpy()
|
186 |
+
|
187 |
+
# Take the end_id from the input tensors
|
188 |
+
# If not specified, use tokenizer to get end_id
|
189 |
+
end_id = pb_utils.get_input_tensor_by_name(request, 'END_ID')
|
190 |
+
if end_id is not None:
|
191 |
+
end_id = end_id.as_numpy()
|
192 |
+
else:
|
193 |
+
end_id = [[self.tokenizer_end_id]]
|
194 |
+
|
195 |
+
# Take the pad_id from the input tensors
|
196 |
+
# If not specified, use tokenizer to get pad_id
|
197 |
+
pad_id = pb_utils.get_input_tensor_by_name(request, 'PAD_ID')
|
198 |
+
if pad_id is not None:
|
199 |
+
pad_id = pad_id.as_numpy()
|
200 |
+
else:
|
201 |
+
pad_id = [[self.tokenizer_pad_id]]
|
202 |
+
|
203 |
+
# Preprocessing input data.
|
204 |
+
input_id, request_input_len = self._create_request(query)
|
205 |
+
print(input_id)
|
206 |
+
print(request_input_len)
|
207 |
+
if decoder_query is not None:
|
208 |
+
decoder_input_id, request_decoder_input_len = self._create_request(
|
209 |
+
decoder_query)
|
210 |
+
else:
|
211 |
+
decoder_input_id = pad_id * np.ones((1, 1), np.int32)
|
212 |
+
request_decoder_input_len = 1 * np.ones((1, 1), np.int32)
|
213 |
+
|
214 |
+
bad_words = self._to_word_list_format(bad_words_dict)
|
215 |
+
stop_words = self._to_word_list_format(stop_words_dict)
|
216 |
+
|
217 |
+
embedding_bias = self._get_embedding_bias(
|
218 |
+
embedding_bias_words, embedding_bias_weights,
|
219 |
+
self.embedding_bias_weights_dtype)
|
220 |
+
|
221 |
+
# Create output tensors. You need pb_utils.Tensor
|
222 |
+
# objects to create pb_utils.InferenceResponse.
|
223 |
+
input_id_tensor = pb_utils.Tensor(
|
224 |
+
'INPUT_ID', input_id.astype(self.input_id_dtype))
|
225 |
+
request_input_len_tensor = pb_utils.Tensor(
|
226 |
+
'REQUEST_INPUT_LEN',
|
227 |
+
request_input_len.astype(self.request_input_len_dtype))
|
228 |
+
decoder_input_id_tensor = pb_utils.Tensor(
|
229 |
+
'DECODER_INPUT_ID',
|
230 |
+
decoder_input_id.astype(self.decoder_input_id_dtype))
|
231 |
+
request_decoder_input_len_tensor = pb_utils.Tensor(
|
232 |
+
'REQUEST_DECODER_INPUT_LEN',
|
233 |
+
request_decoder_input_len.astype(
|
234 |
+
self.request_decoder_input_len_dtype))
|
235 |
+
request_output_len_tensor = pb_utils.Tensor(
|
236 |
+
'REQUEST_OUTPUT_LEN', request_output_len)
|
237 |
+
bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
|
238 |
+
stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS',
|
239 |
+
stop_words)
|
240 |
+
embedding_bias_tensor = pb_utils.Tensor('EMBEDDING_BIAS',
|
241 |
+
embedding_bias)
|
242 |
+
end_id_tensor = pb_utils.Tensor('OUT_END_ID',
|
243 |
+
np.array(end_id, dtype=np.int32))
|
244 |
+
pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID',
|
245 |
+
np.array(pad_id, dtype=np.int32))
|
246 |
+
|
247 |
+
inference_response = pb_utils.InferenceResponse(output_tensors=[
|
248 |
+
input_id_tensor, decoder_input_id_tensor, bad_words_ids_tensor,
|
249 |
+
stop_words_ids_tensor, request_input_len_tensor,
|
250 |
+
request_decoder_input_len_tensor, request_output_len_tensor,
|
251 |
+
embedding_bias_tensor, end_id_tensor, pad_id_tensor
|
252 |
+
])
|
253 |
+
responses.append(inference_response)
|
254 |
+
|
255 |
+
# You should return a list of pb_utils.InferenceResponse. Length
|
256 |
+
# of this list must match the length of `requests` list.
|
257 |
+
return responses
|
258 |
+
|
259 |
+
def finalize(self):
|
260 |
+
"""`finalize` is called only once when the model is being unloaded.
|
261 |
+
Implementing `finalize` function is optional. This function allows
|
262 |
+
the model to perform any necessary clean ups before exit.
|
263 |
+
"""
|
264 |
+
print('Cleaning up...')
|
265 |
+
|
266 |
+
def _create_request(self, query):
|
267 |
+
"""
|
268 |
+
query : batch string (2D numpy array)
|
269 |
+
"""
|
270 |
+
if isinstance(self.tokenizer, T5Tokenizer):
|
271 |
+
start_ids = [
|
272 |
+
np.array([self.tokenizer_bos_id] + self.tokenizer.encode(
|
273 |
+
s[0].decode(), add_special_tokens=self.add_special_tokens)
|
274 |
+
).astype(int) for s in query
|
275 |
+
]
|
276 |
+
else:
|
277 |
+
start_ids = [
|
278 |
+
np.array(
|
279 |
+
self.tokenizer.encode(
|
280 |
+
s[0].decode(),
|
281 |
+
add_special_tokens=self.add_special_tokens)).astype(
|
282 |
+
int) for s in query
|
283 |
+
]
|
284 |
+
start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int)
|
285 |
+
|
286 |
+
max_len = 0
|
287 |
+
for seq in start_ids:
|
288 |
+
max_len = max(max_len, seq.shape[0])
|
289 |
+
start_ids = np.stack([
|
290 |
+
np.pad(seq, (0, max_len - seq.shape[0]),
|
291 |
+
'constant',
|
292 |
+
constant_values=(0, self.tokenizer_pad_id))
|
293 |
+
for seq in start_ids
|
294 |
+
])
|
295 |
+
|
296 |
+
return start_ids, start_lengths
|
297 |
+
|
298 |
+
def _to_word_list_format(self, word_lists: List[List[str | bytes]]):
|
299 |
+
'''
|
300 |
+
word_lists format:
|
301 |
+
len(word_lists) == batch_size
|
302 |
+
word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum".
|
303 |
+
'''
|
304 |
+
assert self.tokenizer != None, "need to set tokenizer"
|
305 |
+
|
306 |
+
if word_lists is None:
|
307 |
+
# Return an empty array of shape (1,2,0)
|
308 |
+
return np.empty([1, 2, 0], dtype="int32")
|
309 |
+
|
310 |
+
flat_ids = []
|
311 |
+
offsets = []
|
312 |
+
for word_list in word_lists:
|
313 |
+
item_flat_ids = []
|
314 |
+
item_offsets = []
|
315 |
+
|
316 |
+
for word in word_list:
|
317 |
+
if isinstance(word, bytes):
|
318 |
+
word = word.decode()
|
319 |
+
|
320 |
+
ids = self.tokenizer.encode(word, add_special_tokens=False)
|
321 |
+
if len(ids) == 0:
|
322 |
+
continue
|
323 |
+
|
324 |
+
item_flat_ids += ids
|
325 |
+
item_offsets.append(len(ids))
|
326 |
+
|
327 |
+
flat_ids.append(np.array(item_flat_ids))
|
328 |
+
offsets.append(np.cumsum(np.array(item_offsets)))
|
329 |
+
|
330 |
+
pad_to = max(1, max(len(ids) for ids in flat_ids))
|
331 |
+
|
332 |
+
for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
|
333 |
+
flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)),
|
334 |
+
constant_values=0)
|
335 |
+
offsets[i] = np.pad(offs, (0, pad_to - len(offs)),
|
336 |
+
constant_values=-1)
|
337 |
+
|
338 |
+
return np.array([flat_ids, offsets], dtype="int32").transpose(
|
339 |
+
(1, 0, 2))
|
340 |
+
|
341 |
+
def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights,
|
342 |
+
bias_dtype):
|
343 |
+
|
344 |
+
assert self.tokenizer != None, "need to set tokenizer"
|
345 |
+
|
346 |
+
if embedding_bias_words is None or embedding_bias_weights is None:
|
347 |
+
return np.empty([1, 0], dtype=self.embedding_bias_weights_dtype)
|
348 |
+
|
349 |
+
batch_embedding_bias = []
|
350 |
+
for words, weights in zip(embedding_bias_words,
|
351 |
+
embedding_bias_weights):
|
352 |
+
|
353 |
+
vocab_size = self.tokenizer.vocab_size
|
354 |
+
embedding_bias = [0.] * vocab_size
|
355 |
+
|
356 |
+
assert len(words) == len(
|
357 |
+
weights
|
358 |
+
), "Embedding bias words must have same dimension as embedding bias weights"
|
359 |
+
|
360 |
+
for word, weight in zip(words, weights):
|
361 |
+
if isinstance(word, bytes):
|
362 |
+
word = word.decode()
|
363 |
+
ids = self.tokenizer.encode(word)
|
364 |
+
|
365 |
+
if len(ids) == 0:
|
366 |
+
continue
|
367 |
+
|
368 |
+
for id in ids:
|
369 |
+
embedding_bias[id] += weight
|
370 |
+
|
371 |
+
batch_embedding_bias.append(np.array(embedding_bias))
|
372 |
+
|
373 |
+
return np.array(batch_embedding_bias, dtype=bias_dtype)
|
preprocessing/1/special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": "<s>",
|
3 |
+
"eos_token": "</s>",
|
4 |
+
"unk_token": "<unk>"
|
5 |
+
}
|
preprocessing/1/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
preprocessing/1/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
|
3 |
+
size 493443
|
preprocessing/1/tokenizer_config.json
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": true,
|
3 |
+
"add_eos_token": false,
|
4 |
+
"added_tokens_decoder": {
|
5 |
+
"0": {
|
6 |
+
"content": "<unk>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": false,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false,
|
11 |
+
"special": true
|
12 |
+
},
|
13 |
+
"1": {
|
14 |
+
"content": "<s>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": false,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false,
|
19 |
+
"special": true
|
20 |
+
},
|
21 |
+
"2": {
|
22 |
+
"content": "</s>",
|
23 |
+
"lstrip": false,
|
24 |
+
"normalized": false,
|
25 |
+
"rstrip": false,
|
26 |
+
"single_word": false,
|
27 |
+
"special": true
|
28 |
+
}
|
29 |
+
},
|
30 |
+
"additional_special_tokens": [],
|
31 |
+
"bos_token": "<s>",
|
32 |
+
"clean_up_tokenization_spaces": false,
|
33 |
+
"eos_token": "</s>",
|
34 |
+
"legacy": true,
|
35 |
+
"model_max_length": 1000000000000000019884624838656,
|
36 |
+
"pad_token": null,
|
37 |
+
"sp_model_kwargs": {},
|
38 |
+
"spaces_between_special_tokens": false,
|
39 |
+
"tokenizer_class": "LlamaTokenizer",
|
40 |
+
"unk_token": "<unk>",
|
41 |
+
"use_default_system_prompt": false,
|
42 |
+
"chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
|
43 |
+
}
|
preprocessing/config.pbtxt
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
name: "preprocessing"
|
28 |
+
backend: "python"
|
29 |
+
max_batch_size: 16
|
30 |
+
input [
|
31 |
+
{
|
32 |
+
name: "QUERY"
|
33 |
+
data_type: TYPE_STRING
|
34 |
+
dims: [ -1 ]
|
35 |
+
},
|
36 |
+
{
|
37 |
+
name: "DECODER_QUERY"
|
38 |
+
data_type: TYPE_STRING
|
39 |
+
dims: [ -1 ]
|
40 |
+
optional: true
|
41 |
+
},
|
42 |
+
{
|
43 |
+
name: "REQUEST_OUTPUT_LEN"
|
44 |
+
data_type: TYPE_INT32
|
45 |
+
dims: [ -1 ]
|
46 |
+
},
|
47 |
+
{
|
48 |
+
name: "BAD_WORDS_DICT"
|
49 |
+
data_type: TYPE_STRING
|
50 |
+
dims: [ -1 ]
|
51 |
+
optional: true
|
52 |
+
},
|
53 |
+
{
|
54 |
+
name: "STOP_WORDS_DICT"
|
55 |
+
data_type: TYPE_STRING
|
56 |
+
dims: [ -1 ]
|
57 |
+
optional: true
|
58 |
+
},
|
59 |
+
{
|
60 |
+
name: "EMBEDDING_BIAS_WORDS"
|
61 |
+
data_type: TYPE_STRING
|
62 |
+
dims: [ -1 ]
|
63 |
+
optional: true
|
64 |
+
},
|
65 |
+
{
|
66 |
+
name: "EMBEDDING_BIAS_WEIGHTS"
|
67 |
+
data_type: TYPE_FP32
|
68 |
+
dims: [ -1 ]
|
69 |
+
optional: true
|
70 |
+
},
|
71 |
+
{
|
72 |
+
name: "END_ID"
|
73 |
+
data_type: TYPE_INT32
|
74 |
+
dims: [ -1 ]
|
75 |
+
optional: true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
name: "PAD_ID"
|
79 |
+
data_type: TYPE_INT32
|
80 |
+
dims: [ -1 ]
|
81 |
+
optional: true
|
82 |
+
}
|
83 |
+
]
|
84 |
+
output [
|
85 |
+
{
|
86 |
+
name: "INPUT_ID"
|
87 |
+
data_type: TYPE_INT32
|
88 |
+
dims: [ -1 ]
|
89 |
+
},
|
90 |
+
{
|
91 |
+
name: "REQUEST_INPUT_LEN"
|
92 |
+
data_type: TYPE_INT32
|
93 |
+
dims: [ 1 ]
|
94 |
+
},
|
95 |
+
{
|
96 |
+
name: "DECODER_INPUT_ID"
|
97 |
+
data_type: TYPE_INT32
|
98 |
+
dims: [ -1 ]
|
99 |
+
},
|
100 |
+
{
|
101 |
+
name: "REQUEST_DECODER_INPUT_LEN"
|
102 |
+
data_type: TYPE_INT32
|
103 |
+
dims: [ 1 ]
|
104 |
+
},
|
105 |
+
{
|
106 |
+
name: "BAD_WORDS_IDS"
|
107 |
+
data_type: TYPE_INT32
|
108 |
+
dims: [ 2, -1 ]
|
109 |
+
},
|
110 |
+
{
|
111 |
+
name: "STOP_WORDS_IDS"
|
112 |
+
data_type: TYPE_INT32
|
113 |
+
dims: [ 2, -1 ]
|
114 |
+
},
|
115 |
+
{
|
116 |
+
name: "EMBEDDING_BIAS"
|
117 |
+
data_type: TYPE_FP32
|
118 |
+
dims: [ -1 ]
|
119 |
+
},
|
120 |
+
{
|
121 |
+
name: "REQUEST_OUTPUT_LEN"
|
122 |
+
data_type: TYPE_INT32
|
123 |
+
dims: [ -1 ]
|
124 |
+
},
|
125 |
+
{
|
126 |
+
name: "OUT_END_ID"
|
127 |
+
data_type: TYPE_INT32
|
128 |
+
dims: [ -1 ]
|
129 |
+
},
|
130 |
+
{
|
131 |
+
name: "OUT_PAD_ID"
|
132 |
+
data_type: TYPE_INT32
|
133 |
+
dims: [ -1 ]
|
134 |
+
}
|
135 |
+
]
|
136 |
+
|
137 |
+
parameters {
|
138 |
+
key: "tokenizer_dir"
|
139 |
+
value: {
|
140 |
+
string_value: "/all_models/inflight_batcher_llm/postprocessing/1"
|
141 |
+
}
|
142 |
+
}
|
143 |
+
|
144 |
+
parameters {
|
145 |
+
key: "add_special_tokens"
|
146 |
+
value: {
|
147 |
+
string_value: "${add_special_tokens}"
|
148 |
+
}
|
149 |
+
}
|
150 |
+
|
151 |
+
instance_group [
|
152 |
+
{
|
153 |
+
count: 1
|
154 |
+
kind: KIND_CPU
|
155 |
+
}
|
156 |
+
]
|
tensorrt_llm/1/.gitkeep
ADDED
File without changes
|
tensorrt_llm/1/config.json
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"version": "0.11.0.dev2024062500",
|
3 |
+
"pretrained_config": {
|
4 |
+
"mlp_bias": false,
|
5 |
+
"attn_bias": false,
|
6 |
+
"rotary_base": 1000000.0,
|
7 |
+
"rotary_scaling": null,
|
8 |
+
"residual_mlp": false,
|
9 |
+
"disable_weight_only_quant_plugin": false,
|
10 |
+
"moe": {
|
11 |
+
"num_experts": 8,
|
12 |
+
"top_k": 2,
|
13 |
+
"normalization_mode": 1
|
14 |
+
},
|
15 |
+
"architecture": "LlamaForCausalLM",
|
16 |
+
"dtype": "float16",
|
17 |
+
"vocab_size": 32000,
|
18 |
+
"hidden_size": 4096,
|
19 |
+
"num_hidden_layers": 32,
|
20 |
+
"num_attention_heads": 32,
|
21 |
+
"hidden_act": "swiglu",
|
22 |
+
"logits_dtype": "float32",
|
23 |
+
"norm_epsilon": 1e-05,
|
24 |
+
"position_embedding_type": "rope_gpt_neox",
|
25 |
+
"max_position_embeddings": 32768,
|
26 |
+
"num_key_value_heads": 8,
|
27 |
+
"intermediate_size": 14336,
|
28 |
+
"mapping": {
|
29 |
+
"world_size": 2,
|
30 |
+
"gpus_per_node": 8,
|
31 |
+
"tp_size": 1,
|
32 |
+
"pp_size": 2,
|
33 |
+
"moe_tp_size": 1,
|
34 |
+
"moe_ep_size": 1
|
35 |
+
},
|
36 |
+
"quantization": {
|
37 |
+
"quant_algo": null,
|
38 |
+
"kv_cache_quant_algo": null,
|
39 |
+
"group_size": 128,
|
40 |
+
"smoothquant_val": null,
|
41 |
+
"has_zero_point": false,
|
42 |
+
"pre_quant_scale": false,
|
43 |
+
"exclude_modules": null
|
44 |
+
},
|
45 |
+
"use_parallel_embedding": false,
|
46 |
+
"embedding_sharding_dim": 0,
|
47 |
+
"share_embedding_table": false,
|
48 |
+
"head_size": 128,
|
49 |
+
"qk_layernorm": false
|
50 |
+
},
|
51 |
+
"build_config": {
|
52 |
+
"max_input_len": 28000,
|
53 |
+
"max_seq_len": 32500,
|
54 |
+
"opt_batch_size": null,
|
55 |
+
"max_batch_size": 16,
|
56 |
+
"max_beam_width": 1,
|
57 |
+
"max_num_tokens": 8192,
|
58 |
+
"opt_num_tokens": 16,
|
59 |
+
"max_prompt_embedding_table_size": 0,
|
60 |
+
"gather_context_logits": false,
|
61 |
+
"gather_generation_logits": false,
|
62 |
+
"strongly_typed": true,
|
63 |
+
"builder_opt": null,
|
64 |
+
"profiling_verbosity": "layer_names_only",
|
65 |
+
"enable_debug_output": false,
|
66 |
+
"max_draft_len": 0,
|
67 |
+
"speculative_decoding_mode": 1,
|
68 |
+
"use_refit": false,
|
69 |
+
"input_timing_cache": null,
|
70 |
+
"output_timing_cache": "model.cache",
|
71 |
+
"lora_config": {
|
72 |
+
"lora_dir": [],
|
73 |
+
"lora_ckpt_source": "hf",
|
74 |
+
"max_lora_rank": 64,
|
75 |
+
"lora_target_modules": [],
|
76 |
+
"trtllm_modules_to_hf_modules": {}
|
77 |
+
},
|
78 |
+
"auto_parallel_config": {
|
79 |
+
"world_size": 1,
|
80 |
+
"gpus_per_node": 8,
|
81 |
+
"cluster_key": "A100-SXM-80GB",
|
82 |
+
"cluster_info": null,
|
83 |
+
"sharding_cost_model": "alpha_beta",
|
84 |
+
"comm_cost_model": "alpha_beta",
|
85 |
+
"enable_pipeline_parallelism": false,
|
86 |
+
"enable_shard_unbalanced_shape": false,
|
87 |
+
"enable_shard_dynamic_shape": false,
|
88 |
+
"enable_reduce_scatter": true,
|
89 |
+
"builder_flags": null,
|
90 |
+
"debug_mode": false,
|
91 |
+
"infer_shape": true,
|
92 |
+
"validation_mode": false,
|
93 |
+
"same_buffer_io": {
|
94 |
+
"past_key_value_(\\d+)": "present_key_value_\\1"
|
95 |
+
},
|
96 |
+
"same_spec_io": {},
|
97 |
+
"sharded_io_allowlist": [
|
98 |
+
"past_key_value_\\d+",
|
99 |
+
"present_key_value_\\d*"
|
100 |
+
],
|
101 |
+
"fill_weights": false,
|
102 |
+
"parallel_config_cache": null,
|
103 |
+
"profile_cache": null,
|
104 |
+
"dump_path": null,
|
105 |
+
"debug_outputs": []
|
106 |
+
},
|
107 |
+
"weight_sparsity": false,
|
108 |
+
"weight_streaming": false,
|
109 |
+
"plugin_config": {
|
110 |
+
"dtype": "float16",
|
111 |
+
"bert_attention_plugin": "auto",
|
112 |
+
"gpt_attention_plugin": "auto",
|
113 |
+
"gemm_plugin": "float16",
|
114 |
+
"gemm_swiglu_plugin": null,
|
115 |
+
"smooth_quant_gemm_plugin": null,
|
116 |
+
"identity_plugin": null,
|
117 |
+
"layernorm_quantization_plugin": null,
|
118 |
+
"rmsnorm_quantization_plugin": null,
|
119 |
+
"nccl_plugin": "float16",
|
120 |
+
"lookup_plugin": null,
|
121 |
+
"lora_plugin": null,
|
122 |
+
"weight_only_groupwise_quant_matmul_plugin": null,
|
123 |
+
"weight_only_quant_matmul_plugin": null,
|
124 |
+
"quantize_per_token_plugin": false,
|
125 |
+
"quantize_tensor_plugin": false,
|
126 |
+
"moe_plugin": "auto",
|
127 |
+
"mamba_conv1d_plugin": "auto",
|
128 |
+
"context_fmha": true,
|
129 |
+
"context_fmha_fp32_acc": false,
|
130 |
+
"paged_kv_cache": true,
|
131 |
+
"remove_input_padding": true,
|
132 |
+
"use_custom_all_reduce": true,
|
133 |
+
"reduce_fusion": false,
|
134 |
+
"multi_block_mode": false,
|
135 |
+
"enable_xqa": true,
|
136 |
+
"attention_qk_half_accumulation": false,
|
137 |
+
"tokens_per_block": 64,
|
138 |
+
"use_paged_context_fmha": false,
|
139 |
+
"use_fp8_context_fmha": false,
|
140 |
+
"multiple_profiles": false,
|
141 |
+
"paged_state": true,
|
142 |
+
"streamingllm": false
|
143 |
+
},
|
144 |
+
"use_strip_plan": false,
|
145 |
+
"max_encoder_input_len": 1024,
|
146 |
+
"use_fused_mlp": false
|
147 |
+
}
|
148 |
+
}
|
tensorrt_llm/1/model.py
ADDED
@@ -0,0 +1,782 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datetime
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
import time
|
5 |
+
from threading import Lock, Thread
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
import triton_python_backend_utils as pb_utils
|
9 |
+
from torch import from_numpy
|
10 |
+
|
11 |
+
import tensorrt_llm.bindings.executor as trtllm
|
12 |
+
|
13 |
+
|
14 |
+
def get_input_tensor_by_name(request, name):
    """Fetch the input called ``name`` from a Triton request.

    Returns the result of the tensor's ``as_numpy()`` conversion, or
    ``None`` when the request does not carry an input with that name.
    """
    tensor = pb_utils.get_input_tensor_by_name(request, name)
    return None if tensor is None else tensor.as_numpy()
|
19 |
+
|
20 |
+
|
21 |
+
def get_input_scalar_by_name(request, name):
    """Fetch the input called ``name`` and unwrap it to a Python scalar.

    Returns ``None`` when the input is absent.

    Raises:
        pb_utils.TritonModelException: if the input holds anything other
            than exactly one element.
    """
    tensor = get_input_tensor_by_name(request, name)
    if tensor is None:
        return None
    if tensor.size != 1:
        raise pb_utils.TritonModelException(
            f"Expected a single value for {name}")
    return tensor.item()
|
29 |
+
|
30 |
+
|
31 |
+
def read_parameter_as_type(value, name, pytype=str):
    """Convert the string ``value`` of parameter ``name`` to ``pytype``.

    Returns ``None`` ("use the default") when the value is empty, when it
    still looks like an unfilled ``${...}`` template placeholder, or when
    the conversion fails (a warning is logged in that last case).
    Booleans are recognised case-insensitively: ``"1"`` and ``"true"``
    are ``True``, everything else is ``False``.
    """
    if value == "":
        return None
    if value.startswith("${") and value.endswith("}"):
        return None
    if pytype is bool:
        return value.lower() in ("1", "true")
    try:
        return pytype(value)
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        pb_utils.Logger.log_warning(
            f"Could not read parameter '{name}' with value '{value}', will use default."
        )
        return None
|
46 |
+
|
47 |
+
|
48 |
+
def get_parameter(model_config, name, pytype=str):
    """Read parameter ``name`` from the model configuration dict.

    The value is taken from ``model_config['parameters'][name]
    ['string_value']`` and converted with ``read_parameter_as_type``.
    Returns ``None`` when the parameter is not present.
    """
    parameters = model_config['parameters']
    if name not in parameters:
        return None
    return read_parameter_as_type(parameters[name]['string_value'], name,
                                  pytype)
|
53 |
+
|
54 |
+
|
55 |
+
def convert_word_list(word_list):
    """Unpack a flattened word-list tensor into a list of token-id lists.

    ``word_list[0]`` must be a pair ``(words, indices)``: ``words`` is the
    concatenation of all words' token ids and each entry of ``indices``
    is the exclusive end offset of one word inside ``words``. Entries
    equal to ``-1`` are skipped.

    Returns ``None`` when ``word_list`` is ``None``.

    Raises:
        pb_utils.TritonModelException: if the tensor is empty, its first
            row is not a pair, or an offset points past the end of
            ``words``.
    """
    if word_list is None:
        return None
    word_list = word_list.tolist()
    if len(word_list) == 0 or len(word_list[0]) != 2:
        raise pb_utils.TritonModelException("Invalid format for word list.")
    words, indices = word_list[0]
    result = []
    start = 0
    for end in indices:
        if end == -1:
            continue
        if end > len(words):
            raise pb_utils.TritonModelException(
                "Invalid format for word list.")
        if end > start:
            result.append(words[start:end])
            start = end
        else:
            # An offset that does not advance yields an empty word and
            # leaves the cursor where it was.
            result.append([])
    return result
|
76 |
+
|
77 |
+
|
78 |
+
def parse_medusa_choices(medusa_choices):
    """Parse a ``{a, b}, {c}``-style string into a list of int lists.

    Curly braces are rewritten to square brackets and the whole string is
    wrapped in one more pair of brackets before being parsed as JSON.
    Returns ``None`` when ``medusa_choices`` is ``None``.

    Raises:
        pb_utils.TritonModelException: if the string cannot be parsed or
            the result is not a non-empty list of lists of integers.
    """
    if medusa_choices is None:
        return None
    try:
        result = json.loads(
            "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]")
    except Exception as err:
        raise pb_utils.TritonModelException(
            "Invalid format for medusa_choices") from err
    # Explicit checks instead of ``assert`` so that validation still runs
    # when Python is started with -O.
    well_formed = (isinstance(result, list) and len(result) > 0
                   and all(isinstance(path, list) for path in result)
                   and all(
                       isinstance(token, int) for path in result
                       for token in path))
    if not well_formed:
        raise pb_utils.TritonModelException(
            "Invalid format for medusa_choices")
    return result
|
91 |
+
|
92 |
+
|
93 |
+
def get_sampling_config_from_request(request):
    """Build a ``trtllm.SamplingConfig`` from the request's scalar inputs.

    ``beam_width`` falls back to 1 when absent (or falsy). A ``top_p``
    that is absent or not positive is left out. Every other field is
    passed only when the corresponding input is present.
    """
    # SamplingConfig keyword -> name of the Triton input that feeds it.
    scalar_inputs = {
        'top_k': 'runtime_top_k',
        'top_p': 'runtime_top_p',
        'random_seed': 'random_seed',
        'temperature': 'temperature',
        'min_length': 'min_length',
        'repetition_penalty': 'repetition_penalty',
        'presence_penalty': 'presence_penalty',
        'frequency_penalty': 'frequency_penalty',
        'length_penalty': 'len_penalty',
        'top_p_min': 'runtime_top_p_min',
        'top_p_reset_ids': 'runtime_top_p_reset_ids',
        'top_p_decay': 'runtime_top_p_decay',
        'beam_search_diversity_rate': 'beam_search_diversity_rate',
        'early_stopping': 'early_stopping',
    }
    kwargs = {
        'beam_width': get_input_scalar_by_name(request, 'beam_width') or 1
    }
    for field, input_name in scalar_inputs.items():
        value = get_input_scalar_by_name(request, input_name)
        if value is not None:
            kwargs[field] = value
    if 'top_p' in kwargs and kwargs['top_p'] <= 0:
        del kwargs['top_p']
    return trtllm.SamplingConfig(**kwargs)
|
122 |
+
|
123 |
+
|
124 |
+
def get_output_config_from_request(request, exclude_input_from_output):
    """Build a ``trtllm.OutputConfig`` for one request.

    The three ``return_*`` flags come from the request inputs of the same
    name; ``exclude_input_from_output`` is supplied by the caller. Any
    value that is ``None`` is omitted from the constructor call.
    """
    kwargs = {
        name: get_input_scalar_by_name(request, name)
        for name in ('return_log_probs', 'return_context_logits',
                     'return_generation_logits')
    }
    kwargs["exclude_input_from_output"] = exclude_input_from_output
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    return trtllm.OutputConfig(**kwargs)
|
135 |
+
|
136 |
+
|
137 |
+
def get_external_draft_tokens_config_from_request(request):
    """Build a ``trtllm.ExternalDraftTokensConfig`` from the request.

    Reads ``draft_input_ids`` (converted to a list), ``draft_logits``
    (converted to a torch tensor) and the scalar
    ``draft_acceptance_threshold``. Returns ``None`` when none of them is
    present.
    """
    kwargs = {}
    draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids')
    if draft_input_ids is not None:
        kwargs['tokens'] = draft_input_ids.tolist()
    draft_logits = get_input_tensor_by_name(request, 'draft_logits')
    if draft_logits is not None:
        kwargs['logits'] = from_numpy(draft_logits)
    acceptance_threshold = get_input_scalar_by_name(
        request, 'draft_acceptance_threshold')
    if acceptance_threshold is not None:
        kwargs['acceptance_threshold'] = acceptance_threshold
    if not kwargs:
        return None
    return trtllm.ExternalDraftTokensConfig(**kwargs)
|
151 |
+
|
152 |
+
|
153 |
+
def get_prompt_tuning_config_from_request(request):
    """Build a ``trtllm.PromptTuningConfig`` from the request.

    Returns ``None`` when the request has no ``prompt_embedding_table``
    input; otherwise the table is handed over as a torch tensor.
    """
    # prompt_vocab_size is unused by executor.
    prompt_embedding_table = get_input_tensor_by_name(
        request, 'prompt_embedding_table')
    if prompt_embedding_table is None:
        return None
    return trtllm.PromptTuningConfig(
        embedding_table=from_numpy(prompt_embedding_table))
|
164 |
+
|
165 |
+
|
166 |
+
def get_lora_config_from_request(request):
    """Build a ``trtllm.LoraConfig`` from the request.

    Reads the scalar ``lora_task_id`` plus the ``lora_weights`` and
    ``lora_config`` tensors (both converted to torch tensors). Returns
    ``None`` when none of the three inputs is present.
    """
    kwargs = {}
    task_id = get_input_scalar_by_name(request, 'lora_task_id')
    if task_id is not None:
        kwargs["task_id"] = task_id
    lora_weights = get_input_tensor_by_name(request, 'lora_weights')
    if lora_weights is not None:
        kwargs["weights"] = from_numpy(lora_weights)
    lora_config = get_input_tensor_by_name(request, 'lora_config')
    if lora_config is not None:
        kwargs["config"] = from_numpy(lora_config)
    if not kwargs:
        return None
    return trtllm.LoraConfig(**kwargs)
|
179 |
+
|
180 |
+
|
181 |
+
def convert_request(request, exclude_input_from_output, decoupled):
    """Translate a Triton inference request into a ``trtllm.Request``.

    Args:
        request: the Triton request to read inputs from.
        exclude_input_from_output: forwarded to the output configuration.
        decoupled: whether the model runs with the decoupled transaction
            policy; streaming requests are rejected otherwise.

    Raises:
        pb_utils.TritonModelException: if ``input_ids`` or
            ``request_output_len`` is missing, ``input_ids`` is empty, or
            streaming is requested while not in decoupled mode.
    """
    inputs = {}
    input_token_ids = get_input_tensor_by_name(request, 'input_ids')
    if input_token_ids is None:
        raise pb_utils.TritonModelException(
            "A value is required for input_ids")
    input_token_ids = input_token_ids.tolist()
    if len(input_token_ids) == 0:
        raise pb_utils.TritonModelException("Invalid format for input_ids")
    # Only the first row of input_ids is forwarded.
    inputs['input_token_ids'] = input_token_ids[0]
    # input_lengths is not used by executor.
    inputs['max_new_tokens'] = get_input_scalar_by_name(
        request, 'request_output_len')
    if inputs['max_new_tokens'] is None:
        raise pb_utils.TritonModelException(
            "A value is required for request_output_len")
    inputs['streaming'] = get_input_scalar_by_name(request, 'streaming')
    if inputs['streaming'] and not decoupled:
        raise pb_utils.TritonModelException(
            "Streaming is only supported in decoupled mode.")
    inputs['end_id'] = get_input_scalar_by_name(request, 'end_id')
    inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id')
    inputs['stop_words'] = convert_word_list(
        get_input_tensor_by_name(request, 'stop_words_list'))
    inputs['bad_words'] = convert_word_list(
        get_input_tensor_by_name(request, 'bad_words_list'))
    embedding_bias = get_input_tensor_by_name(request, 'embedding_bias')
    if embedding_bias is not None and embedding_bias.size != 0:
        inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze()

    return trtllm.Request(
        **inputs,
        sampling_config=get_sampling_config_from_request(request),
        output_config=get_output_config_from_request(
            request, exclude_input_from_output),
        external_draft_tokens_config=
        get_external_draft_tokens_config_from_request(request),
        prompt_tuning_config=get_prompt_tuning_config_from_request(request),
        lora_config=get_lora_config_from_request(request),
    )
|
227 |
+
|
228 |
+
|
229 |
+
def convert_response(response):
    """Translate an executor response into a Triton response.

    Returns:
        A ``(pb_utils.InferenceResponse, is_final)`` pair. An executor
        error is turned into an error response and reported as final.

    The success response always carries ``output_ids`` (beams padded
    with -1 to the longest beam), ``sequence_length``, ``cum_log_probs``,
    ``output_log_probs``, ``context_logits`` and ``generation_logits``.
    The last four are zero-filled placeholders when the executor did not
    return them.
    """
    if response.has_error():
        return pb_utils.InferenceResponse(output_tensors=[],
                                          error=pb_utils.TritonError(
                                              response.error_msg)), True
    result = response.result
    beams = result.output_token_ids
    lengths = [len(beam) for beam in beams]
    beam_lengths = np.expand_dims(np.array(lengths, np.int32), 0)
    # default=0 keeps a result without any beam from raising ValueError.
    max_beam_length = max(lengths, default=0)
    output_ids = np.full((1, len(beams), max_beam_length), -1, np.int32)
    for idx, beam in enumerate(beams):
        output_ids[0, idx, :len(beam)] = beam

    def with_batch_dim(values, placeholder_shape):
        """Return ``values`` as float32 with a leading axis, or zeros."""
        if values is None:
            return np.zeros(placeholder_shape, np.float32)
        return np.expand_dims(np.array(values, np.float32), 0)

    output_tensors = [
        pb_utils.Tensor("output_ids", output_ids),
        pb_utils.Tensor("sequence_length", beam_lengths),
        pb_utils.Tensor("cum_log_probs",
                        with_batch_dim(result.cum_log_probs, (1, 1))),
        pb_utils.Tensor("output_log_probs",
                        with_batch_dim(result.log_probs, (1, 1, 1))),
        pb_utils.Tensor("context_logits",
                        with_batch_dim(result.context_logits, (1, 1, 1))),
        pb_utils.Tensor(
            "generation_logits",
            with_batch_dim(result.generation_logits, (1, 1, 1, 1))),
    ]
    return pb_utils.InferenceResponse(output_tensors), result.is_final
|
270 |
+
|
271 |
+
|
272 |
+
def convert_scheduler_policy(batch_scheduler_policy: str):
    """Map a policy name to ``trtllm.CapacitySchedulerPolicy``.

    Accepts ``"max_utilization"`` and ``"guaranteed_no_evict"``,
    case-insensitively.

    Raises:
        pb_utils.TritonModelException: for any other value.
    """
    policy = batch_scheduler_policy.lower()
    if policy == "max_utilization":
        return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION
    if policy == "guaranteed_no_evict":
        return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
    raise pb_utils.TritonModelException(
        f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported."
    )
|
280 |
+
|
281 |
+
|
282 |
+
def convert_batching_type(gpt_model_type: str):
    """Map a ``gpt_model_type`` name to ``trtllm.BatchingType``.

    ``"inflight_fused_batching"`` and ``"inflight_batching"`` select
    ``INFLIGHT``; ``"v1"`` selects ``STATIC``. Matching is
    case-insensitive. Returns ``None`` when ``gpt_model_type`` is
    ``None``.

    Raises:
        pb_utils.TritonModelException: for any other value.
    """
    if gpt_model_type is None:
        return None
    model_type = gpt_model_type.lower()
    if model_type in ("inflight_fused_batching", "inflight_batching"):
        return trtllm.BatchingType.INFLIGHT
    if model_type == "v1":
        return trtllm.BatchingType.STATIC
    raise pb_utils.TritonModelException(
        f"gpt_model_type value of '{gpt_model_type}' is not supported.")
|
293 |
+
|
294 |
+
|
295 |
+
def convert_decoding_mode(decoding_mode: str):
    """Map a decoding-mode name to a ``trtllm.DecodingMode`` value.

    Recognised names (case-sensitive): ``auto``, ``top_k``, ``top_p``,
    ``top_k_top_p``, ``beam_search`` and ``medusa``. Returns ``None`` when
    ``decoding_mode`` is ``None``.

    Raises:
        pb_utils.TritonModelException: for any other value.
    """
    if decoding_mode is None:
        return None
    # Name of the DecodingMode factory to call for each accepted string;
    # resolved lazily so only the selected factory is touched.
    factory_names = {
        "auto": "Auto",
        "top_k": "TopK",
        "top_p": "TopP",
        "top_k_top_p": "TopKTopP",
        "beam_search": "BeamSearch",
        "medusa": "Medusa",
    }
    factory_name = factory_names.get(decoding_mode)
    if factory_name is None:
        raise pb_utils.TritonModelException(
            f"decoding_mode value of '{decoding_mode}' is not supported.")
    return getattr(trtllm.DecodingMode, factory_name)()
|
312 |
+
|
313 |
+
|
314 |
+
def convert_timestamp_to_seconds(timestamp: str):
    """Convert a ``MM-DD-YYYY HH:MM:SS`` string to whole epoch seconds.

    The string is parsed into a naive ``datetime``, so ``timestamp()``
    interprets it in the process's local time zone.

    Raises:
        ValueError: if ``timestamp`` does not match the expected format.
    """
    parsed = datetime.datetime.strptime(timestamp, "%m-%d-%Y %H:%M:%S")
    return int(parsed.timestamp())
|
317 |
+
|
318 |
+
|
319 |
+
class TritonPythonModel:
|
320 |
+
"""Your Python model must use the same class name. Every Python model
|
321 |
+
that is created must have "TritonPythonModel" as the class name.
|
322 |
+
"""
|
323 |
+
|
324 |
+
def get_scheduler_config(self, model_config):
    """Build the ``trtllm.SchedulerConfig`` for this model.

    Uses the ``batch_scheduler_policy`` parameter when it is set and a
    default-constructed config otherwise.
    """
    batch_scheduler_policy = get_parameter(model_config,
                                           "batch_scheduler_policy")
    if batch_scheduler_policy is None:
        return trtllm.SchedulerConfig()
    return trtllm.SchedulerConfig(
        convert_scheduler_policy(batch_scheduler_policy))
|
331 |
+
|
332 |
+
def get_kv_cache_config(self, model_config):
    """Build the ``trtllm.KvCacheConfig`` from model parameters.

    Only parameters that are present and readable are passed on; the
    rest are left to the ``KvCacheConfig`` constructor.
    """
    # (KvCacheConfig keyword, model parameter name, value type)
    parameter_map = (
        ("enable_block_reuse", "enable_kv_cache_reuse", bool),
        ("max_tokens", "max_tokens_in_paged_kv_cache", int),
        ("sink_token_length", "sink_token_length", int),
        ("max_attention_window", "max_attention_window_size", int),
        ("free_gpu_memory_fraction", "kv_cache_free_gpu_mem_fraction",
         float),
        ("host_cache_size", "kv_cache_host_memory_bytes", int),
        ("onboard_blocks", "kv_cache_onboard_blocks", bool),
    )
    kwargs = {}
    for field, parameter, pytype in parameter_map:
        value = get_parameter(model_config, parameter, pytype)
        if value is not None:
            kwargs[field] = value
    return trtllm.KvCacheConfig(**kwargs)
|
352 |
+
|
353 |
+
def get_parallel_config(self, model_config):
    """Build the ``trtllm.ParallelConfig``, or ``None`` if nothing is set.

    ``gpu_device_ids`` (comma-separated integers) becomes ``device_ids``.
    When the ``TRTLLM_ORCHESTRATOR`` environment variable equals ``"1"``
    the orchestrator communication mode is selected and
    ``executor_worker_path`` is passed to the orchestrator config. The
    environment check is also stored in ``self.use_orchestrator_mode``.

    Raises:
        pb_utils.TritonModelException: if the retired ``worker_path``
            parameter is set while orchestrator mode is on.
    """
    kwargs = {}
    gpu_device_ids = get_parameter(model_config, "gpu_device_ids")
    if gpu_device_ids:
        kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")]
    self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR",
                                                "0") == "1"
    if self.use_orchestrator_mode:
        kwargs["communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR
        worker_path = get_parameter(model_config, "worker_path")
        if worker_path is not None:
            raise pb_utils.TritonModelException(
                "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable."
            )
        executor_worker_path = get_parameter(model_config,
                                             "executor_worker_path")
        kwargs["orchestrator_config"] = trtllm.OrchestratorConfig(
            True, executor_worker_path)
    if not kwargs:
        return None
    return trtllm.ParallelConfig(**kwargs)
|
375 |
+
|
376 |
+
def get_peft_cache_config(self, model_config):
    """Build the ``trtllm.PeftCacheConfig`` from ``lora_cache_*`` parameters.

    Only parameters that are present and readable are passed on.
    """
    # (PeftCacheConfig keyword, model parameter name, value type)
    parameter_map = (
        ("optimal_adapter_size", "lora_cache_optimal_adapter_size", int),
        ("max_adapter_size", "lora_cache_max_adapter_size", int),
        ("device_cache_percent", "lora_cache_gpu_memory_fraction", float),
        ("host_cache_size", "lora_cache_host_memory_bytes", int),
    )
    kwargs = {}
    for field, parameter, pytype in parameter_map:
        value = get_parameter(model_config, parameter, pytype)
        if value is not None:
            kwargs[field] = value
    return trtllm.PeftCacheConfig(**kwargs)
|
391 |
+
|
392 |
+
def get_decoding_config(self, model_config):
    """Build the ``trtllm.DecodingConfig`` from model parameters.

    Reads ``medusa_choices`` and ``decoding_mode``; either is omitted
    from the constructor call when it is not set.
    """
    kwargs = {
        "medusa_choices":
        parse_medusa_choices(get_parameter(model_config, "medusa_choices")),
        "decoding_mode":
        convert_decoding_mode(get_parameter(model_config, "decoding_mode")),
    }
    # The stray debug ``print(kwargs)`` that used to sit here was removed.
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    return trtllm.DecodingConfig(**kwargs)
|
404 |
+
|
405 |
+
def get_executor_config(self, model_config):
    """Assemble the ``trtllm.ExecutorConfig`` for this model.

    Combines the scalar parameters ``max_beam_width``,
    ``enable_chunked_context``, ``normalize_log_probs`` and
    ``gpt_model_type`` with the scheduler, KV-cache, parallel, PEFT-cache
    and decoding sub-configurations. Entries that resolve to ``None`` are
    not passed to the constructor.
    """
    candidates = {
        "max_beam_width":
        get_parameter(model_config, "max_beam_width", int),
        "scheduler_config":
        self.get_scheduler_config(model_config),
        "kv_cache_config":
        self.get_kv_cache_config(model_config),
        "enable_chunked_context":
        get_parameter(model_config, "enable_chunked_context", bool),
        "normalize_log_probs":
        get_parameter(model_config, "normalize_log_probs", bool),
        "batching_type":
        convert_batching_type(get_parameter(model_config, "gpt_model_type")),
        "parallel_config":
        self.get_parallel_config(model_config),
        "peft_cache_config":
        self.get_peft_cache_config(model_config),
        "decoding_config":
        self.get_decoding_config(model_config),
    }
    kwargs = {
        key: value
        for key, value in candidates.items() if value is not None
    }
    return trtllm.ExecutorConfig(**kwargs)
|
429 |
+
|
430 |
+
def create_metrics(self, model: str, version: str, is_v1_model: bool):
    """Create the Triton gauge metric families and their labelled metrics.

    Five ``GAUGE`` families are stored on ``self`` (request, runtime
    memory, KV-cache block, model-type specific and general).
    ``self.all_metrics`` maps a statistic key to the labelled metric that
    reports it. Every metric carries the ``model`` and ``version`` labels
    plus one family-specific label.

    Args:
        model: value of the ``model`` label.
        version: value of the ``version`` label.
        is_v1_model: selects the ``v1`` metric set instead of the
            ``inflight_batcher`` one for the model-type specific family.
    """

    def gauge_family(name, description):
        """Create one GAUGE metric family."""
        return pb_utils.MetricFamily(
            name=name,
            description=description,
            kind=pb_utils.MetricFamily.GAUGE,
        )

    self.request_metric_family = gauge_family("nv_trt_llm_request_metrics",
                                              "TRT LLM request metrics")
    self.runtime_memory_metric_family = gauge_family(
        "nv_trt_llm_runtime_memory_metrics",
        "TRT LLM runtime memory metrics")
    self.kv_cache_metric_family = gauge_family(
        "nv_trt_llm_kv_cache_block_metrics",
        "TRT LLM KV cache block metrics")
    model_type = "v1" if is_v1_model else "inflight_batcher"
    self.model_type_metric_family = gauge_family(
        f"nv_trt_llm_{model_type}_metrics",
        f"TRT LLM {model_type}-specific metrics")
    self.general_metric_family = gauge_family("nv_trt_llm_general_metrics",
                                              "General TRT LLM metrics")
    common_labels = {"model": model, "version": version}

    def labelled(family, label_name, label_values):
        """Create one metric per ``stat key -> label value`` entry."""
        return {
            key: family.Metric(labels={
                label_name: label_value,
                **common_labels
            })
            for key, label_value in label_values.items()
        }

    self.all_metrics = {
        # Request metrics
        **labelled(
            self.request_metric_family, "request_type", {
                "num_active_requests": "active",
                "max_num_active_requests": "max",
                "num_scheduled_requests": "scheduled",
                "num_context_requests": "context",
            }),
        # Runtime metrics
        **labelled(
            self.runtime_memory_metric_family, "memory_type", {
                "cpu_mem_usage": "cpu",
                "gpu_mem_usage": "gpu",
                "pinned_mem_usage": "pinned",
            }),
        # KV cache metrics
        **labelled(
            self.kv_cache_metric_family, "kv_cache_block_type", {
                "max_num_blocks": "max",
                "free_num_blocks": "free",
                "used_num_blocks": "used",
                "tokens_per_block": "tokens_per",
            }),
        # General metrics
        **labelled(self.general_metric_family, "general_type", {
            "timestamp": "timestamp",
            "iter": "iteration_counter",
        }),
    }
    if is_v1_model:
        self.all_metrics.update(
            labelled(
                self.model_type_metric_family, "v1_specific_metric", {
                    "num_ctx_tokens": "total_context_tokens",
                    "num_gen_tokens": "total_generation_tokens",
                    "empty_gen_slots": "empty_generation_slots",
                }))
    else:
        self.all_metrics.update(
            labelled(
                self.model_type_metric_family,
                "inflight_batcher_specific_metric", {
                    "num_ctx_tokens": "total_context_tokens",
                    "num_gen_requests": "generation_requests",
                    "micro_batch_id": "micro_batch_id",
                    "num_paused_requests": "paused_requests",
                }))
|
578 |
+
|
579 |
+
def initialize(self, args):
    """`initialize` is called only once when the model is being loaded.
    Implementing `initialize` function is optional. This function allows
    the model to initialize any state associated with this model.

    Here it builds the TensorRT-LLM executor from the model parameters,
    creates the metrics and, when this process may enqueue requests,
    starts the response, cancellation and metrics threads. Otherwise it
    calls ``executor.shutdown()`` right away.

    Parameters
    ----------
    args : dict
      Both keys and values are strings. The dictionary keys and values are:
      * model_config: A JSON string containing the model configuration
      * model_instance_kind: A string containing model instance kind
      * model_instance_device_id: A string containing model instance device ID
      * model_repository: Model repository path
      * model_version: Model version
      * model_name: Model name

    Raises
    ------
    pb_utils.TritonModelException
      If ``enable_trt_overlap`` is set, or the model is not configured
      with the decoupled transaction policy.
    """
    model_config = json.loads(args['model_config'])
    gpt_model_path = get_parameter(model_config, "gpt_model_path")
    if get_parameter(model_config, "enable_trt_overlap", bool):
        raise pb_utils.TritonModelException(
            "enable_trt_overlap=true is not supported.")
    self.exclude_input_from_output = get_parameter(
        model_config, "exclude_input_in_output", bool)
    executor_config = self.get_executor_config(model_config)
    self.executor = trtllm.Executor(gpt_model_path,
                                    trtllm.ModelType.DECODER_ONLY,
                                    executor_config)
    self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
        model_config)
    # Both polling periods fall back to 100 ms when unset (or zero).
    self.cancellation_check_period_ms = get_parameter(
        model_config, "cancellation_check_period_ms", int) or 100
    self.stats_check_period_ms = get_parameter(
        model_config, "stats_check_period_ms", int) or 100

    if not self.decoupled:
        raise pb_utils.TritonModelException(
            "Please enable decoupled transaction policy in the model configuration to serve this model"
        )

    self.create_metrics(args["model_name"],
                        args["model_version"],
                        is_v1_model=executor_config.batching_type ==
                        trtllm.BatchingType.STATIC)
    # Bookkeeping shared with the background threads; guarded by self.lock.
    self.triton_id_to_req_id = {}
    self.req_id_to_response_sender = {}
    self.lock = Lock()
    self.running = False
    self.awaiter_thread = Thread(target=self.awaiter_loop)
    self.cancellation_thread = Thread(target=self.cancellation_loop)
    self.metrics_thread = Thread(target=self.metrics_loop)
    if self.executor.can_enqueue_requests():
        self.running = True
        self.awaiter_thread.start()
        self.cancellation_thread.start()
        self.metrics_thread.start()
    else:
        # In leader mode, worker ranks will wait here until leader is done.
        self.executor.shutdown()
|
637 |
+
|
638 |
+
def handle_stop_request(self, triton_id, response_sender):
    """Cancel the in-flight request registered under ``triton_id``.

    An error response is sent when ``triton_id`` is ``None`` or empty.
    Otherwise the matching executor request (if any is still tracked) is
    cancelled and an empty response is sent. Either way the response is
    flagged as final.
    """
    if triton_id is None or triton_id == "":
        response_sender.send(
            pb_utils.InferenceResponse(error=pb_utils.TritonError(
                "A request id must be provided for request cancellation")),
            flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        return

    # The awaiter thread removes entries under self.lock, so the lookup
    # takes the same lock instead of a separate "in" test and index.
    with self.lock:
        req_id = self.triton_id_to_req_id.get(triton_id)
    if req_id is not None:
        self.executor.cancel_request(req_id)

    response_sender.send(
        pb_utils.InferenceResponse(),
        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
|
653 |
+
|
654 |
+
def execute(self, requests):
    """`execute` must be implemented in every Python model. `execute`
    function receives a list of pb_utils.InferenceRequest as the only
    argument. This function is called when an inference is requested
    for this model.

    Requests whose ``stop`` input is truthy are treated as cancellation
    requests. All others are converted and enqueued on the executor;
    a request that fails conversion gets a final error response.
    Results are delivered later through each request's response sender.

    Parameters
    ----------
    requests : list
      A list of pb_utils.InferenceRequest

    Returns
    -------
    None
      Responses are not returned from this function.
    """
    if not self.executor.can_enqueue_requests():
        return

    # Convert to executor requests.
    triton_requests = []
    executor_requests = []
    for request in requests:
        response_sender = request.get_response_sender()
        if get_input_scalar_by_name(request, 'stop'):
            self.handle_stop_request(request.request_id(), response_sender)
            continue
        try:
            converted = convert_request(request,
                                        self.exclude_input_from_output,
                                        self.decoupled)
        except Exception as e:
            response_sender.send(
                pb_utils.InferenceResponse(error=pb_utils.TritonError(
                    f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'"
                )),
                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        else:
            triton_requests.append(request)
            executor_requests.append(converted)

    if not executor_requests:
        # Nothing to enqueue; avoid taking the lock for an empty batch.
        return None

    # Enqueue and register under one lock so the awaiter thread cannot
    # see a response before its sender has been recorded.
    with self.lock:
        request_ids = self.executor.enqueue_requests(executor_requests)
        for req_id, request in zip(request_ids, triton_requests):
            triton_id = request.request_id()
            self.req_id_to_response_sender[
                req_id] = triton_id, request.get_response_sender()
            self.triton_id_to_req_id[triton_id] = req_id
    return None
|
704 |
+
|
705 |
+
def awaiter_loop(self):
    """Forward executor responses to the matching Triton response senders.

    Loops while `self.running` is true. Each response returned by
    `executor.await_responses` is looked up by its executor request id;
    responses with no registered sender are skipped. The response is
    converted with `convert_response` and sent, flagged as final when
    the conversion reports it as such. After a final response the
    bookkeeping entries for the request are removed.
    """
    while self.running:
        for response in self.executor.await_responses(
                timeout=datetime.timedelta(milliseconds=1)):
            req_id = response.request_id
            with self.lock:
                if req_id not in self.req_id_to_response_sender:
                    continue
                triton_id, response_sender = self.req_id_to_response_sender[
                    req_id]

            triton_response, is_final = convert_response(response)
            response_sender.send(
                triton_response,
                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
                if is_final else 0)

            if is_final:
                with self.lock:
                    # `execute` overwrites triton_id_to_req_id[triton_id]
                    # for every request, so requests sharing a Triton id
                    # leave only the newest mapping. Drop the entry only
                    # if it still belongs to this request; an
                    # unconditional `del` would raise KeyError here and
                    # terminate this thread.
                    if self.triton_id_to_req_id.get(triton_id) == req_id:
                        del self.triton_id_to_req_id[triton_id]
                    self.req_id_to_response_sender.pop(req_id, None)
            # Remove local reference so response_sender can be cleaned properly.
            del response_sender
|
729 |
+
|
730 |
+
def cancellation_loop(self):
    """Periodically cancel executor requests whose client has cancelled.

    Loops while `self.running` is true, sleeping
    `cancellation_check_period_ms` milliseconds between passes. On each
    pass, every registered response sender is polled under `self.lock`
    and `executor.cancel_request` is called for those reporting
    cancellation.
    """
    while self.running:
        time.sleep(self.cancellation_check_period_ms / 1000.0)
        with self.lock:
            pending = self.req_id_to_response_sender.items()
            for req_id, (_, sender) in pending:
                if sender.is_cancelled():
                    self.executor.cancel_request(req_id)
                # Drop the local reference so the response sender can be
                # cleaned up properly.
                del sender
|
741 |
+
|
742 |
+
def metrics_loop(self):
    """Periodically copy executor iteration stats into the Triton metrics.

    Loops while `self.running` is true, sleeping `stats_check_period_ms`
    milliseconds between passes. For every stat object returned by
    `executor.get_latest_iteration_stats`, each entry of
    `self.all_metrics` is looked up first on the stat itself and then on
    its `kv_cache_stats`, `static_batching_stats` and
    `inflight_batching_stats` members, in that order. A warning is
    logged when no value is found, or when processing a stat raises.
    """
    nested_groups = ("kv_cache_stats", "static_batching_stats",
                     "inflight_batching_stats")
    while self.running:
        time.sleep(self.stats_check_period_ms / 1000.0)
        for stat in self.executor.get_latest_iteration_stats():
            try:
                for key, metric in self.all_metrics.items():
                    value = None
                    if hasattr(stat, key):
                        value = getattr(stat, key)
                    else:
                        # Fall back to the nested stat groups; the first
                        # group that exposes the key wins.
                        for group_name in nested_groups:
                            group = getattr(stat, group_name)
                            if group is not None and hasattr(group, key):
                                value = getattr(group, key)
                                break

                    if value is None:
                        pb_utils.Logger.log_warn(
                            f"Metric \"{key}\" not found.")
                        continue

                    if key == "timestamp":
                        value = convert_timestamp_to_seconds(value)
                    metric.set(value)
            except Exception as e:
                pb_utils.Logger.log_warn(
                    f"Error while processing metrics: {e}")
|
771 |
+
|
772 |
+
def finalize(self):
    """Stop the background threads and shut the executor down.

    `finalize` is called only once when the model is being unloaded.
    Nothing is done when the executor reports that it cannot enqueue
    requests. Otherwise `self.running` is cleared so the background
    loops exit, the awaiter, cancellation and metrics threads are
    joined, and the executor is shut down.
    """
    if not self.executor.can_enqueue_requests():
        return

    # Clearing the flag lets each `while self.running` loop terminate.
    self.running = False
    self.awaiter_thread.join()
    self.cancellation_thread.join()
    self.metrics_thread.join()
    self.executor.shutdown()
|
tensorrt_llm/1/rank0.engine
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7acd8fbed6cb1b6373e7e00f78cb5910ce6deeb7f5f606ad6cb2123540049bf9
|
3 |
+
size 46722172788
|
tensorrt_llm/1/rank1.engine
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f1e59d5a7bb7d8d7c87dcc81f50518d6cb7d6a0e4fde1f60ac194e02fb4ac86e
|
3 |
+
size 46722181404
|
tensorrt_llm/config.pbtxt
ADDED
@@ -0,0 +1,537 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
name: "tensorrt_llm"
|
28 |
+
backend: "tensorrtllm"
|
29 |
+
max_batch_size: 16
|
30 |
+
|
31 |
+
model_transaction_policy {
|
32 |
+
decoupled: true
|
33 |
+
}
|
34 |
+
|
35 |
+
|
36 |
+
input [
|
37 |
+
{
|
38 |
+
name: "input_ids"
|
39 |
+
data_type: TYPE_INT32
|
40 |
+
dims: [ -1 ]
|
41 |
+
allow_ragged_batch: true
|
42 |
+
},
|
43 |
+
{
|
44 |
+
name: "input_lengths"
|
45 |
+
data_type: TYPE_INT32
|
46 |
+
dims: [ 1 ]
|
47 |
+
reshape: { shape: [ ] }
|
48 |
+
},
|
49 |
+
{
|
50 |
+
name: "request_output_len"
|
51 |
+
data_type: TYPE_INT32
|
52 |
+
dims: [ 1 ]
|
53 |
+
},
|
54 |
+
{
|
55 |
+
name: "draft_input_ids"
|
56 |
+
data_type: TYPE_INT32
|
57 |
+
dims: [ -1 ]
|
58 |
+
optional: true
|
59 |
+
allow_ragged_batch: true
|
60 |
+
},
|
61 |
+
{
|
62 |
+
name: "decoder_input_ids"
|
63 |
+
data_type: TYPE_INT32
|
64 |
+
dims: [ -1 ]
|
65 |
+
optional: true
|
66 |
+
allow_ragged_batch: true
|
67 |
+
},
|
68 |
+
{
|
69 |
+
name: "decoder_input_lengths"
|
70 |
+
data_type: TYPE_INT32
|
71 |
+
dims: [ 1 ]
|
72 |
+
optional: true
|
73 |
+
reshape: { shape: [ ] }
|
74 |
+
},
|
75 |
+
{
|
76 |
+
name: "draft_logits"
|
77 |
+
data_type: TYPE_FP32
|
78 |
+
dims: [ -1, -1 ]
|
79 |
+
optional: true
|
80 |
+
allow_ragged_batch: true
|
81 |
+
},
|
82 |
+
{
|
83 |
+
name: "draft_acceptance_threshold"
|
84 |
+
data_type: TYPE_FP32
|
85 |
+
dims: [ 1 ]
|
86 |
+
reshape: { shape: [ ] }
|
87 |
+
optional: true
|
88 |
+
},
|
89 |
+
{
|
90 |
+
name: "end_id"
|
91 |
+
data_type: TYPE_INT32
|
92 |
+
dims: [ 1 ]
|
93 |
+
reshape: { shape: [ ] }
|
94 |
+
optional: true
|
95 |
+
},
|
96 |
+
{
|
97 |
+
name: "pad_id"
|
98 |
+
data_type: TYPE_INT32
|
99 |
+
dims: [ 1 ]
|
100 |
+
reshape: { shape: [ ] }
|
101 |
+
optional: true
|
102 |
+
},
|
103 |
+
{
|
104 |
+
name: "stop_words_list"
|
105 |
+
data_type: TYPE_INT32
|
106 |
+
dims: [ 2, -1 ]
|
107 |
+
optional: true
|
108 |
+
allow_ragged_batch: true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
name: "bad_words_list"
|
112 |
+
data_type: TYPE_INT32
|
113 |
+
dims: [ 2, -1 ]
|
114 |
+
optional: true
|
115 |
+
allow_ragged_batch: true
|
116 |
+
},
|
117 |
+
{
|
118 |
+
name: "embedding_bias"
|
119 |
+
data_type: TYPE_FP32
|
120 |
+
dims: [ -1 ]
|
121 |
+
optional: true
|
122 |
+
allow_ragged_batch: true
|
123 |
+
},
|
124 |
+
{
|
125 |
+
name: "beam_width"
|
126 |
+
data_type: TYPE_INT32
|
127 |
+
dims: [ 1 ]
|
128 |
+
reshape: { shape: [ ] }
|
129 |
+
optional: true
|
130 |
+
},
|
131 |
+
{
|
132 |
+
name: "temperature"
|
133 |
+
data_type: TYPE_FP32
|
134 |
+
dims: [ 1 ]
|
135 |
+
reshape: { shape: [ ] }
|
136 |
+
optional: true
|
137 |
+
},
|
138 |
+
{
|
139 |
+
name: "runtime_top_k"
|
140 |
+
data_type: TYPE_INT32
|
141 |
+
dims: [ 1 ]
|
142 |
+
reshape: { shape: [ ] }
|
143 |
+
optional: true
|
144 |
+
},
|
145 |
+
{
|
146 |
+
name: "runtime_top_p"
|
147 |
+
data_type: TYPE_FP32
|
148 |
+
dims: [ 1 ]
|
149 |
+
reshape: { shape: [ ] }
|
150 |
+
optional: true
|
151 |
+
},
|
152 |
+
{
|
153 |
+
name: "runtime_top_p_min"
|
154 |
+
data_type: TYPE_FP32
|
155 |
+
dims: [ 1 ]
|
156 |
+
reshape: { shape: [ ] }
|
157 |
+
optional: true
|
158 |
+
},
|
159 |
+
{
|
160 |
+
name: "runtime_top_p_decay"
|
161 |
+
data_type: TYPE_FP32
|
162 |
+
dims: [ 1 ]
|
163 |
+
reshape: { shape: [ ] }
|
164 |
+
optional: true
|
165 |
+
},
|
166 |
+
{
|
167 |
+
name: "runtime_top_p_reset_ids"
|
168 |
+
data_type: TYPE_INT32
|
169 |
+
dims: [ 1 ]
|
170 |
+
reshape: { shape: [ ] }
|
171 |
+
optional: true
|
172 |
+
},
|
173 |
+
{
|
174 |
+
name: "len_penalty"
|
175 |
+
data_type: TYPE_FP32
|
176 |
+
dims: [ 1 ]
|
177 |
+
reshape: { shape: [ ] }
|
178 |
+
optional: true
|
179 |
+
},
|
180 |
+
{
|
181 |
+
name: "early_stopping"
|
182 |
+
data_type: TYPE_BOOL
|
183 |
+
dims: [ 1 ]
|
184 |
+
reshape: { shape: [ ] }
|
185 |
+
optional: true
|
186 |
+
},
|
187 |
+
{
|
188 |
+
name: "repetition_penalty"
|
189 |
+
data_type: TYPE_FP32
|
190 |
+
dims: [ 1 ]
|
191 |
+
reshape: { shape: [ ] }
|
192 |
+
optional: true
|
193 |
+
},
|
194 |
+
{
|
195 |
+
name: "min_length"
|
196 |
+
data_type: TYPE_INT32
|
197 |
+
dims: [ 1 ]
|
198 |
+
reshape: { shape: [ ] }
|
199 |
+
optional: true
|
200 |
+
},
|
201 |
+
{
|
202 |
+
name: "beam_search_diversity_rate"
|
203 |
+
data_type: TYPE_FP32
|
204 |
+
dims: [ 1 ]
|
205 |
+
reshape: { shape: [ ] }
|
206 |
+
optional: true
|
207 |
+
},
|
208 |
+
{
|
209 |
+
name: "presence_penalty"
|
210 |
+
data_type: TYPE_FP32
|
211 |
+
dims: [ 1 ]
|
212 |
+
reshape: { shape: [ ] }
|
213 |
+
optional: true
|
214 |
+
},
|
215 |
+
{
|
216 |
+
name: "frequency_penalty"
|
217 |
+
data_type: TYPE_FP32
|
218 |
+
dims: [ 1 ]
|
219 |
+
reshape: { shape: [ ] }
|
220 |
+
optional: true
|
221 |
+
},
|
222 |
+
{
|
223 |
+
name: "random_seed"
|
224 |
+
data_type: TYPE_UINT64
|
225 |
+
dims: [ 1 ]
|
226 |
+
reshape: { shape: [ ] }
|
227 |
+
optional: true
|
228 |
+
},
|
229 |
+
{
|
230 |
+
name: "return_log_probs"
|
231 |
+
data_type: TYPE_BOOL
|
232 |
+
dims: [ 1 ]
|
233 |
+
reshape: { shape: [ ] }
|
234 |
+
optional: true
|
235 |
+
},
|
236 |
+
{
|
237 |
+
name: "return_context_logits"
|
238 |
+
data_type: TYPE_BOOL
|
239 |
+
dims: [ 1 ]
|
240 |
+
reshape: { shape: [ ] }
|
241 |
+
optional: true
|
242 |
+
},
|
243 |
+
{
|
244 |
+
name: "return_generation_logits"
|
245 |
+
data_type: TYPE_BOOL
|
246 |
+
dims: [ 1 ]
|
247 |
+
reshape: { shape: [ ] }
|
248 |
+
optional: true
|
249 |
+
},
|
250 |
+
{
|
251 |
+
name: "stop"
|
252 |
+
data_type: TYPE_BOOL
|
253 |
+
dims: [ 1 ]
|
254 |
+
optional: true
|
255 |
+
},
|
256 |
+
{
|
257 |
+
name: "streaming"
|
258 |
+
data_type: TYPE_BOOL
|
259 |
+
dims: [ 1 ]
|
260 |
+
optional: true
|
261 |
+
},
|
262 |
+
{
|
263 |
+
name: "prompt_embedding_table"
|
264 |
+
data_type: TYPE_FP16
|
265 |
+
dims: [ -1, -1 ]
|
266 |
+
optional: true
|
267 |
+
allow_ragged_batch: true
|
268 |
+
},
|
269 |
+
{
|
270 |
+
name: "prompt_vocab_size"
|
271 |
+
data_type: TYPE_INT32
|
272 |
+
dims: [ 1 ]
|
273 |
+
reshape: { shape: [ ] }
|
274 |
+
optional: true
|
275 |
+
},
|
276 |
+
# the unique task ID for the given LoRA.
|
277 |
+
# To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given.
|
278 |
+
# The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
|
279 |
+
# If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
|
280 |
+
{
|
281 |
+
name: "lora_task_id"
|
282 |
+
data_type: TYPE_UINT64
|
283 |
+
dims: [ 1 ]
|
284 |
+
reshape: { shape: [ ] }
|
285 |
+
optional: true
|
286 |
+
},
|
287 |
+
# weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
|
288 |
+
# where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
|
289 |
+
# each of the in / out tensors are first flattened and then concatenated together in the format above.
|
290 |
+
# D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
|
291 |
+
{
|
292 |
+
name: "lora_weights"
|
293 |
+
data_type: TYPE_FP16
|
294 |
+
dims: [ -1, -1 ]
|
295 |
+
optional: true
|
296 |
+
allow_ragged_batch: true
|
297 |
+
},
|
298 |
+
# module identifier (same size as the first dimension of lora_weights)
|
299 |
+
# See LoraModule::ModuleType for model id mapping
|
300 |
+
#
|
301 |
+
# "attn_qkv": 0 # combined qkv adapter
|
302 |
+
# "attn_q": 1 # q adapter
|
303 |
+
# "attn_k": 2 # k adapter
|
304 |
+
# "attn_v": 3 # v adapter
|
305 |
+
# "attn_dense": 4 # adapter for the dense layer in attention
|
306 |
+
# "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
|
307 |
+
# "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
|
308 |
+
# "mlp_gate": 7 # for llama2 adapter for gated mlp layer after attention / RMSNorm: gate
|
309 |
+
#
|
310 |
+
# last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
|
311 |
+
{
|
312 |
+
name: "lora_config"
|
313 |
+
data_type: TYPE_INT32
|
314 |
+
dims: [ -1, 3 ]
|
315 |
+
optional: true
|
316 |
+
allow_ragged_batch: true
|
317 |
+
}
|
318 |
+
]
|
319 |
+
output [
|
320 |
+
{
|
321 |
+
name: "output_ids"
|
322 |
+
data_type: TYPE_INT32
|
323 |
+
dims: [ -1, -1 ]
|
324 |
+
},
|
325 |
+
{
|
326 |
+
name: "sequence_length"
|
327 |
+
data_type: TYPE_INT32
|
328 |
+
dims: [ -1 ]
|
329 |
+
},
|
330 |
+
{
|
331 |
+
name: "cum_log_probs"
|
332 |
+
data_type: TYPE_FP32
|
333 |
+
dims: [ -1 ]
|
334 |
+
},
|
335 |
+
{
|
336 |
+
name: "output_log_probs"
|
337 |
+
data_type: TYPE_FP32
|
338 |
+
dims: [ -1, -1 ]
|
339 |
+
},
|
340 |
+
{
|
341 |
+
name: "context_logits"
|
342 |
+
data_type: TYPE_FP32
|
343 |
+
dims: [ -1, -1 ]
|
344 |
+
},
|
345 |
+
{
|
346 |
+
name: "generation_logits"
|
347 |
+
data_type: TYPE_FP32
|
348 |
+
dims: [ -1, -1, -1 ]
|
349 |
+
}
|
350 |
+
]
|
351 |
+
instance_group [
|
352 |
+
{
|
353 |
+
count: 1
|
354 |
+
kind : KIND_CPU
|
355 |
+
}
|
356 |
+
]
|
357 |
+
parameters: {
|
358 |
+
key: "max_beam_width"
|
359 |
+
value: {
|
360 |
+
string_value: "1"
|
361 |
+
}
|
362 |
+
}
|
363 |
+
parameters: {
|
364 |
+
key: "FORCE_CPU_ONLY_INPUT_TENSORS"
|
365 |
+
value: {
|
366 |
+
string_value: "no"
|
367 |
+
}
|
368 |
+
}
|
369 |
+
parameters: {
|
370 |
+
key: "gpt_model_type"
|
371 |
+
value: {
|
372 |
+
string_value: "inflight_batching"
|
373 |
+
}
|
374 |
+
}
|
375 |
+
parameters: {
|
376 |
+
key: "gpt_model_path"
|
377 |
+
value: {
|
378 |
+
string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1"
|
379 |
+
}
|
380 |
+
}
|
381 |
+
parameters: {
|
382 |
+
key: "encoder_model_path"
|
383 |
+
value: {
|
384 |
+
string_value: "${encoder_engine_dir}"
|
385 |
+
}
|
386 |
+
}
|
387 |
+
parameters: {
|
388 |
+
key: "max_tokens_in_paged_kv_cache"
|
389 |
+
value: {
|
390 |
+
string_value: "${max_tokens_in_paged_kv_cache}"
|
391 |
+
}
|
392 |
+
}
|
393 |
+
parameters: {
|
394 |
+
key: "max_attention_window_size"
|
395 |
+
value: {
|
396 |
+
string_value: "${max_attention_window_size}"
|
397 |
+
}
|
398 |
+
}
|
399 |
+
parameters: {
|
400 |
+
key: "sink_token_length"
|
401 |
+
value: {
|
402 |
+
string_value: "${sink_token_length}"
|
403 |
+
}
|
404 |
+
}
|
405 |
+
parameters: {
|
406 |
+
key: "batch_scheduler_policy"
|
407 |
+
value: {
|
408 |
+
string_value: "guaranteed_no_evict"
|
409 |
+
}
|
410 |
+
}
|
411 |
+
parameters: {
|
412 |
+
key: "kv_cache_free_gpu_mem_fraction"
|
413 |
+
value: {
|
414 |
+
string_value: "0.8"
|
415 |
+
}
|
416 |
+
}
|
417 |
+
parameters: {
|
418 |
+
key: "kv_cache_host_memory_bytes"
|
419 |
+
value: {
|
420 |
+
string_value: "${kv_cache_host_memory_bytes}"
|
421 |
+
}
|
422 |
+
}
|
423 |
+
parameters: {
|
424 |
+
key: "kv_cache_onboard_blocks"
|
425 |
+
value: {
|
426 |
+
string_value: "${kv_cache_onboard_blocks}"
|
427 |
+
}
|
428 |
+
}
|
429 |
+
# enable_trt_overlap is deprecated and doesn't have any effect on the runtime
|
430 |
+
# parameters: {
|
431 |
+
# key: "enable_trt_overlap"
|
432 |
+
# value: {
|
433 |
+
# string_value: "${enable_trt_overlap}"
|
434 |
+
# }
|
435 |
+
# }
|
436 |
+
parameters: {
|
437 |
+
key: "exclude_input_in_output"
|
438 |
+
value: {
|
439 |
+
string_value: "true"
|
440 |
+
}
|
441 |
+
}
|
442 |
+
parameters: {
|
443 |
+
key: "cancellation_check_period_ms"
|
444 |
+
value: {
|
445 |
+
string_value: "${cancellation_check_period_ms}"
|
446 |
+
}
|
447 |
+
}
|
448 |
+
parameters: {
|
449 |
+
key: "stats_check_period_ms"
|
450 |
+
value: {
|
451 |
+
string_value: "${stats_check_period_ms}"
|
452 |
+
}
|
453 |
+
}
|
454 |
+
parameters: {
|
455 |
+
key: "iter_stats_max_iterations"
|
456 |
+
value: {
|
457 |
+
string_value: "${iter_stats_max_iterations}"
|
458 |
+
}
|
459 |
+
}
|
460 |
+
parameters: {
|
461 |
+
key: "request_stats_max_iterations"
|
462 |
+
value: {
|
463 |
+
string_value: "${request_stats_max_iterations}"
|
464 |
+
}
|
465 |
+
}
|
466 |
+
parameters: {
|
467 |
+
key: "enable_kv_cache_reuse"
|
468 |
+
value: {
|
469 |
+
string_value: "${enable_kv_cache_reuse}"
|
470 |
+
}
|
471 |
+
}
|
472 |
+
parameters: {
|
473 |
+
key: "normalize_log_probs"
|
474 |
+
value: {
|
475 |
+
string_value: "${normalize_log_probs}"
|
476 |
+
}
|
477 |
+
}
|
478 |
+
parameters: {
|
479 |
+
key: "enable_chunked_context"
|
480 |
+
value: {
|
481 |
+
string_value: "${enable_chunked_context}"
|
482 |
+
}
|
483 |
+
}
|
484 |
+
parameters: {
|
485 |
+
key: "gpu_device_ids"
|
486 |
+
value: {
|
487 |
+
string_value: "${gpu_device_ids}"
|
488 |
+
}
|
489 |
+
}
|
490 |
+
parameters: {
|
491 |
+
key: "lora_cache_optimal_adapter_size"
|
492 |
+
value: {
|
493 |
+
string_value: "${lora_cache_optimal_adapter_size}"
|
494 |
+
}
|
495 |
+
}
|
496 |
+
parameters: {
|
497 |
+
key: "lora_cache_max_adapter_size"
|
498 |
+
value: {
|
499 |
+
string_value: "${lora_cache_max_adapter_size}"
|
500 |
+
}
|
501 |
+
}
|
502 |
+
parameters: {
|
503 |
+
key: "lora_cache_gpu_memory_fraction"
|
504 |
+
value: {
|
505 |
+
string_value: "${lora_cache_gpu_memory_fraction}"
|
506 |
+
}
|
507 |
+
}
|
508 |
+
parameters: {
|
509 |
+
key: "lora_cache_host_memory_bytes"
|
510 |
+
value: {
|
511 |
+
string_value: "${lora_cache_host_memory_bytes}"
|
512 |
+
}
|
513 |
+
}
|
514 |
+
parameters: {
|
515 |
+
key: "decoding_mode"
|
516 |
+
value: {
|
517 |
+
string_value: "${decoding_mode}"
|
518 |
+
}
|
519 |
+
}
|
520 |
+
parameters: {
|
521 |
+
key: "executor_worker_path"
|
522 |
+
value: {
|
523 |
+
string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
|
524 |
+
}
|
525 |
+
}
|
526 |
+
parameters: {
|
527 |
+
key: "medusa_choices"
|
528 |
+
value: {
|
529 |
+
string_value: "${medusa_choices}"
|
530 |
+
}
|
531 |
+
}
|
532 |
+
parameters: {
|
533 |
+
key: "gpu_weights_percent"
|
534 |
+
value: {
|
535 |
+
string_value: "${gpu_weights_percent}"
|
536 |
+
}
|
537 |
+
}
|
tensorrt_llm_bls/1/__pycache__/model.cpython-310.pyc
ADDED
Binary file (2.72 kB). View file
|
|
tensorrt_llm_bls/1/lib/__pycache__/decode.cpython-310.pyc
ADDED
Binary file (9.05 kB). View file
|
|
tensorrt_llm_bls/1/lib/__pycache__/triton_decoder.cpython-310.pyc
ADDED
Binary file (9.73 kB). View file
|
|
tensorrt_llm_bls/1/lib/decode.py
ADDED
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
from collections.abc import Generator
from dataclasses import dataclass, field
from typing import Optional

import numpy as np
|
32 |
+
|
33 |
+
|
34 |
+
class RequestValidationError(Exception):
    """Raised when a request fails one of the validation checks."""
|
36 |
+
|
37 |
+
|
38 |
+
def _validate_that(condition: bool, msg: str):
    """Raise RequestValidationError with `msg` unless `condition` is truthy."""
    if condition:
        return
    raise RequestValidationError(msg)
|
41 |
+
|
42 |
+
|
43 |
+
def _validate_non_empty(data, msg: str):
    """Require `data` to be present (not None) and to have a non-zero size.

    Raises RequestValidationError with `msg` otherwise.
    """
    has_content = data is not None and data.size > 0
    _validate_that(has_content, msg)
|
45 |
+
|
46 |
+
|
47 |
+
def _validate_single_gt_0(data, msg: str):
    """Require `data` to be non-empty with a first element greater than zero.

    Only the first element of the flattened data is inspected. Raises
    RequestValidationError with `msg` when either check fails.
    """
    _validate_non_empty(data, msg)
    first_element = data.flatten()[0]
    _validate_that(first_element > 0, msg)
|
50 |
+
|
51 |
+
|
52 |
+
def _single_value(data: Optional[np.ndarray]):
    """Return the first element of the flattened `data`, or None if `data` is None."""
    return None if data is None else data.flatten()[0]
|
56 |
+
|
57 |
+
|
58 |
+
@dataclass
class Request:
    """Inputs of a single decoding request.

    `text_input` and `max_tokens` are required (see `validate`); every
    other field is optional and defaults to None.
    """
    # The required fields default to a fresh empty array per instance. A
    # plain `np.array([])` default would be shared by all instances and
    # is rejected by `dataclass` on Python 3.11+ (unhashable default).
    text_input: np.ndarray = field(default_factory=lambda: np.array([]))
    decoder_text_input: Optional[np.ndarray] = None
    max_tokens: np.ndarray = field(default_factory=lambda: np.array([]))
    bad_words: Optional[np.ndarray] = None
    stop_words: Optional[np.ndarray] = None
    end_id: Optional[np.ndarray] = None
    pad_id: Optional[np.ndarray] = None
    top_k: Optional[np.ndarray] = None
    top_p: Optional[np.ndarray] = None
    temperature: Optional[np.ndarray] = None
    length_penalty: Optional[np.ndarray] = None
    repetition_penalty: Optional[np.ndarray] = None
    min_length: Optional[np.ndarray] = None
    return_log_probs: Optional[np.ndarray] = None
    prompt_embedding_table: Optional[np.ndarray] = None
    prompt_vocab_size: Optional[np.ndarray] = None
    embedding_bias_words: Optional[np.ndarray] = None
    embedding_bias_weights: Optional[np.ndarray] = None
    num_draft_tokens: Optional[np.ndarray] = None
    use_draft_logits: Optional[np.ndarray] = None
    stream: Optional[np.ndarray] = None
    beam_width: Optional[np.ndarray] = None
    return_context_logits: Optional[np.ndarray] = None
    return_generation_logits: Optional[np.ndarray] = None
    random_seed: Optional[np.ndarray] = None
    presence_penalty: Optional[np.ndarray] = None
    frequency_penalty: Optional[np.ndarray] = None

    def validate(self):
        """Check the request for missing or incompatible inputs.

        Raises
        ------
        RequestValidationError
          If `text_input` is empty, if `max_tokens` is empty or its
          first element is not > 0, or if draft tokens are requested
          together with streaming or context logits.
        """
        _validate_non_empty(self.text_input, "text_input is required")
        _validate_single_gt_0(self.max_tokens,
                              "max_tokens must be a single value > 0")

        num_draft_tokens = _single_value(self.num_draft_tokens)
        stream = _single_value(self.stream)
        # The result is not used; the call is kept so behaviour is
        # unchanged.
        _single_value(self.return_generation_logits)
        context_logits = _single_value(self.return_context_logits)

        # A non-zero draft token count means speculative decoding.
        if num_draft_tokens:
            _validate_that(
                not stream,
                "streaming is not supported with speculative decoding")
            _validate_that(
                not context_logits,
                "context logits are not supported with speculative decoding")
|
105 |
+
|
106 |
+
|
107 |
+
@dataclass
class DraftRequest:
    """Optional draft tokens and draft logits accompanying a generation call.

    Both fields default to None, meaning no draft data is supplied.
    """
    # Draft token ids.
    draft_input_ids: Optional[np.ndarray] = None
    # Logits that belong to the draft tokens.
    draft_logits: Optional[np.ndarray] = None
|
111 |
+
|
112 |
+
|
113 |
+
@dataclass
class PreprocResponse:
    """Preprocessed inputs that are passed on to generation."""
    # Fresh empty array per instance. A plain `np.array([])` default
    # would be shared by all instances and is rejected by `dataclass` on
    # Python 3.11+ (unhashable default).
    input_ids: np.ndarray = field(default_factory=lambda: np.array([]))
    decoder_input_ids: Optional[np.ndarray] = None
    input_lengths: np.ndarray = field(default_factory=lambda: np.array([]))
    decoder_input_lengths: Optional[np.ndarray] = None
    bad_words_list: Optional[np.ndarray] = None
    stop_words_list: Optional[np.ndarray] = None
    embedding_bias: Optional[np.ndarray] = None
    end_id: Optional[np.ndarray] = None
    pad_id: Optional[np.ndarray] = None

    @classmethod
    def with_new_inputs(cls,
                        other,
                        input_ids: Optional[np.ndarray] = None,
                        input_lengths: Optional[np.ndarray] = None):
        """Return a copy of `other` with its inputs optionally replaced.

        Parameters
        ----------
        other : PreprocResponse
          Instance whose remaining fields are carried over unchanged.
        input_ids : Optional[np.ndarray]
          Replacement for `other.input_ids`; kept as-is when None.
        input_lengths : Optional[np.ndarray]
          Replacement for `other.input_lengths`; kept as-is when None.

        Returns
        -------
        PreprocResponse
          A new instance; the field arrays themselves are not copied.
        """
        return cls(
            input_ids=(input_ids
                       if input_ids is not None else other.input_ids),
            input_lengths=(input_lengths if input_lengths is not None else
                           other.input_lengths),
            decoder_input_ids=other.decoder_input_ids,
            decoder_input_lengths=other.decoder_input_lengths,
            bad_words_list=other.bad_words_list,
            stop_words_list=other.stop_words_list,
            # Carried over like every other non-input field; it was
            # previously dropped, silently resetting the bias to None.
            embedding_bias=other.embedding_bias,
            end_id=other.end_id,
            pad_id=other.pad_id,
        )
|
142 |
+
|
143 |
+
|
144 |
+
@dataclass
class GenerationResponse:
    """Outputs of a generation call, before postprocessing."""
    # Fresh empty array per instance. A plain `np.array([])` default
    # would be shared by all instances and is rejected by `dataclass` on
    # Python 3.11+ (unhashable default).
    output_ids: np.ndarray = field(default_factory=lambda: np.array([]))
    sequence_length: np.ndarray = field(
        default_factory=lambda: np.array([]))
    cum_log_probs: Optional[np.ndarray] = None
    output_log_probs: Optional[np.ndarray] = None
    context_logits: Optional[np.ndarray] = None
    generation_logits: Optional[np.ndarray] = None
|
152 |
+
|
153 |
+
|
154 |
+
@dataclass
class Response:
    """Postprocessed result that is yielded to the caller of `decode`."""
    # Fresh empty array per instance. A plain `np.array([])` default
    # would be shared by all instances and is rejected by `dataclass` on
    # Python 3.11+ (unhashable default).
    text_output: np.ndarray = field(default_factory=lambda: np.array([]))
    cum_log_probs: Optional[np.ndarray] = None
    output_log_probs: Optional[np.ndarray] = None
    context_logits: Optional[np.ndarray] = None
    generation_logits: Optional[np.ndarray] = None

    def __eq__(self, o) -> bool:
        """Compare all fields element-wise. Just for testing."""
        if not isinstance(o, Response):
            return False
        # np.array_equal is used because `==` on arrays is element-wise
        # and does not yield a single bool.
        return (np.array_equal(self.text_output, o.text_output)
                and np.array_equal(self.cum_log_probs, o.cum_log_probs)
                and np.array_equal(self.output_log_probs, o.output_log_probs)
                and np.array_equal(self.context_logits, o.context_logits)
                and np.array_equal(self.generation_logits,
                                   o.generation_logits))
|
171 |
+
|
172 |
+
|
173 |
+
class Decoder:
|
174 |
+
|
175 |
+
def __init__(self, streaming=False, accumulate=False):
    """Store the streaming and accumulation settings.

    Parameters
    ----------
    streaming : bool
      Stored as `self._streaming`; `decode` uses it to choose between
      the streaming and non-streaming generation paths.
    accumulate : bool
      Stored as `self._accumulate`.
    """
    # No tokens have been accumulated yet.
    self._accumulated_tokens = None
    self._accumulate = accumulate
    self._streaming = streaming
|
180 |
+
|
181 |
+
def decode(self,
           request: Request,
           speculative_decoding=False) -> Generator[Response, None, None]:
    """Preprocess `request`, generate, and yield postprocessed responses.

    With `speculative_decoding` set, responses come from
    `_spec_generate`. Otherwise a single response from
    `_generate_non_streaming` is yielded unless `self._streaming` is
    set, in which case every response from `_generate` is yielded.
    Each generation response goes through `postprocess` before being
    yielded.
    """
    preproc_response = self.preprocess(request)

    if speculative_decoding:
        gen_responses = self._spec_generate(preproc_response, request)
    elif self._streaming:
        gen_responses = self._generate(preproc_response, request)
    else:
        # Wrap the single non-streaming result so all three paths share
        # the loop below.
        gen_responses = (self._generate_non_streaming(
            preproc_response, request), )

    for gen_response in gen_responses:
        yield self.postprocess(gen_response)
|
197 |
+
|
198 |
+
def encountered_stop_words(self, input_ids, stop_words_ids):
    """Return True if `input_ids` ends with any of the stop word sequences.

    Each entry of `stop_words_ids` is compared against the trailing
    `len(entry)` elements of `input_ids`.
    """
    return any(
        np.array_equal(input_ids[-len(candidate):], candidate)
        for candidate in stop_words_ids)
|
203 |
+
|
204 |
+
def _spec_generate(
|
205 |
+
self, preproc: PreprocResponse,
|
206 |
+
request: Request) -> Generator[GenerationResponse, None, None]:
|
207 |
+
|
208 |
+
prompt_input_ids: np.ndarray = preproc.input_ids[0]
|
209 |
+
input_ids: np.ndarray = prompt_input_ids
|
210 |
+
output_len: int = request.max_tokens[0][0]
|
211 |
+
last_input_ids: np.ndarray = None
|
212 |
+
draft_output_ids: np.ndarray = None
|
213 |
+
draft_logits: np.ndarray = None
|
214 |
+
|
215 |
+
target_response: GenerationResponse = None
|
216 |
+
|
217 |
+
cur_preproc = preproc
|
218 |
+
|
219 |
+
counter = 0
|
220 |
+
while True:
|
221 |
+
counter += 1
|
222 |
+
num_draft_tokens = min(
|
223 |
+
request.num_draft_tokens[0][0],
|
224 |
+
len(prompt_input_ids) + output_len - len(input_ids) - 1)
|
225 |
+
|
226 |
+
draft_request = None
|
227 |
+
if num_draft_tokens > 0:
|
228 |
+
draft_response: GenerationResponse = self._draft_generate_non_streaming(
|
229 |
+
cur_preproc, request, num_draft_tokens)
|
230 |
+
seq_len: int = draft_response.sequence_length[0][0]
|
231 |
+
# [1, beamWidth, outputLength] -> [outputLen]
|
232 |
+
draft_output_ids = draft_response.output_ids[0][0]
|
233 |
+
# [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded]
|
234 |
+
if request.use_draft_logits is not None and request.use_draft_logits[
|
235 |
+
0]:
|
236 |
+
if draft_response.generation_logits is not None:
|
237 |
+
draft_logits = draft_response.generation_logits[0][0]
|
238 |
+
|
239 |
+
input_draft_tokens = draft_output_ids[len(input_ids):seq_len]
|
240 |
+
draft_request = DraftRequest(
|
241 |
+
draft_input_ids=np.expand_dims(input_draft_tokens, 0))
|
242 |
+
if request.use_draft_logits is not None and request.use_draft_logits[
|
243 |
+
0]:
|
244 |
+
draft_request.draft_logits = np.expand_dims(
|
245 |
+
draft_logits[-len(input_draft_tokens):], 0)
|
246 |
+
else:
|
247 |
+
draft_request = DraftRequest()
|
248 |
+
target_response = self._generate_non_streaming(
|
249 |
+
cur_preproc, request, draft_request)
|
250 |
+
last_input_ids = input_ids
|
251 |
+
input_ids = target_response.output_ids[0][0]
|
252 |
+
cur_preproc = PreprocResponse.with_new_inputs(
|
253 |
+
cur_preproc, np.expand_dims(input_ids, 0),
|
254 |
+
np.array([[len(input_ids)]], dtype=np.int32))
|
255 |
+
|
256 |
+
# Evaluate criteria to stop generation loop.
|
257 |
+
# If we've hit or exceeded the max output length, should stop
|
258 |
+
length_stop = (len(input_ids) >=
|
259 |
+
len(prompt_input_ids) + output_len)
|
260 |
+
if length_stop:
|
261 |
+
break
|
262 |
+
# If draft and target have same outputs, should stop. Normally target should return 1 more token.
|
263 |
+
# If they are the same length, they should differ at the last token
|
264 |
+
target_draft_equal = draft_output_ids is not None and np.array_equal(
|
265 |
+
draft_output_ids, input_ids)
|
266 |
+
if target_draft_equal:
|
267 |
+
break
|
268 |
+
# If tokens no longer change, should stop, means we have hit early stopping
|
269 |
+
last_current_equal = np.array_equal(last_input_ids, input_ids)
|
270 |
+
if last_current_equal:
|
271 |
+
break
|
272 |
+
# Need to check if stop words was encountered
|
273 |
+
hit_stop_words = self.encountered_stop_words(
|
274 |
+
input_ids, preproc.stop_words_list[0])
|
275 |
+
if hit_stop_words:
|
276 |
+
break
|
277 |
+
|
278 |
+
yield target_response
|
279 |
+
|
280 |
+
def _draft_generate_non_streaming(
|
281 |
+
self, preproc: PreprocResponse, request: Request,
|
282 |
+
num_draft_tokens: int) -> GenerationResponse:
|
283 |
+
raise NotImplementedError()
|
284 |
+
|
285 |
+
def _generate(
|
286 |
+
self,
|
287 |
+
preproc: PreprocResponse,
|
288 |
+
request: Request,
|
289 |
+
draft_request: Optional[DraftRequest] = None
|
290 |
+
) -> Generator[GenerationResponse, None, None]:
|
291 |
+
raise NotImplementedError()
|
292 |
+
|
293 |
+
def _generate_non_streaming(
|
294 |
+
self,
|
295 |
+
preproc: PreprocResponse,
|
296 |
+
request: Request,
|
297 |
+
draft_request: Optional[DraftRequest] = None
|
298 |
+
) -> GenerationResponse:
|
299 |
+
raise NotImplementedError()
|
300 |
+
|
301 |
+
def postprocess(self, gen_response: GenerationResponse) -> Response:
|
302 |
+
if self._accumulate and self._streaming:
|
303 |
+
new_tokens: np.ndarray = gen_response.output_ids
|
304 |
+
if new_tokens.ndim != 3:
|
305 |
+
raise Exception("Expected output_ids tensor to have 3 dims.")
|
306 |
+
if new_tokens.shape[0] != 1:
|
307 |
+
raise Exception("Expected batch size of 1")
|
308 |
+
if new_tokens.shape[1] != 1:
|
309 |
+
raise Exception(
|
310 |
+
"Accumulation of tokens is only implemented for beam width = 1"
|
311 |
+
)
|
312 |
+
|
313 |
+
self._accumulated_tokens = new_tokens if (
|
314 |
+
self._accumulated_tokens is None) else np.concatenate(
|
315 |
+
(self._accumulated_tokens, new_tokens), axis=2)
|
316 |
+
sequence_lengths = np.array([[self._accumulated_tokens.shape[2]]],
|
317 |
+
dtype=np.int32)
|
318 |
+
return self._postprocess(self._accumulated_tokens,
|
319 |
+
sequence_lengths, gen_response)
|
320 |
+
else:
|
321 |
+
return self._postprocess(gen_response.output_ids, None,
|
322 |
+
gen_response)
|
323 |
+
|
324 |
+
def _postprocess(self, tokens: np.ndarray,
|
325 |
+
sequence_lengths: Optional[np.ndarray],
|
326 |
+
gen_response: GenerationResponse) -> Response:
|
327 |
+
raise NotImplementedError()
|
328 |
+
|
329 |
+
def preprocess(self, request: Request) -> PreprocResponse:
|
330 |
+
raise NotImplementedError()
|
331 |
+
|
332 |
+
def reset_decoder(self):
|
333 |
+
self._accumulated_tokens = None
|
tensorrt_llm_bls/1/lib/triton_decoder.py
ADDED
@@ -0,0 +1,440 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
from collections.abc import Callable
|
28 |
+
from typing import Dict, Optional
|
29 |
+
|
30 |
+
import numpy as np
|
31 |
+
import triton_python_backend_utils as pb_utils
|
32 |
+
from lib.decode import *
|
33 |
+
from typing_extensions import override
|
34 |
+
|
35 |
+
|
36 |
+
class TritonDecoder(Decoder):
    """``Decoder`` whose stages are executed as Triton inference requests.

    Preprocessing, the LLM (and optional draft LLM) and postprocessing are
    addressed by model name. Tensors are translated to and from the plain
    request/response objects through per-stage name maps.
    """

    def __init__(self,
                 streaming=False,
                 accumulate=False,
                 preproc_model_name="preprocessing",
                 postproc_model_name="postprocessing",
                 llm_model_name="tensorrt_llm",
                 draft_llm_model_name: Optional[str] = None):
        """Record the model names and the tensor names used by each stage."""
        super().__init__(streaming=streaming, accumulate=accumulate)
        self.preproc_model_name = preproc_model_name
        self.postproc_model_name = postproc_model_name
        self.llm_model_name = llm_model_name
        self.draft_llm_model_name = draft_llm_model_name

        # Output tensors requested from the preprocessing model.
        self._preproc_outputs = [
            "INPUT_ID",
            "DECODER_INPUT_ID",
            "REQUEST_INPUT_LEN",
            "REQUEST_DECODER_INPUT_LEN",
            "BAD_WORDS_IDS",
            "STOP_WORDS_IDS",
            "EMBEDDING_BIAS",
            "OUT_PAD_ID",
            "OUT_END_ID",
        ]

        # Output tensors requested from the (draft) LLM.
        self._llm_outputs = [
            "output_ids",
            "sequence_length",
            "cum_log_probs",
            "output_log_probs",
            "context_logits",
            "generation_logits",
        ]

        # Output tensors requested from the postprocessing model.
        self._postproc_outputs = [
            "OUTPUT",
        ]

        # Input tensors read from an incoming request; each name is also
        # the attribute set on the ``Request`` object.
        self.input_names = [
            "text_input",
            "decoder_text_input",
            "max_tokens",
            "bad_words",
            "stop_words",
            "end_id",
            "pad_id",
            "top_k",
            "top_p",
            "temperature",
            "length_penalty",
            "repetition_penalty",
            "min_length",
            "presence_penalty",
            "frequency_penalty",
            "random_seed",
            "return_log_probs",
            "return_context_logits",
            "return_generation_logits",
            "beam_width",
            "stream",
            "prompt_embedding_table",
            "prompt_vocab_size",
            "embedding_bias_words",
            "embedding_bias_weights",
            "num_draft_tokens",
            "use_draft_logits",
        ]

        # Attributes that get a leading axis re-added when they are 1-D
        # (see ``__undo_reshape``).
        self.__undo_reshape_whitelist = {
            "max_tokens",
            "end_id",
            "pad_id",
            "top_k",
            "top_p",
            "temperature",
            "length_penalty",
            "repetition_penalty",
            "min_length",
            "presence_penalty",
            "frequency_penalty",
            "random_seed",
            "return_log_probs",
            "return_context_logits",
            "return_generation_logits",
            "beam_width",
            "stream",
            "prompt_vocab_size",
            "num_draft_tokens",
            "use_draft_logits",
        }

    def _exec_triton_request(self, request):
        """Execute ``request`` in decoupled mode and yield each response.

        Raises:
            pb_utils.TritonModelException: when a response carries an error.
        """
        for triton_response in request.exec(decoupled=True):
            if triton_response.has_error():
                raise pb_utils.TritonModelException(
                    triton_response.error().message())
            yield triton_response

    def _exec_triton_request_single(self, request):
        """Execute ``request`` in non-decoupled mode and return its response.

        Raises:
            pb_utils.TritonModelException: when the response carries an error.
        """
        triton_response = request.exec(decoupled=False)
        if triton_response.has_error():
            raise pb_utils.TritonModelException(
                triton_response.error().message())
        return triton_response

    def create_triton_response(self, response: Response):
        """Wrap a ``Response`` into a ``pb_utils.InferenceResponse``."""
        name_map = {
            "text_output": "text_output",
            "cum_log_probs": "cum_log_probs",
            "output_log_probs": "output_log_probs",
            "context_logits": "context_logits",
            "generation_logits": "generation_logits"
        }
        tensors = self.create_triton_tensors(response, name_map)
        return pb_utils.InferenceResponse(output_tensors=tensors)

    def convert_triton_request(self, triton_request) -> Request:
        """Copy every present input tensor onto a fresh ``Request``.

        Raises:
            AttributeError: when ``Request`` has no attribute for an input.
        """
        request = Request()
        for triton_name in self.input_names:
            tensor = pb_utils.get_input_tensor_by_name(triton_request,
                                                       triton_name)
            if tensor is None:
                # Input not supplied; keep the Request's existing value.
                continue
            target_name = triton_name
            if not hasattr(request, target_name):
                raise AttributeError(
                    f"Request has no attribute '{target_name}'")
            setattr(request, target_name, tensor.as_numpy())
        return request

    def convert_triton_response(self,
                                triton_response,
                                response_factory: Callable,
                                name_map=None):
        """Copy a Triton response's output tensors onto a new object.

        Args:
            triton_response: response whose ``output_tensors()`` are read.
            response_factory: zero-argument callable creating the target.
            name_map: optional tensor name -> attribute name mapping. When
                given (and non-empty), tensors missing from it or mapped to
                ``None`` are skipped. Without it, the tensor name is used
                as the attribute name.

        Raises:
            AttributeError: when the target lacks the resolved attribute.
        """
        response = response_factory()
        for tensor in triton_response.output_tensors():
            if tensor is None:
                continue
            triton_name = tensor.name()
            value = tensor.as_numpy()
            target_name = triton_name
            if name_map:
                if triton_name not in name_map:
                    continue
                target_name = name_map[triton_name]
            if target_name is None:
                # explicitly ignore this triton input
                continue
            if not hasattr(response, target_name):
                raise AttributeError(
                    f"response object has not attribute '{target_name}'")
            setattr(response, target_name, value)
        return response

    def __undo_reshape(self, x, name):
        """Add a leading axis to 1-D values of whitelisted attributes."""
        if name in self.__undo_reshape_whitelist and len(x.shape) == 1:
            # handle reshapes
            return np.expand_dims(x, 0)
        return x

    def create_triton_tensors(self, obj, name_map: dict):
        """Build ``pb_utils.Tensor`` objects from the attributes of ``obj``.

        ``name_map`` maps attribute name -> Triton tensor name. Entries
        mapped to ``None`` and attributes whose value is ``None`` are skipped.
        """
        tensors = []
        for name, triton_name in name_map.items():
            if triton_name is None:
                continue
            value = getattr(obj, name)
            if value is None:
                continue
            tensors.append(
                pb_utils.Tensor(triton_name, self.__undo_reshape(value, name)))
        return tensors

    @override
    def preprocess(self, request: Request) -> PreprocResponse:
        """Run the preprocessing model and return its ``PreprocResponse``."""
        input_tensors = self._get_preproc_tensors(request)
        triton_req = pb_utils.InferenceRequest(
            model_name=self.preproc_model_name,
            inputs=input_tensors,
            requested_output_names=self._preproc_outputs)
        triton_output = self._exec_triton_request_single(triton_req)
        return self._get_preproc_response(triton_output)

    def _get_preproc_tensors(self, request: Request):
        """Map request attributes to the preprocessing model's inputs."""
        name_map = {
            "text_input": "QUERY",
            "decoder_text_input": "DECODER_QUERY",
            "max_tokens": "REQUEST_OUTPUT_LEN",
            "bad_words": "BAD_WORDS_DICT",
            "stop_words": "STOP_WORDS_DICT",
            "embedding_bias_words": "EMBEDDING_BIAS_WORDS",
            "embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS",
            "pad_id": "PAD_ID",
            "end_id": "END_ID",
        }
        return self.create_triton_tensors(request, name_map)

    def _get_preproc_response(self, triton_output):
        """Map the preprocessing model's outputs onto a ``PreprocResponse``."""
        name_map = {
            "INPUT_ID": "input_ids",
            "DECODER_INPUT_ID": "decoder_input_ids",
            "REQUEST_INPUT_LEN": "input_lengths",
            "REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths",
            "BAD_WORDS_IDS": "bad_words_list",
            "STOP_WORDS_IDS": "stop_words_list",
            "EMBEDDING_BIAS": "embedding_bias",
            "OUT_PAD_ID": "pad_id",
            "OUT_END_ID": "end_id",
        }
        return self.convert_triton_response(triton_output, PreprocResponse,
                                            name_map)

    @override
    def _draft_generate_non_streaming(
            self, preproc: PreprocResponse, request: Request,
            num_draft_tokens: int) -> GenerationResponse:
        """Ask the draft LLM for ``num_draft_tokens`` tokens in one call."""
        input_tensors = self._get_llm_tensors(preproc, request,
                                              num_draft_tokens, None, True)
        triton_req = pb_utils.InferenceRequest(
            model_name=self.draft_llm_model_name,
            inputs=input_tensors,
            requested_output_names=self._llm_outputs)
        triton_response = self._exec_triton_request_single(triton_req)
        return self._get_llm_response(triton_response)

    @override
    def _generate(
        self,
        preproc: PreprocResponse,
        request: Request,
        draft_request: Optional[DraftRequest] = None
    ) -> Generator[GenerationResponse, None, None]:
        """Run the LLM in decoupled mode, yielding one response per chunk."""
        input_tensors = self._get_llm_tensors(preproc, request, None,
                                              draft_request)
        triton_req = pb_utils.InferenceRequest(
            model_name=self.llm_model_name,
            inputs=input_tensors,
            requested_output_names=self._llm_outputs)
        for triton_response in self._exec_triton_request(triton_req):
            yield self._get_llm_response(triton_response)

    @override
    def _generate_non_streaming(
        self,
        preproc: PreprocResponse,
        request: Request,
        draft_request: Optional[DraftRequest] = None
    ) -> GenerationResponse:
        """Run the LLM once and return its single response."""
        input_tensors = self._get_llm_tensors(preproc, request, None,
                                              draft_request)
        triton_req = pb_utils.InferenceRequest(
            model_name=self.llm_model_name,
            inputs=input_tensors,
            requested_output_names=self._llm_outputs)
        triton_response = self._exec_triton_request_single(triton_req)
        return self._get_llm_response(triton_response)

    def _get_llm_tensors(self,
                         preproc: PreprocResponse,
                         request: Request,
                         num_output_tokens: Optional[int] = None,
                         draft_request: Optional[DraftRequest] = None,
                         is_draft_model_request: bool = False):
        """Collect all LLM inputs: preprocessing outputs plus request options."""
        tensors = []
        tensors.extend(self._get_tensors_from_preproc(preproc))
        tensors.extend(
            self._get_llm_tensors_from_request(request, num_output_tokens,
                                               draft_request,
                                               is_draft_model_request))
        return tensors

    def _get_tensors_from_preproc(self, preproc: PreprocResponse):
        """Map ``PreprocResponse`` attributes to LLM input tensors."""
        name_map = {
            "input_ids": "input_ids",
            "decoder_input_ids": "decoder_input_ids",
            "input_lengths": "input_lengths",
            "bad_words_list": "bad_words_list",
            "stop_words_list": "stop_words_list",
            "embedding_bias": "embedding_bias",
            "pad_id": "pad_id",
            "end_id": "end_id",
        }
        return self.create_triton_tensors(preproc, name_map)

    def _get_llm_tensors_from_request(
            self,
            request: Request,
            num_output_tokens: Optional[int] = None,
            draft_request: Optional[DraftRequest] = None,
            is_draft_model_request: bool = False):
        """Build the LLM input tensors that come from the user request.

        Args:
            request: the converted user request.
            num_output_tokens: explicit output length; overrides everything
                else when given.
            draft_request: draft tokens/logits to pass to the target model.
            is_draft_model_request: True when the tensors are for the draft
                model.

        Raises:
            Exception: when no output length can be determined.
        """
        name_map: Dict[str, Optional[str]] = {
            "beam_width": "beam_width",
            "top_k": "runtime_top_k",
            "top_p": "runtime_top_p",
            # Accepted in ``input_names`` and in the reshape whitelist;
            # without this entry the caller's value is never forwarded.
            # NOTE(review): assumes the LLM model's input tensor is named
            # "temperature" - confirm against its config.pbtxt.
            "temperature": "temperature",
            "length_penalty": "len_penalty",
            "repetition_penalty": "repetition_penalty",
            "min_length": "min_length",
            "presence_penalty": "presence_penalty",
            "frequency_penalty": "frequency_penalty",
            "random_seed": "random_seed",
            "return_log_probs": "return_log_probs",
            "stream": "streaming",
            "prompt_embedding_table": "prompt_embedding_table",
            "prompt_vocab_size": "prompt_vocab_size",
        }
        tensors = self.create_triton_tensors(request, name_map)

        # Output length precedence: explicit argument, then the number of
        # draft tokens + 1, then the request's own max_tokens.
        # ``max_tokens`` is an ndarray, so test presence with ``is not
        # None`` rather than array truthiness.
        out_len = None
        if request.max_tokens is not None:
            out_len = request.max_tokens[0][0]
        if num_output_tokens is not None:
            out_len = num_output_tokens
        elif draft_request:
            if draft_request.draft_input_ids is not None:
                out_len = len(draft_request.draft_input_ids[0]) + 1
            else:
                out_len = 1

        if out_len is None:
            raise Exception("Could not determine request_output_len")
        tensors.append(
            pb_utils.Tensor("request_output_len",
                            np.array([[out_len]], dtype=np.int32)))

        if draft_request:
            if draft_request.draft_input_ids is not None:
                tensors.append(
                    pb_utils.Tensor("draft_input_ids",
                                    draft_request.draft_input_ids))
                if (draft_request.draft_logits is not None
                        and request.use_draft_logits is not None
                        and request.use_draft_logits[0]):
                    tensors.append(
                        pb_utils.Tensor("draft_logits",
                                        draft_request.draft_logits))

        # Logits are requested only when no draft request is attached: the
        # draft model returns generation logits iff use_draft_logits is
        # set, a plain request follows the user's return_* flags.
        return_context_logits = False
        return_generation_logits = False
        if draft_request is None:
            if is_draft_model_request:
                return_generation_logits = request.use_draft_logits[
                    0] if request.use_draft_logits is not None else False
            else:
                return_context_logits = request.return_context_logits[
                    0] if request.return_context_logits is not None else False
                return_generation_logits = request.return_generation_logits[
                    0] if request.return_generation_logits is not None else False

        tensors.append(
            pb_utils.Tensor("return_context_logits",
                            np.array([[return_context_logits]])))
        tensors.append(
            pb_utils.Tensor("return_generation_logits",
                            np.array([[return_generation_logits]])))
        return tensors

    def _get_llm_response(self, triton_output):
        """Map the LLM's output tensors onto a ``GenerationResponse``."""
        name_map = {
            "output_ids": "output_ids",
            "sequence_length": "sequence_length",
            "cum_log_probs": "cum_log_probs",
            "output_log_probs": "output_log_probs",
            "context_logits": "context_logits",
            "generation_logits": "generation_logits",
        }
        return self.convert_triton_response(triton_output, GenerationResponse,
                                            name_map)

    @override
    def _postprocess(self, tokens: np.ndarray,
                     sequence_lengths: Optional[np.ndarray],
                     gen_response: GenerationResponse) -> Response:
        """Run the postprocessing model on ``tokens`` and build the ``Response``."""
        input_tensors = self._get_postproc_tensors(tokens, sequence_lengths,
                                                   gen_response)
        triton_req = pb_utils.InferenceRequest(
            model_name=self.postproc_model_name,
            inputs=input_tensors,
            requested_output_names=self._postproc_outputs)
        triton_output = self._exec_triton_request_single(triton_req)
        return self._get_response(triton_output, gen_response)

    def _get_postproc_tensors(self, tokens: np.ndarray,
                              sequence_lengths: Optional[np.ndarray],
                              gen_response: GenerationResponse):
        """Build the postprocessing model's inputs.

        ``sequence_lengths`` takes precedence when supplied; otherwise the
        generation response's own ``sequence_length`` is used.
        """
        # ``sequence_lengths`` is an ndarray: compare against None instead
        # of relying on array truthiness, which raises for multi-element
        # arrays and treats a zero length as "not supplied".
        if sequence_lengths is None:
            sequence_lengths = gen_response.sequence_length
        return [
            pb_utils.Tensor("TOKENS_BATCH", tokens),
            pb_utils.Tensor("SEQUENCE_LENGTH", sequence_lengths),
        ]

    def _get_response(self, triton_output, gen_res: GenerationResponse):
        """Combine the postprocessed "OUTPUT" text with ``gen_res``'s scores."""
        outputs_by_name = {
            named_t.name(): named_t.as_numpy()
            for named_t in triton_output.output_tensors()
        }
        return Response(text_output=outputs_by_name["OUTPUT"],
                        cum_log_probs=gen_res.cum_log_probs,
                        output_log_probs=gen_res.output_log_probs,
                        context_logits=gen_res.context_logits,
                        generation_logits=gen_res.generation_logits)
|
tensorrt_llm_bls/1/model.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
import json
|
28 |
+
import traceback
|
29 |
+
|
30 |
+
import triton_python_backend_utils as pb_utils
|
31 |
+
from lib.triton_decoder import TritonDecoder
|
32 |
+
|
33 |
+
|
34 |
+
class TritonPythonModel:
    """Triton Python-backend model that delegates requests to a ``TritonDecoder``."""

    def initialize(self, args):
        """Read the model configuration and build the decoder.

        Parameters read from ``model_config['parameters']``:
        ``accumulate_tokens`` (truthy strings: true/yes/1/t),
        ``tensorrt_llm_model_name`` (default "tensorrt_llm") and
        ``tensorrt_llm_draft_model_name`` (default ``None``).
        """
        # Parse model configs
        model_config = json.loads(args['model_config'])

        params = model_config['parameters']

        accumulate_tokens_str = ''
        if 'accumulate_tokens' in params:
            accumulate_tokens_str = params['accumulate_tokens']['string_value']

        self.accumulate_tokens = accumulate_tokens_str.lower() in [
            'true', 'yes', '1', 't'
        ]

        self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
            model_config)

        self.logger = pb_utils.Logger

        self.llm_model_name = "tensorrt_llm"
        if "tensorrt_llm_model_name" in params:
            self.llm_model_name = params["tensorrt_llm_model_name"][
                "string_value"]
        self.draft_llm_model_name = None
        if "tensorrt_llm_draft_model_name" in params:
            self.draft_llm_model_name = params[
                "tensorrt_llm_draft_model_name"]["string_value"]

        self.decoder = TritonDecoder(
            streaming=self.decoupled,
            accumulate=self.accumulate_tokens,
            preproc_model_name="preprocessing",
            postproc_model_name="postprocessing",
            llm_model_name=self.llm_model_name,
            draft_llm_model_name=self.draft_llm_model_name)

    def execute(self, requests):
        """Decode every request in the batch.

        In decoupled mode each result is sent through the request's
        response sender, followed by the final flag, and ``None`` is
        returned. Otherwise the collected responses are returned as a list.
        A failure in one request is logged and reported as an error
        response; the remaining requests are still processed.
        """
        responses = []

        for request in requests:
            if self.decoupled:
                response_sender = request.get_response_sender()
            try:
                req = self.decoder.convert_triton_request(request)
                req.validate()
                speculative_decode = (req.num_draft_tokens is not None
                                      and req.num_draft_tokens[0][0] > 0)
                if speculative_decode and (self.draft_llm_model_name is None
                                           or self.draft_llm_model_name == ""):
                    raise Exception(
                        "cannot perform speculative decoding without draft model"
                    )
                res_gen = self.decoder.decode(
                    req, speculative_decoding=speculative_decode)

                for res in res_gen:
                    triton_response = self.decoder.create_triton_response(res)
                    if self.decoupled:
                        response_sender.send(triton_response)
                    else:
                        responses.append(triton_response)

                if self.decoupled:
                    response_sender.send(
                        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

            except Exception:
                # Format once so the log and the client see the same text.
                error_text = traceback.format_exc()
                self.logger.log_error(error_text)
                # If encountering an error, send a response with err msg
                error_response = pb_utils.InferenceResponse(
                    output_tensors=[], error=pb_utils.TritonError(error_text))

                if self.decoupled:
                    response_sender.send(error_response)
                    response_sender.send(
                        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
                else:
                    responses.append(error_response)

            # Drop accumulated tokens so the next request starts clean.
            self.decoder.reset_decoder()

        # Return only after the whole batch has been handled.
        if self.decoupled:
            return None
        assert len(responses) == len(requests)
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')
|
tensorrt_llm_bls/config.pbtxt
ADDED
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
# Python-backend model; batching is enabled with at most 16 requests per batch.
name: "tensorrt_llm_bls"
backend: "python"
max_batch_size: 16

# Decoupled transaction policy: the model sends its responses through a
# response sender and closes each request with a COMPLETE_FINAL flag
# (see the `self.decoupled` branches in tensorrt_llm_bls/1/model.py).
model_transaction_policy {
  decoupled: true
}
|
34 |
+
|
35 |
+
# Request tensors. Entries without `optional: true` (text_input, max_tokens)
# must be supplied with every request; all others may be omitted.
# `dims` excludes the batch dimension because max_batch_size > 0.
input [
  # --- Text inputs ---------------------------------------------------------
  {
    name: "text_input"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "decoder_text_input"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "max_tokens"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "bad_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "stop_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  # --- Optional per-request scalars (dims: [ 1 ]) --------------------------
  {
    name: "end_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "pad_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_k"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "length_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "min_length"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "presence_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "frequency_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    optional: true
  },
  # --- Optional flags selecting extra outputs ------------------------------
  # `reshape: { shape: [ ] }` removes the size-1 dimension, so the model
  # receives a tensor of shape [ batch ] instead of [ batch, 1 ].
  {
    name: "return_log_probs"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "return_context_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "return_generation_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  # NOTE(review): unlike the other BOOL inputs, `stream` has no reshape;
  # left as-is because the consuming code is not visible here.
  {
    name: "stream"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  # --- Optional prompt-embedding and embedding-bias inputs -----------------
  {
    name: "prompt_embedding_table"
    data_type: TYPE_FP16
    dims: [ -1, -1 ]
    optional: true
  },
  {
    name: "prompt_vocab_size"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "embedding_bias_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "embedding_bias_weights"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  # --- Optional speculative-decoding inputs --------------------------------
  # model.py enables speculative decoding when `req.num_draft_tokens[0][0]`
  # is > 0; presumably that field is populated from this input (confirm
  # against convert_triton_request).
  {
    name: "num_draft_tokens"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "use_draft_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  }
]
|
201 |
+
# Response tensors. `dims` excludes the batch dimension; every dimension is
# variable-length (-1).
output [
  {
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "cum_log_probs"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "output_log_probs"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "context_logits"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "generation_logits"
    data_type: TYPE_FP32
    dims: [ -1, -1, -1 ]
  }
]
|
228 |
+
|
229 |
+
# NOTE(review): this value was the unsubstituted template placeholder
# "${accumulate_tokens}". It is set to an explicit "false" here, which is
# presumably the intended default -- confirm against how model.py parses
# this parameter.
parameters: {
  key: "accumulate_tokens"
  value: {
    string_value: "false"
  }
}
|
235 |
+
# Name of the model the BLS script calls for generation.
# NOTE(review): this value was the unsubstituted template placeholder
# "${tensorrt_llm_model_name}". It now names the `tensorrt_llm` model
# directory shipped in this repository -- confirm it matches the `name`
# field in tensorrt_llm/config.pbtxt.
parameters: {
  key: "tensorrt_llm_model_name"
  value: {
    string_value: "tensorrt_llm"
  }
}
|
241 |
+
# Name of the draft model used for speculative decoding; empty means none.
# NOTE(review): this value was the unsubstituted template placeholder
# "${tensorrt_llm_draft_model_name}". model.py rejects speculative-decoding
# requests only when this name is None or "", so the literal placeholder
# would have bypassed that check even though no draft model is deployed.
parameters: {
  key: "tensorrt_llm_draft_model_name"
  value: {
    string_value: ""
  }
}
|
247 |
+
|
248 |
+
# A single CPU instance of this model.
instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
|