Devops-hestabit committed (verified)
Commit 61e6a6c · 1 Parent(s): 861b1a0

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tensorrt_llm/1/rank0.engine filter=lfs diff=lfs merge=lfs -text
+ tensorrt_llm/1/rank1.engine filter=lfs diff=lfs merge=lfs -text
ensemble/1/.tmp ADDED
File without changes
ensemble/config.pbtxt ADDED
@@ -0,0 +1,470 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "ensemble"
28
+ platform: "ensemble"
29
+ max_batch_size: 16
30
+ input [
31
+ {
32
+ name: "text_input"
33
+ data_type: TYPE_STRING
34
+ dims: [ -1 ]
35
+ },
36
+ {
37
+ name: "decoder_text_input"
38
+ data_type: TYPE_STRING
39
+ dims: [ -1 ]
40
+ optional: true
41
+ },
42
+ {
43
+ name: "max_tokens"
44
+ data_type: TYPE_INT32
45
+ dims: [ -1 ]
46
+ },
47
+ {
48
+ name: "bad_words"
49
+ data_type: TYPE_STRING
50
+ dims: [ -1 ]
51
+ optional: true
52
+ },
53
+ {
54
+ name: "stop_words"
55
+ data_type: TYPE_STRING
56
+ dims: [ -1 ]
57
+ optional: true
58
+ },
59
+ {
60
+ name: "end_id"
61
+ data_type: TYPE_INT32
62
+ dims: [ 1 ]
63
+ optional: true
64
+ },
65
+ {
66
+ name: "pad_id"
67
+ data_type: TYPE_INT32
68
+ dims: [ 1 ]
69
+ optional: true
70
+ },
71
+ {
72
+ name: "top_k"
73
+ data_type: TYPE_INT32
74
+ dims: [ 1 ]
75
+ optional: true
76
+ },
77
+ {
78
+ name: "top_p"
79
+ data_type: TYPE_FP32
80
+ dims: [ 1 ]
81
+ optional: true
82
+ },
83
+ {
84
+ name: "temperature"
85
+ data_type: TYPE_FP32
86
+ dims: [ 1 ]
87
+ optional: true
88
+ },
89
+ {
90
+ name: "length_penalty"
91
+ data_type: TYPE_FP32
92
+ dims: [ 1 ]
93
+ optional: true
94
+ },
95
+ {
96
+ name: "repetition_penalty"
97
+ data_type: TYPE_FP32
98
+ dims: [ 1 ]
99
+ optional: true
100
+ },
101
+ {
102
+ name: "min_length"
103
+ data_type: TYPE_INT32
104
+ dims: [ 1 ]
105
+ optional: true
106
+ },
107
+ {
108
+ name: "presence_penalty"
109
+ data_type: TYPE_FP32
110
+ dims: [ 1 ]
111
+ optional: true
112
+ },
113
+ {
114
+ name: "frequency_penalty"
115
+ data_type: TYPE_FP32
116
+ dims: [ 1 ]
117
+ optional: true
118
+ },
119
+ {
120
+ name: "random_seed"
121
+ data_type: TYPE_UINT64
122
+ dims: [ 1 ]
123
+ optional: true
124
+ },
125
+ {
126
+ name: "return_log_probs"
127
+ data_type: TYPE_BOOL
128
+ dims: [ 1 ]
129
+ optional: true
130
+ },
131
+ {
132
+ name: "return_context_logits"
133
+ data_type: TYPE_BOOL
134
+ dims: [ 1 ]
135
+ optional: true
136
+ },
137
+ {
138
+ name: "return_generation_logits"
139
+ data_type: TYPE_BOOL
140
+ dims: [ 1 ]
141
+ optional: true
142
+ },
143
+ {
144
+ name: "beam_width"
145
+ data_type: TYPE_INT32
146
+ dims: [ 1 ]
147
+ optional: true
148
+ },
149
+ {
150
+ name: "stream"
151
+ data_type: TYPE_BOOL
152
+ dims: [ 1 ]
153
+ optional: true
154
+ },
155
+ {
156
+ name: "prompt_embedding_table"
157
+ data_type: TYPE_FP16
158
+ dims: [ -1, -1 ]
159
+ optional: true
160
+ },
161
+ {
162
+ name: "prompt_vocab_size"
163
+ data_type: TYPE_INT32
164
+ dims: [ 1 ]
165
+ optional: true
166
+ },
167
+ {
168
+ name: "embedding_bias_words"
169
+ data_type: TYPE_STRING
170
+ dims: [ -1 ]
171
+ optional: true
172
+ },
173
+ {
174
+ name: "embedding_bias_weights"
175
+ data_type: TYPE_FP32
176
+ dims: [ -1 ]
177
+ optional: true
178
+ }
179
+ ]
180
+ output [
181
+ {
182
+ name: "text_output"
183
+ data_type: TYPE_STRING
184
+ dims: [ -1 ]
185
+ },
186
+ {
187
+ name: "cum_log_probs"
188
+ data_type: TYPE_FP32
189
+ dims: [ -1 ]
190
+ },
191
+ {
192
+ name: "output_log_probs"
193
+ data_type: TYPE_FP32
194
+ dims: [ -1, -1 ]
195
+ },
196
+ {
197
+ name: "context_logits"
198
+ data_type: TYPE_FP32
199
+ dims: [ -1, -1 ]
200
+ },
201
+ {
202
+ name: "generation_logits"
203
+ data_type: TYPE_FP32
204
+ dims: [ -1, -1, -1 ]
205
+ }
206
+ ]
207
+ ensemble_scheduling {
208
+ step [
209
+ {
210
+ model_name: "preprocessing"
211
+ model_version: -1
212
+ input_map {
213
+ key: "QUERY"
214
+ value: "text_input"
215
+ }
216
+ input_map {
217
+ key: "DECODER_QUERY"
218
+ value: "decoder_text_input"
219
+ }
220
+ input_map {
221
+ key: "REQUEST_OUTPUT_LEN"
222
+ value: "max_tokens"
223
+ }
224
+ input_map {
225
+ key: "BAD_WORDS_DICT"
226
+ value: "bad_words"
227
+ }
228
+ input_map {
229
+ key: "STOP_WORDS_DICT"
230
+ value: "stop_words"
231
+ }
232
+ input_map {
233
+ key: "EMBEDDING_BIAS_WORDS"
234
+ value: "embedding_bias_words"
235
+ }
236
+ input_map {
237
+ key: "EMBEDDING_BIAS_WEIGHTS"
238
+ value: "embedding_bias_weights"
239
+ }
240
+ input_map {
241
+ key: "END_ID"
242
+ value: "end_id"
243
+ }
244
+ input_map {
245
+ key: "PAD_ID"
246
+ value: "pad_id"
247
+ }
248
+ output_map {
249
+ key: "REQUEST_INPUT_LEN"
250
+ value: "_REQUEST_INPUT_LEN"
251
+ }
252
+ output_map {
253
+ key: "INPUT_ID"
254
+ value: "_INPUT_ID"
255
+ }
256
+ output_map {
257
+ key: "REQUEST_DECODER_INPUT_LEN"
258
+ value: "_REQUEST_DECODER_INPUT_LEN"
259
+ }
260
+ output_map {
261
+ key: "DECODER_INPUT_ID"
262
+ value: "_DECODER_INPUT_ID"
263
+ }
264
+ output_map {
265
+ key: "REQUEST_OUTPUT_LEN"
266
+ value: "_REQUEST_OUTPUT_LEN"
267
+ }
268
+ output_map {
269
+ key: "STOP_WORDS_IDS"
270
+ value: "_STOP_WORDS_IDS"
271
+ }
272
+ output_map {
273
+ key: "BAD_WORDS_IDS"
274
+ value: "_BAD_WORDS_IDS"
275
+ }
276
+ output_map {
277
+ key: "EMBEDDING_BIAS"
278
+ value: "_EMBEDDING_BIAS"
279
+ }
280
+ output_map {
281
+ key: "OUT_END_ID"
282
+ value: "_PREPROCESSOR_END_ID"
283
+ }
284
+ output_map {
285
+ key: "OUT_PAD_ID"
286
+ value: "_PREPROCESSOR_PAD_ID"
287
+ }
288
+ },
289
+ {
290
+ model_name: "tensorrt_llm"
291
+ model_version: -1
292
+ input_map {
293
+ key: "input_ids"
294
+ value: "_INPUT_ID"
295
+ }
296
+ input_map {
297
+ key: "decoder_input_ids"
298
+ value: "_DECODER_INPUT_ID"
299
+ }
300
+ input_map {
301
+ key: "input_lengths"
302
+ value: "_REQUEST_INPUT_LEN"
303
+ }
304
+ input_map {
305
+ key: "decoder_input_lengths"
306
+ value: "_REQUEST_DECODER_INPUT_LEN"
307
+ }
308
+ input_map {
309
+ key: "request_output_len"
310
+ value: "_REQUEST_OUTPUT_LEN"
311
+ }
312
+ input_map {
313
+ key: "end_id"
314
+ value: "_PREPROCESSOR_END_ID"
315
+ }
316
+ input_map {
317
+ key: "pad_id"
318
+ value: "_PREPROCESSOR_PAD_ID"
319
+ }
320
+ input_map {
321
+ key: "embedding_bias"
322
+ value: "_EMBEDDING_BIAS"
323
+ }
324
+ input_map {
325
+ key: "runtime_top_k"
326
+ value: "top_k"
327
+ }
328
+ input_map {
329
+ key: "runtime_top_p"
330
+ value: "top_p"
331
+ }
332
+ input_map {
333
+ key: "temperature"
334
+ value: "temperature"
335
+ }
336
+ input_map {
337
+ key: "len_penalty"
338
+ value: "length_penalty"
339
+ }
340
+ input_map {
341
+ key: "repetition_penalty"
342
+ value: "repetition_penalty"
343
+ }
344
+ input_map {
345
+ key: "min_length"
346
+ value: "min_length"
347
+ }
348
+ input_map {
349
+ key: "presence_penalty"
350
+ value: "presence_penalty"
351
+ }
352
+ input_map {
353
+ key: "frequency_penalty"
354
+ value: "frequency_penalty"
355
+ }
356
+ input_map {
357
+ key: "random_seed"
358
+ value: "random_seed"
359
+ }
360
+ input_map {
361
+ key: "return_log_probs"
362
+ value: "return_log_probs"
363
+ }
364
+ input_map {
365
+ key: "return_context_logits"
366
+ value: "return_context_logits"
367
+ }
368
+ input_map {
369
+ key: "return_generation_logits"
370
+ value: "return_generation_logits"
371
+ }
372
+ input_map {
373
+ key: "beam_width"
374
+ value: "beam_width"
375
+ }
376
+ input_map {
377
+ key: "streaming"
378
+ value: "stream"
379
+ }
380
+ input_map {
381
+ key: "prompt_embedding_table"
382
+ value: "prompt_embedding_table"
383
+ }
384
+ input_map {
385
+ key: "prompt_vocab_size"
386
+ value: "prompt_vocab_size"
387
+ }
388
+ input_map {
389
+ key: "stop_words_list"
390
+ value: "_STOP_WORDS_IDS"
391
+ }
392
+ input_map {
393
+ key: "bad_words_list"
394
+ value: "_BAD_WORDS_IDS"
395
+ }
396
+ output_map {
397
+ key: "output_ids"
398
+ value: "_TOKENS_BATCH"
399
+ }
400
+ output_map {
401
+ key: "sequence_length"
402
+ value: "_SEQUENCE_LENGTH"
403
+ },
404
+ output_map {
405
+ key: "cum_log_probs"
406
+ value: "_CUM_LOG_PROBS"
407
+ }
408
+ output_map {
409
+ key: "output_log_probs"
410
+ value: "_OUTPUT_LOG_PROBS"
411
+ },
412
+ output_map {
413
+ key: "context_logits"
414
+ value: "_CONTEXT_LOGITS"
415
+ },
416
+ output_map {
417
+ key: "generation_logits"
418
+ value: "_GENERATION_LOGITS"
419
+ }
420
+ },
421
+ {
422
+ model_name: "postprocessing"
423
+ model_version: -1
424
+ input_map {
425
+ key: "TOKENS_BATCH"
426
+ value: "_TOKENS_BATCH"
427
+ }
428
+ input_map {
429
+ key: "CUM_LOG_PROBS"
430
+ value: "_CUM_LOG_PROBS"
431
+ }
432
+ input_map {
433
+ key: "OUTPUT_LOG_PROBS"
434
+ value: "_OUTPUT_LOG_PROBS"
435
+ }
436
+ input_map {
437
+ key: "CONTEXT_LOGITS"
438
+ value: "_CONTEXT_LOGITS"
439
+ }
440
+ input_map {
441
+ key: "GENERATION_LOGITS"
442
+ value: "_GENERATION_LOGITS"
443
+ }
444
+ input_map {
445
+ key: "SEQUENCE_LENGTH"
446
+ value: "_SEQUENCE_LENGTH"
447
+ }
448
+ output_map {
449
+ key: "OUTPUT"
450
+ value: "text_output"
451
+ }
452
+ output_map {
453
+ key: "OUT_OUTPUT_LOG_PROBS"
454
+ value: "output_log_probs"
455
+ }
456
+ output_map {
457
+ key: "OUT_CUM_LOG_PROBS"
458
+ value: "cum_log_probs"
459
+ }
460
+ output_map {
461
+ key: "OUT_CONTEXT_LOGITS"
462
+ value: "context_logits"
463
+ }
464
+ output_map {
465
+ key: "OUT_GENERATION_LOGITS"
466
+ value: "generation_logits"
467
+ }
468
+ }
469
+ ]
470
+ }
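The ensemble above simply wires preprocessing, tensorrt_llm and postprocessing together, so a client only needs to supply text_input and max_tokens; all other inputs declared above are optional. A minimal client sketch, assuming Triton listens on localhost:8000 and this build exposes the generate endpoint; the prompt text is illustrative:

    import requests

    # Keys map onto the ensemble inputs declared above; everything else is optional.
    payload = {
        "text_input": "What is TensorRT-LLM?",
        "max_tokens": 64,
        "temperature": 0.7,
        "stream": False,
    }
    resp = requests.post(
        "http://localhost:8000/v2/models/ensemble/generate",
        json=payload, timeout=60)
    resp.raise_for_status()
    print(resp.json()["text_output"])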
postprocessing/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (5.33 kB).
 
postprocessing/1/model.py ADDED
@@ -0,0 +1,231 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import json
28
+
29
+ import numpy as np
30
+ import triton_python_backend_utils as pb_utils
31
+ from transformers import AutoTokenizer
32
+
33
+
34
+ class TritonPythonModel:
35
+ """Your Python model must use the same class name. Every Python model
36
+ that is created must have "TritonPythonModel" as the class name.
37
+ """
38
+
39
+ def initialize(self, args):
40
+ """`initialize` is called only once when the model is being loaded.
41
+ Implementing `initialize` function is optional. This function allows
42
+ the model to initialize any state associated with this model.
43
+ Parameters
44
+ ----------
45
+ args : dict
46
+ Both keys and values are strings. The dictionary keys and values are:
47
+ * model_config: A JSON string containing the model configuration
48
+ * model_instance_kind: A string containing model instance kind
49
+ * model_instance_device_id: A string containing model instance device ID
50
+ * model_repository: Model repository path
51
+ * model_version: Model version
52
+ * model_name: Model name
53
+ """
54
+ # Parse model configs
55
+ model_config = json.loads(args['model_config'])
56
+ tokenizer_dir = model_config['parameters']['tokenizer_dir'][
57
+ 'string_value']
58
+
59
+ skip_special_tokens = model_config['parameters'].get(
60
+ 'skip_special_tokens')
61
+ if skip_special_tokens is not None:
62
+ skip_special_tokens_str = skip_special_tokens[
63
+ 'string_value'].lower()
64
+ if skip_special_tokens_str in [
65
+ 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
66
+ ]:
67
+ self.skip_special_tokens = skip_special_tokens_str in [
68
+ 'true', '1', 't', 'y', 'yes'
69
+ ]
70
+ else:
71
+ print(
72
+ f"[TensorRT-LLM][WARNING] 'skip_special_tokens' is not set to a valid value (got {skip_special_tokens['string_value']}). Defaulting to True."
73
+ )
74
+ self.skip_special_tokens = True
75
+ else:
76
+ print(
77
+ "[TensorRT-LLM][WARNING] 'skip_special_tokens' is not set. Defaulting to True."
78
+ )
79
+ self.skip_special_tokens = True
80
+
81
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
82
+ legacy=False,
83
+ padding_side='left',
84
+ trust_remote_code=True)
85
+ if not self.tokenizer.pad_token:
86
+ self.tokenizer.pad_token = self.tokenizer.eos_token
87
+
88
+ # Parse model output configs
89
+ output_config = pb_utils.get_output_config_by_name(
90
+ model_config, "OUTPUT")
91
+
92
+ # Convert Triton types to numpy types
93
+ self.output_dtype = pb_utils.triton_string_to_numpy(
94
+ output_config['data_type'])
95
+
96
+ def execute(self, requests):
97
+ """`execute` must be implemented in every Python model. `execute`
98
+ function receives a list of pb_utils.InferenceRequest as the only
99
+ argument. This function is called when an inference is requested
100
+ for this model. Depending on the batching configuration (e.g. Dynamic
101
+ Batching) used, `requests` may contain multiple requests. Every
102
+ Python model must create one pb_utils.InferenceResponse for every
103
+ pb_utils.InferenceRequest in `requests`. If there is an error, you can
104
+ set the error argument when creating a pb_utils.InferenceResponse.
105
+ Parameters
106
+ ----------
107
+ requests : list
108
+ A list of pb_utils.InferenceRequest
109
+ Returns
110
+ -------
111
+ list
112
+ A list of pb_utils.InferenceResponse. The length of this list must
113
+ be the same as `requests`
114
+ """
115
+
116
+ responses = []
117
+
118
+ # Every Python backend must iterate over every one of the requests
119
+ # and create a pb_utils.InferenceResponse for each of them.
120
+ for idx, request in enumerate(requests):
121
+ # Get input tensors
122
+ tokens_batch = pb_utils.get_input_tensor_by_name(
123
+ request, 'TOKENS_BATCH').as_numpy()
124
+
125
+ # Get sequence length
126
+ sequence_lengths = pb_utils.get_input_tensor_by_name(
127
+ request, 'SEQUENCE_LENGTH').as_numpy()
128
+
129
+ # Get cum log probs
130
+ cum_log_probs = pb_utils.get_input_tensor_by_name(
131
+ request, 'CUM_LOG_PROBS')
132
+
133
+ # Get output log probs
134
+ output_log_probs = pb_utils.get_input_tensor_by_name(
135
+ request, 'OUTPUT_LOG_PROBS')
136
+
137
+ # Get context logits
138
+ context_logits = pb_utils.get_input_tensor_by_name(
139
+ request, 'CONTEXT_LOGITS')
140
+
141
+ # Get generation logits
142
+ generation_logits = pb_utils.get_input_tensor_by_name(
143
+ request, 'GENERATION_LOGITS')
144
+
145
+ # Reshape Input
146
+ # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]])
147
+ # tokens_batch = tokens_batch.T
148
+
149
+ # Postprocessing output data.
150
+ outputs = self._postprocessing(tokens_batch, sequence_lengths)
151
+
152
+ # Create output tensors. You need pb_utils.Tensor
153
+ # objects to create pb_utils.InferenceResponse.
154
+ output_tensor = pb_utils.Tensor(
155
+ 'OUTPUT',
156
+ np.array(outputs).astype(self.output_dtype))
157
+
158
+ outputs = []
159
+ outputs.append(output_tensor)
160
+
161
+ if cum_log_probs:
162
+ out_cum_log_probs = pb_utils.Tensor('OUT_CUM_LOG_PROBS',
163
+ cum_log_probs.as_numpy())
164
+ outputs.append(out_cum_log_probs)
165
+ else:
166
+ out_cum_log_probs = pb_utils.Tensor(
167
+ 'OUT_CUM_LOG_PROBS', np.array([[0.0]], dtype=np.float32))
168
+ outputs.append(out_cum_log_probs)
169
+
170
+ if output_log_probs:
171
+ out_output_log_probs = pb_utils.Tensor(
172
+ 'OUT_OUTPUT_LOG_PROBS', output_log_probs.as_numpy())
173
+ outputs.append(out_output_log_probs)
174
+ else:
175
+ out_output_log_probs = pb_utils.Tensor(
176
+ 'OUT_OUTPUT_LOG_PROBS',
177
+ np.array([[[0.0]]], dtype=np.float32))
178
+ outputs.append(out_output_log_probs)
179
+
180
+ if context_logits:
181
+ out_context_logits = pb_utils.Tensor('OUT_CONTEXT_LOGITS',
182
+ context_logits.as_numpy())
183
+ outputs.append(out_context_logits)
184
+ else:
185
+ out_context_logits = pb_utils.Tensor(
186
+ 'OUT_CONTEXT_LOGITS', np.array([[[0.0]]],
187
+ dtype=np.float32))
188
+ outputs.append(out_context_logits)
189
+
190
+ if generation_logits:
191
+ out_generation_logits = pb_utils.Tensor(
192
+ 'OUT_GENERATION_LOGITS', generation_logits.as_numpy())
193
+ outputs.append(out_generation_logits)
194
+ else:
195
+ out_generation_logits = pb_utils.Tensor(
196
+ 'OUT_GENERATION_LOGITS',
197
+ np.array([[[[0.0]]]], dtype=np.float32))
198
+ outputs.append(out_generation_logits)
199
+
200
+ # Create InferenceResponse. You can set an error here in case
201
+ # there was a problem with handling this inference request.
202
+ # Below is an example of how you can set errors in inference
203
+ # response:
204
+ #
205
+ # pb_utils.InferenceResponse(
206
+ # output_tensors=..., TritonError("An error occurred"))
207
+ inference_response = pb_utils.InferenceResponse(
208
+ output_tensors=outputs)
209
+ responses.append(inference_response)
210
+
211
+ # You should return a list of pb_utils.InferenceResponse. Length
212
+ # of this list must match the length of `requests` list.
213
+ return responses
214
+
215
+ def finalize(self):
216
+ """`finalize` is called only once when the model is being unloaded.
217
+ Implementing `finalize` function is optional. This function allows
218
+ the model to perform any necessary clean ups before exit.
219
+ """
220
+ print('Cleaning up...')
221
+
222
+ def _postprocessing(self, tokens_batch, sequence_lengths):
223
+ outputs = []
224
+ for batch_idx, beam_tokens in enumerate(tokens_batch):
225
+ for beam_idx, tokens in enumerate(beam_tokens):
226
+ seq_len = sequence_lengths[batch_idx][beam_idx]
227
+ output = self.tokenizer.decode(
228
+ tokens[:seq_len],
229
+ skip_special_tokens=self.skip_special_tokens)
230
+ outputs.append(output.encode('utf8'))
231
+ return outputs
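The _postprocessing helper above just truncates each beam to its reported sequence length and decodes it with the Hugging Face tokenizer. A standalone sketch of that decode step for experimenting outside Triton; the tokenizer path and token ids are placeholders:

    import numpy as np
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("postprocessing/1", legacy=False)

    tokens_batch = np.array([[[1, 4321, 2323, 2]]])   # [batch, beam, num_tokens]
    sequence_lengths = np.array([[4]])                # [batch, beam]

    for batch_idx, beam_tokens in enumerate(tokens_batch):
        for beam_idx, tokens in enumerate(beam_tokens):
            seq_len = sequence_lengths[batch_idx][beam_idx]
            # Same call the model makes; skip_special_tokens defaults to True here.
            print(tokenizer.decode(tokens[:seq_len], skip_special_tokens=True))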
postprocessing/1/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "unk_token": "<unk>"
5
+ }
postprocessing/1/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
postprocessing/1/tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
postprocessing/1/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "additional_special_tokens": [],
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "legacy": true,
35
+ "model_max_length": 1000000000000000019884624838656,
36
+ "pad_token": null,
37
+ "sp_model_kwargs": {},
38
+ "spaces_between_special_tokens": false,
39
+ "tokenizer_class": "LlamaTokenizer",
40
+ "unk_token": "<unk>",
41
+ "use_default_system_prompt": false,
42
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
43
+ }
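The chat_template above is the Mistral/Llama-2 style [INST] ... [/INST] format. A short sketch of rendering a prompt with it through transformers, assuming the tokenizer files shipped in this folder:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("postprocessing/1")  # illustrative path
    messages = [{"role": "user", "content": "Summarize TensorRT-LLM in one sentence."}]
    prompt = tok.apply_chat_template(messages, tokenize=False)
    # -> "<s>[INST] Summarize TensorRT-LLM in one sentence. [/INST]"
    print(prompt)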
postprocessing/config.pbtxt ADDED
@@ -0,0 +1,113 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "postprocessing"
28
+ backend: "python"
29
+ max_batch_size: 16
30
+ input [
31
+ {
32
+ name: "TOKENS_BATCH"
33
+ data_type: TYPE_INT32
34
+ dims: [ -1, -1 ]
35
+ },
36
+ {
37
+ name: "SEQUENCE_LENGTH"
38
+ data_type: TYPE_INT32
39
+ dims: [ -1 ]
40
+ },
41
+ {
42
+ name: "CUM_LOG_PROBS"
43
+ data_type: TYPE_FP32
44
+ dims: [ -1 ]
45
+ optional: true
46
+ },
47
+ {
48
+ name: "OUTPUT_LOG_PROBS"
49
+ data_type: TYPE_FP32
50
+ dims: [ -1, -1 ]
51
+ optional: true
52
+ },
53
+ {
54
+ name: "CONTEXT_LOGITS"
55
+ data_type: TYPE_FP32
56
+ dims: [ -1, -1 ]
57
+ optional: true
58
+ },
59
+ {
60
+ name: "GENERATION_LOGITS"
61
+ data_type: TYPE_FP32
62
+ dims: [ -1, -1, -1 ]
63
+ optional: true
64
+ }
65
+ ]
66
+ output [
67
+ {
68
+ name: "OUTPUT"
69
+ data_type: TYPE_STRING
70
+ dims: [ -1 ]
71
+ },
72
+ {
73
+ name: "OUT_CUM_LOG_PROBS"
74
+ data_type: TYPE_FP32
75
+ dims: [ -1 ]
76
+ },
77
+ {
78
+ name: "OUT_OUTPUT_LOG_PROBS"
79
+ data_type: TYPE_FP32
80
+ dims: [ -1, -1 ]
81
+ },
82
+ {
83
+ name: "OUT_CONTEXT_LOGITS"
84
+ data_type: TYPE_FP32
85
+ dims: [ -1, -1 ]
86
+ },
87
+ {
88
+ name: "OUT_GENERATION_LOGITS"
89
+ data_type: TYPE_FP32
90
+ dims: [ -1, -1, -1 ]
91
+ }
92
+ ]
93
+
94
+ parameters {
95
+ key: "tokenizer_dir"
96
+ value: {
97
+ string_value: "/all_models/inflight_batcher_llm/postprocessing/1"
98
+ }
99
+ }
100
+
101
+ parameters {
102
+ key: "skip_special_tokens"
103
+ value: {
104
+ string_value: "${skip_special_tokens}"
105
+ }
106
+ }
107
+
108
+ instance_group [
109
+ {
110
+ count: 1
111
+ kind: KIND_CPU
112
+ }
113
+ ]
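Note that skip_special_tokens is still the literal placeholder ${skip_special_tokens}; the Python model treats such a value as an invalid boolean and falls back to True. The placeholder is normally filled before deployment (the TensorRT-LLM backend ships a fill_template.py tool for this); an equivalent one-liner sketch, with the path and value purely illustrative:

    from pathlib import Path

    # Substitute the template variable in place; "true"/"false" are both accepted.
    cfg = Path("postprocessing/config.pbtxt")
    cfg.write_text(cfg.read_text().replace("${skip_special_tokens}", "true"))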
preprocessing/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (9.56 kB).
 
preprocessing/1/model.py ADDED
@@ -0,0 +1,373 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import json
28
+ from typing import List
29
+
30
+ import numpy as np
31
+ import triton_python_backend_utils as pb_utils
32
+ from transformers import AutoTokenizer, T5Tokenizer
33
+
34
+
35
+ class TritonPythonModel:
36
+ """Your Python model must use the same class name. Every Python model
37
+ that is created must have "TritonPythonModel" as the class name.
38
+ """
39
+
40
+ def initialize(self, args):
41
+ """`initialize` is called only once when the model is being loaded.
42
+ Implementing `initialize` function is optional. This function allows
43
+ the model to initialize any state associated with this model.
44
+ Parameters
45
+ ----------
46
+ args : dict
47
+ Both keys and values are strings. The dictionary keys and values are:
48
+ * model_config: A JSON string containing the model configuration
49
+ * model_instance_kind: A string containing model instance kind
50
+ * model_instance_device_id: A string containing model instance device ID
51
+ * model_repository: Model repository path
52
+ * model_version: Model version
53
+ * model_name: Model name
54
+ """
55
+ # Parse model configs
56
+ model_config = json.loads(args['model_config'])
57
+ tokenizer_dir = model_config['parameters']['tokenizer_dir'][
58
+ 'string_value']
59
+
60
+ add_special_tokens = model_config['parameters'].get(
61
+ 'add_special_tokens')
62
+ if add_special_tokens is not None:
63
+ add_special_tokens_str = add_special_tokens['string_value'].lower()
64
+ if add_special_tokens_str in [
65
+ 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
66
+ ]:
67
+ self.add_special_tokens = add_special_tokens_str in [
68
+ 'true', '1', 't', 'y', 'yes'
69
+ ]
70
+ else:
71
+ print(
72
+ f"[TensorRT-LLM][WARNING] 'add_special_tokens' is not set to a valid value (got {add_special_tokens['string_value']}). Defaulting to True."
73
+ )
74
+ self.add_special_tokens = True
75
+ else:
76
+ print(
77
+ "[TensorRT-LLM][WARNING] 'add_special_tokens' is not set. Defaulting to True."
78
+ )
79
+ self.add_special_tokens = True
80
+
81
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
82
+ legacy=False,
83
+ padding_side='left',
84
+ trust_remote_code=True)
85
+ if isinstance(self.tokenizer, T5Tokenizer):
86
+ self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id()
87
+
88
+ if not self.tokenizer.pad_token:
89
+ self.tokenizer.pad_token = self.tokenizer.eos_token
90
+
91
+ self.tokenizer_end_id = self.tokenizer.encode(
92
+ self.tokenizer.eos_token, add_special_tokens=False)[0]
93
+ self.tokenizer_pad_id = self.tokenizer.encode(
94
+ self.tokenizer.pad_token, add_special_tokens=False)[0]
95
+
96
+ # Parse model output configs and convert Triton types to numpy types
97
+ output_names = [
98
+ "INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN",
99
+ "REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS",
100
+ "OUT_END_ID", "OUT_PAD_ID"
101
+ ]
102
+ input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"]
103
+ for input_name in input_names:
104
+ setattr(
105
+ self,
106
+ input_name.lower() + "_dtype",
107
+ pb_utils.triton_string_to_numpy(
108
+ pb_utils.get_input_config_by_name(
109
+ model_config, input_name)['data_type']))
110
+
111
+ for output_name in output_names:
112
+ setattr(
113
+ self,
114
+ output_name.lower() + "_dtype",
115
+ pb_utils.triton_string_to_numpy(
116
+ pb_utils.get_output_config_by_name(
117
+ model_config, output_name)['data_type']))
118
+
119
+ def execute(self, requests):
120
+ """`execute` must be implemented in every Python model. `execute`
121
+ function receives a list of pb_utils.InferenceRequest as the only
122
+ argument. This function is called when an inference is requested
123
+ for this model. Depending on the batching configuration (e.g. Dynamic
124
+ Batching) used, `requests` may contain multiple requests. Every
125
+ Python model must create one pb_utils.InferenceResponse for every
126
+ pb_utils.InferenceRequest in `requests`. If there is an error, you can
127
+ set the error argument when creating a pb_utils.InferenceResponse.
128
+ Parameters
129
+ ----------
130
+ requests : list
131
+ A list of pb_utils.InferenceRequest
132
+ Returns
133
+ -------
134
+ list
135
+ A list of pb_utils.InferenceResponse. The length of this list must
136
+ be the same as `requests`
137
+ """
138
+
139
+ responses = []
140
+
141
+ # Every Python backend must iterate over every one of the requests
142
+ # and create a pb_utils.InferenceResponse for each of them.
143
+ logger = pb_utils.Logger
144
+ for idx, request in enumerate(requests):
145
+ # Get input tensors
146
+ query = pb_utils.get_input_tensor_by_name(request,
147
+ 'QUERY').as_numpy()
148
+ decoder_query = pb_utils.get_input_tensor_by_name(
149
+ request, 'DECODER_QUERY')
150
+ if decoder_query is not None:
151
+ decoder_query = decoder_query.as_numpy()
152
+
153
+ batch_dim = query.shape[0]
154
+ if batch_dim != 1:
155
+
156
+ err_str = "Inflight batching backend expects requests with batch size of 1."
157
+ logger.log_error(err_str)
158
+ responses.append(
159
+ pb_utils.InferenceResponse(
160
+ output_tensors=[],
161
+ error=pb_utils.TritonError(err_str)))
162
+ continue
163
+
164
+ request_output_len = pb_utils.get_input_tensor_by_name(
165
+ request, 'REQUEST_OUTPUT_LEN').as_numpy()
166
+
167
+ bad_words_dict = pb_utils.get_input_tensor_by_name(
168
+ request, 'BAD_WORDS_DICT')
169
+ if bad_words_dict is not None:
170
+ bad_words_dict = bad_words_dict.as_numpy()
171
+
172
+ stop_words_dict = pb_utils.get_input_tensor_by_name(
173
+ request, 'STOP_WORDS_DICT')
174
+ if stop_words_dict is not None:
175
+ stop_words_dict = stop_words_dict.as_numpy()
176
+
177
+ embedding_bias_words = pb_utils.get_input_tensor_by_name(
178
+ request, 'EMBEDDING_BIAS_WORDS')
179
+ if embedding_bias_words is not None:
180
+ embedding_bias_words = embedding_bias_words.as_numpy()
181
+
182
+ embedding_bias_weights = pb_utils.get_input_tensor_by_name(
183
+ request, 'EMBEDDING_BIAS_WEIGHTS')
184
+ if embedding_bias_weights is not None:
185
+ embedding_bias_weights = embedding_bias_weights.as_numpy()
186
+
187
+ # Take the end_id from the input tensors
188
+ # If not specified, use tokenizer to get end_id
189
+ end_id = pb_utils.get_input_tensor_by_name(request, 'END_ID')
190
+ if end_id is not None:
191
+ end_id = end_id.as_numpy()
192
+ else:
193
+ end_id = [[self.tokenizer_end_id]]
194
+
195
+ # Take the pad_id from the input tensors
196
+ # If not specified, use tokenizer to get pad_id
197
+ pad_id = pb_utils.get_input_tensor_by_name(request, 'PAD_ID')
198
+ if pad_id is not None:
199
+ pad_id = pad_id.as_numpy()
200
+ else:
201
+ pad_id = [[self.tokenizer_pad_id]]
202
+
203
+ # Preprocessing input data.
204
+ input_id, request_input_len = self._create_request(query)
205
+ print(input_id)
206
+ print(request_input_len)
207
+ if decoder_query is not None:
208
+ decoder_input_id, request_decoder_input_len = self._create_request(
209
+ decoder_query)
210
+ else:
211
+ decoder_input_id = pad_id * np.ones((1, 1), np.int32)
212
+ request_decoder_input_len = 1 * np.ones((1, 1), np.int32)
213
+
214
+ bad_words = self._to_word_list_format(bad_words_dict)
215
+ stop_words = self._to_word_list_format(stop_words_dict)
216
+
217
+ embedding_bias = self._get_embedding_bias(
218
+ embedding_bias_words, embedding_bias_weights,
219
+ self.embedding_bias_weights_dtype)
220
+
221
+ # Create output tensors. You need pb_utils.Tensor
222
+ # objects to create pb_utils.InferenceResponse.
223
+ input_id_tensor = pb_utils.Tensor(
224
+ 'INPUT_ID', input_id.astype(self.input_id_dtype))
225
+ request_input_len_tensor = pb_utils.Tensor(
226
+ 'REQUEST_INPUT_LEN',
227
+ request_input_len.astype(self.request_input_len_dtype))
228
+ decoder_input_id_tensor = pb_utils.Tensor(
229
+ 'DECODER_INPUT_ID',
230
+ decoder_input_id.astype(self.decoder_input_id_dtype))
231
+ request_decoder_input_len_tensor = pb_utils.Tensor(
232
+ 'REQUEST_DECODER_INPUT_LEN',
233
+ request_decoder_input_len.astype(
234
+ self.request_decoder_input_len_dtype))
235
+ request_output_len_tensor = pb_utils.Tensor(
236
+ 'REQUEST_OUTPUT_LEN', request_output_len)
237
+ bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
238
+ stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS',
239
+ stop_words)
240
+ embedding_bias_tensor = pb_utils.Tensor('EMBEDDING_BIAS',
241
+ embedding_bias)
242
+ end_id_tensor = pb_utils.Tensor('OUT_END_ID',
243
+ np.array(end_id, dtype=np.int32))
244
+ pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID',
245
+ np.array(pad_id, dtype=np.int32))
246
+
247
+ inference_response = pb_utils.InferenceResponse(output_tensors=[
248
+ input_id_tensor, decoder_input_id_tensor, bad_words_ids_tensor,
249
+ stop_words_ids_tensor, request_input_len_tensor,
250
+ request_decoder_input_len_tensor, request_output_len_tensor,
251
+ embedding_bias_tensor, end_id_tensor, pad_id_tensor
252
+ ])
253
+ responses.append(inference_response)
254
+
255
+ # You should return a list of pb_utils.InferenceResponse. Length
256
+ # of this list must match the length of `requests` list.
257
+ return responses
258
+
259
+ def finalize(self):
260
+ """`finalize` is called only once when the model is being unloaded.
261
+ Implementing `finalize` function is optional. This function allows
262
+ the model to perform any necessary clean ups before exit.
263
+ """
264
+ print('Cleaning up...')
265
+
266
+ def _create_request(self, query):
267
+ """
268
+ query : batch string (2D numpy array)
269
+ """
270
+ if isinstance(self.tokenizer, T5Tokenizer):
271
+ start_ids = [
272
+ np.array([self.tokenizer_bos_id] + self.tokenizer.encode(
273
+ s[0].decode(), add_special_tokens=self.add_special_tokens)
274
+ ).astype(int) for s in query
275
+ ]
276
+ else:
277
+ start_ids = [
278
+ np.array(
279
+ self.tokenizer.encode(
280
+ s[0].decode(),
281
+ add_special_tokens=self.add_special_tokens)).astype(
282
+ int) for s in query
283
+ ]
284
+ start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int)
285
+
286
+ max_len = 0
287
+ for seq in start_ids:
288
+ max_len = max(max_len, seq.shape[0])
289
+ start_ids = np.stack([
290
+ np.pad(seq, (0, max_len - seq.shape[0]),
291
+ 'constant',
292
+ constant_values=(0, self.tokenizer_pad_id))
293
+ for seq in start_ids
294
+ ])
295
+
296
+ return start_ids, start_lengths
297
+
298
+ def _to_word_list_format(self, word_lists: List[List[str | bytes]]):
299
+ '''
300
+ word_lists format:
301
+ len(word_lists) == batch_size
302
+ word_lists[i] is the list of words associated with batch item i. A "word" may actually be any string, e.g. "lorem" or "lorem ipsum".
303
+ '''
304
+ assert self.tokenizer is not None, "need to set tokenizer"
305
+
306
+ if word_lists is None:
307
+ # Return an empty array of shape (1,2,0)
308
+ return np.empty([1, 2, 0], dtype="int32")
309
+
310
+ flat_ids = []
311
+ offsets = []
312
+ for word_list in word_lists:
313
+ item_flat_ids = []
314
+ item_offsets = []
315
+
316
+ for word in word_list:
317
+ if isinstance(word, bytes):
318
+ word = word.decode()
319
+
320
+ ids = self.tokenizer.encode(word, add_special_tokens=False)
321
+ if len(ids) == 0:
322
+ continue
323
+
324
+ item_flat_ids += ids
325
+ item_offsets.append(len(ids))
326
+
327
+ flat_ids.append(np.array(item_flat_ids))
328
+ offsets.append(np.cumsum(np.array(item_offsets)))
329
+
330
+ pad_to = max(1, max(len(ids) for ids in flat_ids))
331
+
332
+ for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
333
+ flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)),
334
+ constant_values=0)
335
+ offsets[i] = np.pad(offs, (0, pad_to - len(offs)),
336
+ constant_values=-1)
337
+
338
+ return np.array([flat_ids, offsets], dtype="int32").transpose(
339
+ (1, 0, 2))
340
+
341
+ def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights,
342
+ bias_dtype):
343
+
344
+ assert self.tokenizer is not None, "need to set tokenizer"
345
+
346
+ if embedding_bias_words is None or embedding_bias_weights is None:
347
+ return np.empty([1, 0], dtype=self.embedding_bias_weights_dtype)
348
+
349
+ batch_embedding_bias = []
350
+ for words, weights in zip(embedding_bias_words,
351
+ embedding_bias_weights):
352
+
353
+ vocab_size = self.tokenizer.vocab_size
354
+ embedding_bias = [0.] * vocab_size
355
+
356
+ assert len(words) == len(
357
+ weights
358
+ ), "Embedding bias words must have same dimension as embedding bias weights"
359
+
360
+ for word, weight in zip(words, weights):
361
+ if isinstance(word, bytes):
362
+ word = word.decode()
363
+ ids = self.tokenizer.encode(word)
364
+
365
+ if len(ids) == 0:
366
+ continue
367
+
368
+ for id in ids:
369
+ embedding_bias[id] += weight
370
+
371
+ batch_embedding_bias.append(np.array(embedding_bias))
372
+
373
+ return np.array(batch_embedding_bias, dtype=bias_dtype)
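_to_word_list_format packs the stop/bad word lists into the [batch, 2, max_len] layout the TensorRT-LLM runtime expects: row 0 holds the concatenated token ids of all words, row 1 the cumulative end offset of each word, padded with -1. A small worked example with made-up token ids:

    import numpy as np

    # Suppose one batch item with the words ["foo", "bar baz"], tokenizing to
    # [1763] and [2261, 12812] respectively (ids are made up for illustration).
    flat_ids = [1763, 2261, 12812]   # all ids concatenated
    offsets = [1, 3]                 # cumulative end offset of each word
    pad_to = max(1, len(flat_ids))

    row_ids = np.pad(flat_ids, (0, pad_to - len(flat_ids)), constant_values=0)
    row_offs = np.pad(offsets, (0, pad_to - len(offsets)), constant_values=-1)

    word_list = np.array([[row_ids, row_offs]], dtype="int32")
    print(word_list.shape)  # (1, 2, 3)
    print(word_list)        # [[[ 1763  2261 12812] [    1     3    -1]]]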
preprocessing/1/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "unk_token": "<unk>"
5
+ }
preprocessing/1/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
preprocessing/1/tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
preprocessing/1/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "additional_special_tokens": [],
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "legacy": true,
35
+ "model_max_length": 1000000000000000019884624838656,
36
+ "pad_token": null,
37
+ "sp_model_kwargs": {},
38
+ "spaces_between_special_tokens": false,
39
+ "tokenizer_class": "LlamaTokenizer",
40
+ "unk_token": "<unk>",
41
+ "use_default_system_prompt": false,
42
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
43
+ }
preprocessing/config.pbtxt ADDED
@@ -0,0 +1,156 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "preprocessing"
28
+ backend: "python"
29
+ max_batch_size: 16
30
+ input [
31
+ {
32
+ name: "QUERY"
33
+ data_type: TYPE_STRING
34
+ dims: [ -1 ]
35
+ },
36
+ {
37
+ name: "DECODER_QUERY"
38
+ data_type: TYPE_STRING
39
+ dims: [ -1 ]
40
+ optional: true
41
+ },
42
+ {
43
+ name: "REQUEST_OUTPUT_LEN"
44
+ data_type: TYPE_INT32
45
+ dims: [ -1 ]
46
+ },
47
+ {
48
+ name: "BAD_WORDS_DICT"
49
+ data_type: TYPE_STRING
50
+ dims: [ -1 ]
51
+ optional: true
52
+ },
53
+ {
54
+ name: "STOP_WORDS_DICT"
55
+ data_type: TYPE_STRING
56
+ dims: [ -1 ]
57
+ optional: true
58
+ },
59
+ {
60
+ name: "EMBEDDING_BIAS_WORDS"
61
+ data_type: TYPE_STRING
62
+ dims: [ -1 ]
63
+ optional: true
64
+ },
65
+ {
66
+ name: "EMBEDDING_BIAS_WEIGHTS"
67
+ data_type: TYPE_FP32
68
+ dims: [ -1 ]
69
+ optional: true
70
+ },
71
+ {
72
+ name: "END_ID"
73
+ data_type: TYPE_INT32
74
+ dims: [ -1 ]
75
+ optional: true
76
+ },
77
+ {
78
+ name: "PAD_ID"
79
+ data_type: TYPE_INT32
80
+ dims: [ -1 ]
81
+ optional: true
82
+ }
83
+ ]
84
+ output [
85
+ {
86
+ name: "INPUT_ID"
87
+ data_type: TYPE_INT32
88
+ dims: [ -1 ]
89
+ },
90
+ {
91
+ name: "REQUEST_INPUT_LEN"
92
+ data_type: TYPE_INT32
93
+ dims: [ 1 ]
94
+ },
95
+ {
96
+ name: "DECODER_INPUT_ID"
97
+ data_type: TYPE_INT32
98
+ dims: [ -1 ]
99
+ },
100
+ {
101
+ name: "REQUEST_DECODER_INPUT_LEN"
102
+ data_type: TYPE_INT32
103
+ dims: [ 1 ]
104
+ },
105
+ {
106
+ name: "BAD_WORDS_IDS"
107
+ data_type: TYPE_INT32
108
+ dims: [ 2, -1 ]
109
+ },
110
+ {
111
+ name: "STOP_WORDS_IDS"
112
+ data_type: TYPE_INT32
113
+ dims: [ 2, -1 ]
114
+ },
115
+ {
116
+ name: "EMBEDDING_BIAS"
117
+ data_type: TYPE_FP32
118
+ dims: [ -1 ]
119
+ },
120
+ {
121
+ name: "REQUEST_OUTPUT_LEN"
122
+ data_type: TYPE_INT32
123
+ dims: [ -1 ]
124
+ },
125
+ {
126
+ name: "OUT_END_ID"
127
+ data_type: TYPE_INT32
128
+ dims: [ -1 ]
129
+ },
130
+ {
131
+ name: "OUT_PAD_ID"
132
+ data_type: TYPE_INT32
133
+ dims: [ -1 ]
134
+ }
135
+ ]
136
+
137
+ parameters {
138
+ key: "tokenizer_dir"
139
+ value: {
140
+ string_value: "/all_models/inflight_batcher_llm/postprocessing/1"
141
+ }
142
+ }
143
+
144
+ parameters {
145
+ key: "add_special_tokens"
146
+ value: {
147
+ string_value: "${add_special_tokens}"
148
+ }
149
+ }
150
+
151
+ instance_group [
152
+ {
153
+ count: 1
154
+ kind: KIND_CPU
155
+ }
156
+ ]
tensorrt_llm/1/.gitkeep ADDED
File without changes
tensorrt_llm/1/config.json ADDED
@@ -0,0 +1,148 @@
1
+ {
2
+ "version": "0.11.0.dev2024062500",
3
+ "pretrained_config": {
4
+ "mlp_bias": false,
5
+ "attn_bias": false,
6
+ "rotary_base": 1000000.0,
7
+ "rotary_scaling": null,
8
+ "residual_mlp": false,
9
+ "disable_weight_only_quant_plugin": false,
10
+ "moe": {
11
+ "num_experts": 8,
12
+ "top_k": 2,
13
+ "normalization_mode": 1
14
+ },
15
+ "architecture": "LlamaForCausalLM",
16
+ "dtype": "float16",
17
+ "vocab_size": 32000,
18
+ "hidden_size": 4096,
19
+ "num_hidden_layers": 32,
20
+ "num_attention_heads": 32,
21
+ "hidden_act": "swiglu",
22
+ "logits_dtype": "float32",
23
+ "norm_epsilon": 1e-05,
24
+ "position_embedding_type": "rope_gpt_neox",
25
+ "max_position_embeddings": 32768,
26
+ "num_key_value_heads": 8,
27
+ "intermediate_size": 14336,
28
+ "mapping": {
29
+ "world_size": 1,
30
+ "gpus_per_node": 8,
31
+ "tp_size": 1,
32
+ "pp_size": 1,
33
+ "moe_tp_size": 1,
34
+ "moe_ep_size": 1
35
+ },
36
+ "quantization": {
37
+ "quant_algo": "W8A16",
38
+ "kv_cache_quant_algo": null,
39
+ "group_size": 128,
40
+ "smoothquant_val": null,
41
+ "has_zero_point": false,
42
+ "pre_quant_scale": false,
43
+ "exclude_modules": null
44
+ },
45
+ "use_parallel_embedding": false,
46
+ "embedding_sharding_dim": 0,
47
+ "share_embedding_table": false,
48
+ "head_size": 128,
49
+ "qk_layernorm": false
50
+ },
51
+ "build_config": {
52
+ "max_input_len": 28000,
53
+ "max_seq_len": 32000,
54
+ "opt_batch_size": null,
55
+ "max_batch_size": 16,
56
+ "max_beam_width": 1,
57
+ "max_num_tokens": 32000,
58
+ "opt_num_tokens": 16,
59
+ "max_prompt_embedding_table_size": 0,
60
+ "gather_context_logits": false,
61
+ "gather_generation_logits": false,
62
+ "strongly_typed": true,
63
+ "builder_opt": null,
64
+ "profiling_verbosity": "layer_names_only",
65
+ "enable_debug_output": false,
66
+ "max_draft_len": 0,
67
+ "speculative_decoding_mode": 1,
68
+ "use_refit": false,
69
+ "input_timing_cache": null,
70
+ "output_timing_cache": "model.cache",
71
+ "lora_config": {
72
+ "lora_dir": [],
73
+ "lora_ckpt_source": "hf",
74
+ "max_lora_rank": 64,
75
+ "lora_target_modules": [],
76
+ "trtllm_modules_to_hf_modules": {}
77
+ },
78
+ "auto_parallel_config": {
79
+ "world_size": 1,
80
+ "gpus_per_node": 8,
81
+ "cluster_key": "A100-SXM-80GB",
82
+ "cluster_info": null,
83
+ "sharding_cost_model": "alpha_beta",
84
+ "comm_cost_model": "alpha_beta",
85
+ "enable_pipeline_parallelism": false,
86
+ "enable_shard_unbalanced_shape": false,
87
+ "enable_shard_dynamic_shape": false,
88
+ "enable_reduce_scatter": true,
89
+ "builder_flags": null,
90
+ "debug_mode": false,
91
+ "infer_shape": true,
92
+ "validation_mode": false,
93
+ "same_buffer_io": {
94
+ "past_key_value_(\\d+)": "present_key_value_\\1"
95
+ },
96
+ "same_spec_io": {},
97
+ "sharded_io_allowlist": [
98
+ "past_key_value_\\d+",
99
+ "present_key_value_\\d*"
100
+ ],
101
+ "fill_weights": false,
102
+ "parallel_config_cache": null,
103
+ "profile_cache": null,
104
+ "dump_path": null,
105
+ "debug_outputs": []
106
+ },
107
+ "weight_sparsity": false,
108
+ "weight_streaming": false,
109
+ "plugin_config": {
110
+ "dtype": "float16",
111
+ "bert_attention_plugin": "auto",
112
+ "gpt_attention_plugin": "auto",
113
+ "gemm_plugin": "float16",
114
+ "gemm_swiglu_plugin": null,
115
+ "smooth_quant_gemm_plugin": null,
116
+ "identity_plugin": null,
117
+ "layernorm_quantization_plugin": null,
118
+ "rmsnorm_quantization_plugin": null,
119
+ "nccl_plugin": null,
120
+ "lookup_plugin": null,
121
+ "lora_plugin": null,
122
+ "weight_only_groupwise_quant_matmul_plugin": null,
123
+ "weight_only_quant_matmul_plugin": "float16",
124
+ "quantize_per_token_plugin": false,
125
+ "quantize_tensor_plugin": false,
126
+ "moe_plugin": "auto",
127
+ "mamba_conv1d_plugin": "auto",
128
+ "context_fmha": true,
129
+ "context_fmha_fp32_acc": false,
130
+ "paged_kv_cache": true,
131
+ "remove_input_padding": true,
132
+ "use_custom_all_reduce": true,
133
+ "reduce_fusion": false,
134
+ "multi_block_mode": false,
135
+ "enable_xqa": true,
136
+ "attention_qk_half_accumulation": false,
137
+ "tokens_per_block": 64,
138
+ "use_paged_context_fmha": false,
139
+ "use_fp8_context_fmha": false,
140
+ "multiple_profiles": false,
141
+ "paged_state": true,
142
+ "streamingllm": false
143
+ },
144
+ "use_strip_plan": false,
145
+ "max_encoder_input_len": 1024,
146
+ "use_fused_mlp": false
147
+ }
148
+ }
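The build_config above caps the engine at max_batch_size 16, max_input_len 28000 and max_seq_len 32000, with W8A16 weight-only quantization and a paged KV cache; requests that exceed these limits are rejected by the runtime. A small sketch that reads this config and checks a prospective request against the limits; the path and request sizes are illustrative:

    import json

    with open("tensorrt_llm/1/config.json") as f:
        build = json.load(f)["build_config"]

    prompt_len, new_tokens = 4096, 512
    assert prompt_len <= build["max_input_len"], "prompt too long for this engine"
    assert prompt_len + new_tokens <= build["max_seq_len"], "total length exceeds max_seq_len"
    print("request fits within the engine limits")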
tensorrt_llm/1/model.py ADDED
@@ -0,0 +1,782 @@
1
+ import datetime
2
+ import json
3
+ import os
4
+ import time
5
+ from threading import Lock, Thread
6
+
7
+ import numpy as np
8
+ import triton_python_backend_utils as pb_utils
9
+ from torch import from_numpy
10
+
11
+ import tensorrt_llm.bindings.executor as trtllm
12
+
13
+
14
+ def get_input_tensor_by_name(request, name):
15
+ tensor = pb_utils.get_input_tensor_by_name(request, name)
16
+ if tensor is None:
17
+ return None
18
+ return tensor.as_numpy()
19
+
20
+
21
+ def get_input_scalar_by_name(request, name):
22
+ tensor = get_input_tensor_by_name(request, name)
23
+ if tensor is None:
24
+ return None
25
+ if tensor.size != 1:
26
+ raise pb_utils.TritonModelException(
27
+ f"Expected a single value for {name}")
28
+ return tensor.item()
29
+
30
+
31
+ def read_parameter_as_type(value, name, pytype=str):
32
+ if value == "":
33
+ return None
34
+ if value.startswith("${") and value.endswith("}"):
35
+ return None
36
+ if pytype is bool:
37
+ return value.lower() in ["1", "true"]
38
+ try:
39
+ result = pytype(value)
40
+ return result
41
+ except Exception:
42
+ pb_utils.Logger.log_warning(
43
+ f"Could not read parameter '{name}' with value '{value}', will use default."
44
+ )
45
+ return None
46
+
47
+
48
+ def get_parameter(model_config, name, pytype=str):
49
+ if name not in model_config['parameters']:
50
+ return None
51
+ return read_parameter_as_type(
52
+ model_config['parameters'][name]['string_value'], name, pytype)
53
+
54
+
55
+ def convert_word_list(word_list):
56
+ if word_list is None:
57
+ return None
58
+ word_list = word_list.tolist()
59
+ if len(word_list) == 0 or len(word_list[0]) != 2:
60
+ raise pb_utils.TritonModelException(f"Invalid format for word list.")
61
+ words, indices = word_list[0]
62
+ result = []
63
+ current_index = 0
64
+ for i in indices:
65
+ if i == -1:
66
+ continue
67
+ if i > len(words):
68
+ raise pb_utils.TritonModelException(
69
+ f"Invalid format for word list.")
70
+ current_word = []
71
+ while current_index < i:
72
+ current_word.append(words[current_index])
73
+ current_index += 1
74
+ result.append(current_word)
75
+ return result
76
+
77
+
78
+ def parse_medusa_choices(medusa_choices):
79
+ if medusa_choices is None:
80
+ return None
81
+ try:
82
+ result = json.loads(
83
+ "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]")
84
+ assert isinstance(result, list) and len(result) > 0
85
+ assert all([isinstance(x, list) for x in result])
86
+ assert all([isinstance(y, int) for x in result for y in x])
87
+ except Exception:
88
+ raise pb_utils.TritonModelException(
89
+ "Invalid format for medusa_choices")
90
+ return result
91
+
92
+
93
+ def get_sampling_config_from_request(request):
94
+ kwargs = {}
95
+ kwargs['beam_width'] = get_input_scalar_by_name(request, 'beam_width') or 1
96
+ kwargs['top_k'] = get_input_scalar_by_name(request, 'runtime_top_k')
97
+ kwargs['top_p'] = get_input_scalar_by_name(request, 'runtime_top_p')
98
+ kwargs['top_p'] = None if kwargs['top_p'] is None or kwargs[
99
+ 'top_p'] <= 0 else kwargs['top_p']
100
+ kwargs['random_seed'] = get_input_scalar_by_name(request, 'random_seed')
101
+ kwargs['temperature'] = get_input_scalar_by_name(request, 'temperature')
102
+ kwargs['min_length'] = get_input_scalar_by_name(request, 'min_length')
103
+ kwargs['repetition_penalty'] = get_input_scalar_by_name(
104
+ request, 'repetition_penalty')
105
+ kwargs['presence_penalty'] = get_input_scalar_by_name(
106
+ request, 'presence_penalty')
107
+ kwargs['frequency_penalty'] = get_input_scalar_by_name(
108
+ request, 'frequency_penalty')
109
+ kwargs['length_penalty'] = get_input_scalar_by_name(request, 'len_penalty')
110
+ kwargs['top_p_min'] = get_input_scalar_by_name(request,
111
+ 'runtime_top_p_min')
112
+ kwargs['top_p_reset_ids'] = get_input_scalar_by_name(
113
+ request, 'runtime_top_p_reset_ids')
114
+ kwargs['top_p_decay'] = get_input_scalar_by_name(request,
115
+ 'runtime_top_p_decay')
116
+ kwargs['beam_search_diversity_rate'] = get_input_scalar_by_name(
117
+ request, 'beam_search_diversity_rate')
118
+ kwargs['early_stopping'] = get_input_scalar_by_name(
119
+ request, 'early_stopping')
120
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
121
+ return trtllm.SamplingConfig(**kwargs)
122
+
123
+
124
+ def get_output_config_from_request(request, exclude_input_from_output):
125
+ kwargs = {}
126
+ kwargs["return_log_probs"] = get_input_scalar_by_name(
127
+ request, 'return_log_probs')
128
+ kwargs["return_context_logits"] = get_input_scalar_by_name(
129
+ request, 'return_context_logits')
130
+ kwargs["return_generation_logits"] = get_input_scalar_by_name(
131
+ request, 'return_generation_logits')
132
+ kwargs["exclude_input_from_output"] = exclude_input_from_output
133
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
134
+ return trtllm.OutputConfig(**kwargs)
135
+
136
+
137
+ def get_external_draft_tokens_config_from_request(request):
138
+ kwargs = {}
139
+ draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids')
140
+ if draft_input_ids is not None:
141
+ kwargs['tokens'] = draft_input_ids.tolist()
142
+ draft_logits = get_input_tensor_by_name(request, 'draft_logits')
143
+ if draft_logits is not None:
144
+ kwargs['logits'] = from_numpy(draft_logits)
145
+ kwargs['acceptance_threshold'] = get_input_scalar_by_name(
146
+ request, 'draft_acceptance_threshold')
147
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
148
+ if len(kwargs) > 0:
149
+ return trtllm.ExternalDraftTokensConfig(**kwargs)
150
+ return None
151
+
152
+
153
+ def get_prompt_tuning_config_from_request(request):
154
+ # prompt_vocab_size is unused by executor.
155
+ kwargs = {}
156
+ prompt_embedding_table = get_input_tensor_by_name(
157
+ request, 'prompt_embedding_table')
158
+ if prompt_embedding_table is not None:
159
+ kwargs["embedding_table"] = from_numpy(prompt_embedding_table)
160
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
161
+ if len(kwargs) > 0:
162
+ return trtllm.PromptTuningConfig(**kwargs)
163
+ return None
164
+
165
+
166
+ def get_lora_config_from_request(request):
167
+ kwargs = {}
168
+ kwargs["task_id"] = get_input_scalar_by_name(request, 'lora_task_id')
169
+ lora_weights = get_input_tensor_by_name(request, 'lora_weights')
170
+ if lora_weights is not None:
171
+ kwargs["weights"] = from_numpy(lora_weights)
172
+ lora_config = get_input_tensor_by_name(request, 'lora_config')
173
+ if lora_config is not None:
174
+ kwargs["config"] = from_numpy(lora_config)
175
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
176
+ if len(kwargs) > 0:
177
+ return trtllm.LoraConfig(**kwargs)
178
+ return None
179
+
180
+
181
+ def convert_request(request, exclude_input_from_output, decoupled):
182
+ inputs = {}
183
+ input_token_ids = get_input_tensor_by_name(request, 'input_ids')
184
+ if input_token_ids is None:
185
+ raise pb_utils.TritonModelException(
186
+ "A value is required for input_ids")
187
+ input_token_ids = input_token_ids.tolist()
188
+ if len(input_token_ids) == 0:
189
+ raise pb_utils.TritonModelException(f"Invalid format for input_ids")
190
+ inputs['input_token_ids'] = input_token_ids[0]
191
+ # input_lengths is not used by the executor.
192
+ inputs['max_new_tokens'] = get_input_scalar_by_name(
193
+ request, 'request_output_len')
194
+ if inputs['max_new_tokens'] is None:
195
+ raise pb_utils.TritonModelException(
196
+ "A value is required for request_output_len")
197
+ inputs['streaming'] = get_input_scalar_by_name(request, 'streaming')
198
+ if inputs['streaming'] and not decoupled:
199
+ raise pb_utils.TritonModelException(
200
+ "Streaming is only supported in decoupled mode.")
201
+ inputs['end_id'] = get_input_scalar_by_name(request, 'end_id')
202
+ inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id')
203
+ inputs['stop_words'] = convert_word_list(
204
+ get_input_tensor_by_name(request, 'stop_words_list'))
205
+ inputs['bad_words'] = convert_word_list(
206
+ get_input_tensor_by_name(request, 'bad_words_list'))
207
+ embedding_bias = get_input_tensor_by_name(request, 'embedding_bias')
208
+ if embedding_bias is not None and embedding_bias.size != 0:
209
+ inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze()
210
+
211
+ sampling_config = get_sampling_config_from_request(request)
212
+ output_config = get_output_config_from_request(request,
213
+ exclude_input_from_output)
214
+ external_draft_tokens_config = get_external_draft_tokens_config_from_request(
215
+ request)
216
+ prompt_tuning_config = get_prompt_tuning_config_from_request(request)
217
+ lora_config = get_lora_config_from_request(request)
218
+
219
+ return trtllm.Request(
220
+ **inputs,
221
+ sampling_config=sampling_config,
222
+ output_config=output_config,
223
+ external_draft_tokens_config=external_draft_tokens_config,
224
+ prompt_tuning_config=prompt_tuning_config,
225
+ lora_config=lora_config,
226
+ )
227
+
228
+
229
+ def convert_response(response):
230
+ if response.has_error():
231
+ return pb_utils.InferenceResponse(output_tensors=[],
232
+ error=pb_utils.TritonError(
233
+ response.error_msg)), True
234
+ result = response.result
235
+ beam_lengths = np.expand_dims(
236
+ np.array([len(beam) for beam in result.output_token_ids], np.int32), 0)
237
+ max_beam_length = max([len(beam) for beam in result.output_token_ids])
238
+ output_ids = np.full((1, len(result.output_token_ids), max_beam_length),
239
+ -1, np.int32)
240
+ for idx, beam in enumerate(result.output_token_ids):
241
+ output_ids[0, idx, :len(beam)] = beam
242
+ output_tensors = [
243
+ pb_utils.Tensor("output_ids", output_ids),
244
+ pb_utils.Tensor("sequence_length", beam_lengths),
245
+ ]
246
+ output_tensors.append(
247
+ pb_utils.Tensor(
248
+ "cum_log_probs",
249
+ np.expand_dims(np.array(result.cum_log_probs, np.float32), 0)
250
+ if result.cum_log_probs is not None else np.zeros(
251
+ (1, 1), np.float32)))
252
+ output_tensors.append(
253
+ pb_utils.Tensor(
254
+ "output_log_probs",
255
+ np.expand_dims(np.array(result.log_probs, np.float32), 0) if
256
+ result.log_probs is not None else np.zeros((1, 1, 1), np.float32)))
257
+ output_tensors.append(
258
+ pb_utils.Tensor(
259
+ "context_logits",
260
+ np.expand_dims(np.array(result.context_logits, np.float32), 0)
261
+ if result.context_logits is not None else np.zeros(
262
+ (1, 1, 1), np.float32)))
263
+ output_tensors.append(
264
+ pb_utils.Tensor(
265
+ "generation_logits",
266
+ np.expand_dims(np.array(result.generation_logits, np.float32), 0)
267
+ if result.generation_logits is not None else np.zeros(
268
+ (1, 1, 1, 1), np.float32)))
269
+ return pb_utils.InferenceResponse(output_tensors), result.is_final
270
+
271
+
272
+ def convert_scheduler_policy(batch_scheduler_policy: str):
273
+ if batch_scheduler_policy.lower() == "max_utilization":
274
+ return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION
275
+ elif batch_scheduler_policy.lower() == "guaranteed_no_evict":
276
+ return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
277
+ raise pb_utils.TritonModelException(
278
+ f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported."
279
+ )
280
+
281
+
282
+ def convert_batching_type(gpt_model_type: str):
283
+ if gpt_model_type is None:
284
+ return None
285
+ if gpt_model_type.lower(
286
+ ) == "inflight_fused_batching" or gpt_model_type.lower(
287
+ ) == "inflight_batching":
288
+ return trtllm.BatchingType.INFLIGHT
289
+ elif gpt_model_type.lower() == "v1":
290
+ return trtllm.BatchingType.STATIC
291
+ raise pb_utils.TritonModelException(
292
+ f"gpt_model_type value of '{gpt_model_type}' is not supported.")
293
+
294
+
295
+ def convert_decoding_mode(decoding_mode: str):
296
+ if decoding_mode is None:
297
+ return None
298
+ elif decoding_mode == "auto":
299
+ return trtllm.DecodingMode.Auto()
300
+ elif decoding_mode == "top_k":
301
+ return trtllm.DecodingMode.TopK()
302
+ elif decoding_mode == "top_p":
303
+ return trtllm.DecodingMode.TopP()
304
+ elif decoding_mode == "top_k_top_p":
305
+ return trtllm.DecodingMode.TopKTopP()
306
+ elif decoding_mode == "beam_search":
307
+ return trtllm.DecodingMode.BeamSearch()
308
+ elif decoding_mode == "medusa":
309
+ return trtllm.DecodingMode.Medusa()
310
+ raise pb_utils.TritonModelException(
311
+ f"decoding_mode value of '{decoding_mode}' is not supported.")
312
+
313
+
314
+ def convert_timestamp_to_seconds(timestamp: str):
315
+ return int(
316
+ datetime.datetime.strptime(timestamp, "%m-%d-%Y %H:%M:%S").timestamp())
317
+
318
+
319
+ class TritonPythonModel:
320
+ """Your Python model must use the same class name. Every Python model
321
+ that is created must have "TritonPythonModel" as the class name.
322
+ """
323
+
324
+ def get_scheduler_config(self, model_config):
325
+ batch_scheduler_policy = get_parameter(model_config,
326
+ "batch_scheduler_policy")
327
+ if batch_scheduler_policy is None:
328
+ return trtllm.SchedulerConfig()
329
+ return trtllm.SchedulerConfig(
330
+ convert_scheduler_policy(batch_scheduler_policy))
331
+
332
+ def get_kv_cache_config(self, model_config):
333
+ kwargs = {
334
+ "enable_block_reuse":
335
+ get_parameter(model_config, "enable_kv_cache_reuse", bool),
336
+ "max_tokens":
337
+ get_parameter(model_config, "max_tokens_in_paged_kv_cache", int),
338
+ "sink_token_length":
339
+ get_parameter(model_config, "sink_token_length", int),
340
+ "max_attention_window":
341
+ get_parameter(model_config, "max_attention_window_size", int),
342
+ "free_gpu_memory_fraction":
343
+ get_parameter(model_config, "kv_cache_free_gpu_mem_fraction",
344
+ float),
345
+ "host_cache_size":
346
+ get_parameter(model_config, "kv_cache_host_memory_bytes", int),
347
+ "onboard_blocks":
348
+ get_parameter(model_config, "kv_cache_onboard_blocks", bool),
349
+ }
350
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
351
+ return trtllm.KvCacheConfig(**kwargs)
352
+
353
+ def get_parallel_config(self, model_config):
354
+ kwargs = {}
355
+ gpu_device_ids = get_parameter(model_config, "gpu_device_ids")
356
+ if gpu_device_ids:
357
+ kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")]
358
+ self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR",
359
+ "0") == "1"
360
+ if self.use_orchestrator_mode:
361
+ kwargs[
362
+ "communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR
363
+ worker_path = get_parameter(model_config, "worker_path")
364
+ if worker_path is not None:
365
+ raise pb_utils.TritonModelException(
366
+ "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable."
367
+ )
368
+ executor_worker_path = get_parameter(model_config,
369
+ "executor_worker_path")
370
+ kwargs["orchestrator_config"] = trtllm.OrchestratorConfig(
371
+ True, executor_worker_path)
372
+ if len(kwargs) > 0:
373
+ return trtllm.ParallelConfig(**kwargs)
374
+ return None
375
+
376
+ def get_peft_cache_config(self, model_config):
377
+ kwargs = {
378
+ "optimal_adapter_size":
379
+ get_parameter(model_config, "lora_cache_optimal_adapter_size",
380
+ int),
381
+ "max_adapter_size":
382
+ get_parameter(model_config, "lora_cache_max_adapter_size", int),
383
+ "device_cache_percent":
384
+ get_parameter(model_config, "lora_cache_gpu_memory_fraction",
385
+ float),
386
+ "host_cache_size":
387
+ get_parameter(model_config, "lora_cache_host_memory_bytes", int),
388
+ }
389
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
390
+ return trtllm.PeftCacheConfig(**kwargs)
391
+
392
+ def get_decoding_config(self, model_config):
393
+ kwargs = {
394
+ "medusa_choices":
395
+ parse_medusa_choices(get_parameter(model_config,
396
+ "medusa_choices")),
397
+ "decoding_mode":
398
+ convert_decoding_mode(get_parameter(model_config,
399
+ "decoding_mode")),
400
+ }
401
+ print(kwargs)
402
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
403
+ return trtllm.DecodingConfig(**kwargs)
404
+
405
+ def get_executor_config(self, model_config):
406
+ kwargs = {
407
+ "max_beam_width":
408
+ get_parameter(model_config, "max_beam_width", int),
409
+ "scheduler_config":
410
+ self.get_scheduler_config(model_config),
411
+ "kv_cache_config":
412
+ self.get_kv_cache_config(model_config),
413
+ "enable_chunked_context":
414
+ get_parameter(model_config, "enable_chunked_context", bool),
415
+ "normalize_log_probs":
416
+ get_parameter(model_config, "normalize_log_probs", bool),
417
+ "batching_type":
418
+ convert_batching_type(get_parameter(model_config,
419
+ "gpt_model_type")),
420
+ "parallel_config":
421
+ self.get_parallel_config(model_config),
422
+ "peft_cache_config":
423
+ self.get_peft_cache_config(model_config),
424
+ "decoding_config":
425
+ self.get_decoding_config(model_config),
426
+ }
427
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
428
+ return trtllm.ExecutorConfig(**kwargs)
429
+
430
+ def create_metrics(self, model: str, version: str, is_v1_model: bool):
431
+ self.request_metric_family = pb_utils.MetricFamily(
432
+ name="nv_trt_llm_request_metrics",
433
+ description="TRT LLM request metrics",
434
+ kind=pb_utils.MetricFamily.GAUGE,
435
+ )
436
+ self.runtime_memory_metric_family = pb_utils.MetricFamily(
437
+ name="nv_trt_llm_runtime_memory_metrics",
438
+ description="TRT LLM runtime memory metrics",
439
+ kind=pb_utils.MetricFamily.GAUGE,
440
+ )
441
+ self.kv_cache_metric_family = pb_utils.MetricFamily(
442
+ name="nv_trt_llm_kv_cache_block_metrics",
443
+ description="TRT LLM KV cache block metrics",
444
+ kind=pb_utils.MetricFamily.GAUGE,
445
+ )
446
+ model_type = "v1" if is_v1_model else "inflight_batcher"
447
+ self.model_type_metric_family = pb_utils.MetricFamily(
448
+ name=f"nv_trt_llm_{model_type}_metrics",
449
+ description=f"TRT LLM {model_type}-specific metrics",
450
+ kind=pb_utils.MetricFamily.GAUGE,
451
+ )
452
+ self.general_metric_family = pb_utils.MetricFamily(
453
+ name="nv_trt_llm_general_metrics",
454
+ description="General TRT LLM metrics",
455
+ kind=pb_utils.MetricFamily.GAUGE,
456
+ )
457
+ common_labels = {"model": model, "version": version}
458
+ self.all_metrics = {
459
+ # Request metrics
460
+ "num_active_requests":
461
+ self.request_metric_family.Metric(labels={
462
+ "request_type": "active",
463
+ **common_labels
464
+ }),
465
+ "max_num_active_requests":
466
+ self.request_metric_family.Metric(labels={
467
+ "request_type": "max",
468
+ **common_labels
469
+ }),
470
+ "num_scheduled_requests":
471
+ self.request_metric_family.Metric(labels={
472
+ "request_type": "scheduled",
473
+ **common_labels
474
+ }),
475
+ "num_context_requests":
476
+ self.request_metric_family.Metric(labels={
477
+ "request_type": "context",
478
+ **common_labels
479
+ }),
480
+ # Runtime metrics
481
+ "cpu_mem_usage":
482
+ self.runtime_memory_metric_family.Metric(labels={
483
+ "memory_type": "cpu",
484
+ **common_labels
485
+ }),
486
+ "gpu_mem_usage":
487
+ self.runtime_memory_metric_family.Metric(labels={
488
+ "memory_type": "gpu",
489
+ **common_labels
490
+ }),
491
+ "pinned_mem_usage":
492
+ self.runtime_memory_metric_family.Metric(labels={
493
+ "memory_type": "pinned",
494
+ **common_labels
495
+ }),
496
+ # KV cache metrics
497
+ "max_num_blocks":
498
+ self.kv_cache_metric_family.Metric(labels={
499
+ "kv_cache_block_type": "max",
500
+ **common_labels
501
+ }),
502
+ "free_num_blocks":
503
+ self.kv_cache_metric_family.Metric(labels={
504
+ "kv_cache_block_type": "free",
505
+ **common_labels
506
+ }),
507
+ "used_num_blocks":
508
+ self.kv_cache_metric_family.Metric(labels={
509
+ "kv_cache_block_type": "used",
510
+ **common_labels
511
+ }),
512
+ "tokens_per_block":
513
+ self.kv_cache_metric_family.Metric(labels={
514
+ "kv_cache_block_type": "tokens_per",
515
+ **common_labels
516
+ }),
517
+ # General metrics
518
+ "timestamp":
519
+ self.general_metric_family.Metric(labels={
520
+ "general_type": "timestamp",
521
+ **common_labels
522
+ }),
523
+ "iter":
524
+ self.general_metric_family.Metric(labels={
525
+ "general_type": "iteration_counter",
526
+ **common_labels
527
+ }),
528
+ }
529
+ if is_v1_model:
530
+ self.all_metrics.update({
531
+ "num_ctx_tokens":
532
+ self.model_type_metric_family.Metric(labels={
533
+ "v1_specific_metric": "total_context_tokens",
534
+ **common_labels
535
+ }),
536
+ "num_gen_tokens":
537
+ self.model_type_metric_family.Metric(
538
+ labels={
539
+ "v1_specific_metric": "total_generation_tokens",
540
+ **common_labels
541
+ }),
542
+ "empty_gen_slots":
543
+ self.model_type_metric_family.Metric(
544
+ labels={
545
+ "v1_specific_metric": "empty_generation_slots",
546
+ **common_labels
547
+ }),
548
+ })
549
+ else:
550
+ self.all_metrics.update({
551
+ "num_ctx_tokens":
552
+ self.model_type_metric_family.Metric(
553
+ labels={
554
+ "inflight_batcher_specific_metric":
555
+ "total_context_tokens",
556
+ **common_labels
557
+ }),
558
+ "num_gen_requests":
559
+ self.model_type_metric_family.Metric(
560
+ labels={
561
+ "inflight_batcher_specific_metric":
562
+ "generation_requests",
563
+ **common_labels
564
+ }),
565
+ "micro_batch_id":
566
+ self.model_type_metric_family.Metric(
567
+ labels={
568
+ "inflight_batcher_specific_metric": "micro_batch_id",
569
+ **common_labels
570
+ }),
571
+ "num_paused_requests":
572
+ self.model_type_metric_family.Metric(
573
+ labels={
574
+ "inflight_batcher_specific_metric": "paused_requests",
575
+ **common_labels
576
+ }),
577
+ })
578
+
579
+ def initialize(self, args):
580
+ """`initialize` is called only once when the model is being loaded.
581
+ Implementing `initialize` function is optional. This function allows
582
+ the model to initialize any state associated with this model.
583
+
584
+ Parameters
585
+ ----------
586
+ args : dict
587
+ Both keys and values are strings. The dictionary keys and values are:
588
+ * model_config: A JSON string containing the model configuration
589
+ * model_instance_kind: A string containing model instance kind
590
+ * model_instance_device_id: A string containing model instance device ID
591
+ * model_repository: Model repository path
592
+ * model_version: Model version
593
+ * model_name: Model name
594
+ """
595
+ model_config = json.loads(args['model_config'])
596
+ gpt_model_path = get_parameter(model_config, "gpt_model_path")
597
+ if get_parameter(model_config, "enable_trt_overlap", bool):
598
+ raise pb_utils.TritonModelException(
599
+ f"enable_trt_overlap=true is not supported.")
600
+ self.exclude_input_from_output = get_parameter(
601
+ model_config, "exclude_input_in_output", bool)
602
+ executor_config = self.get_executor_config(model_config)
603
+ self.executor = trtllm.Executor(gpt_model_path,
604
+ trtllm.ModelType.DECODER_ONLY,
605
+ executor_config)
606
+ self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
607
+ model_config)
608
+ self.cancellation_check_period_ms = get_parameter(
609
+ model_config, "cancellation_check_period_ms", int) or 100
610
+ self.stats_check_period_ms = get_parameter(
611
+ model_config, "stats_check_period_ms", int) or 100
612
+
613
+ if not self.decoupled:
614
+ raise pb_utils.TritonModelException(
615
+ "Please enable decoupled transaction policy in the model configuration to serve this model"
616
+ )
617
+
618
+ self.create_metrics(args["model_name"],
619
+ args["model_version"],
620
+ is_v1_model=executor_config.batching_type ==
621
+ trtllm.BatchingType.STATIC)
622
+ self.triton_id_to_req_id = {}
623
+ self.req_id_to_response_sender = {}
624
+ self.lock = Lock()
625
+ self.running = False
626
+ self.awaiter_thread = Thread(target=self.awaiter_loop)
627
+ self.cancellation_thread = Thread(target=self.cancellation_loop)
628
+ self.metrics_thread = Thread(target=self.metrics_loop)
629
+ if self.executor.can_enqueue_requests():
630
+ self.running = True
631
+ self.awaiter_thread.start()
632
+ self.cancellation_thread.start()
633
+ self.metrics_thread.start()
634
+ else:
635
+ # In leader mode, worker ranks will wait here until leader is done.
636
+ self.executor.shutdown()
637
+
638
+ def handle_stop_request(self, triton_id, response_sender):
639
+ if triton_id is None or triton_id == "":
640
+ response_sender.send(
641
+ pb_utils.InferenceResponse(error=pb_utils.TritonError(
642
+ "A request id must be provided for request cancellation")),
643
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
644
+ return
645
+
646
+ if triton_id in self.triton_id_to_req_id:
647
+ req_id = self.triton_id_to_req_id[triton_id]
648
+ self.executor.cancel_request(req_id)
649
+
650
+ response_sender.send(
651
+ pb_utils.InferenceResponse(),
652
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
653
+
654
+ def execute(self, requests):
655
+ """`execute` must be implemented in every Python model. `execute`
656
+ function receives a list of pb_utils.InferenceRequest as the only
657
+ argument. This function is called when an inference is requested
658
+ for this model.
659
+
660
+ Parameters
661
+ ----------
662
+ requests : list
663
+ A list of pb_utils.InferenceRequest
664
+
665
+ Returns
666
+ -------
667
+ list
668
+ A list of pb_utils.InferenceResponse. The length of this list must
669
+ be the same as `requests`
670
+ """
671
+ if not self.executor.can_enqueue_requests():
672
+ return
673
+
674
+ # Convert to executor requests.
675
+ triton_requests = []
676
+ executor_requests = []
677
+ for request in requests:
678
+ response_sender = request.get_response_sender()
679
+ if get_input_scalar_by_name(request, 'stop'):
680
+ self.handle_stop_request(request.request_id(), response_sender)
681
+ else:
682
+ try:
683
+ converted = convert_request(request,
684
+ self.exclude_input_from_output,
685
+ self.decoupled)
686
+ except Exception as e:
687
+ response_sender.send(
688
+ pb_utils.InferenceResponse(error=pb_utils.TritonError(
689
+ f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'"
690
+ )),
691
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
692
+ else:
693
+ triton_requests.append(request)
694
+ executor_requests.append(converted)
695
+
696
+ with self.lock:
697
+ request_ids = self.executor.enqueue_requests(executor_requests)
698
+ for req_id, request in zip(request_ids, triton_requests):
699
+ triton_id = request.request_id()
700
+ self.req_id_to_response_sender[
701
+ req_id] = triton_id, request.get_response_sender()
702
+ self.triton_id_to_req_id[triton_id] = req_id
703
+ return None
704
+
705
+ def awaiter_loop(self):
706
+ """Gets responses from executor and returns the results."""
707
+ while self.running:
708
+ for response in self.executor.await_responses(
709
+ timeout=datetime.timedelta(milliseconds=1)):
710
+ req_id = response.request_id
711
+ with self.lock:
712
+ if req_id not in self.req_id_to_response_sender:
713
+ continue
714
+ triton_id, response_sender = self.req_id_to_response_sender[
715
+ req_id]
716
+
717
+ triton_response, is_final = convert_response(response)
718
+ response_sender.send(
719
+ triton_response,
720
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
721
+ if is_final else 0)
722
+
723
+ if is_final:
724
+ with self.lock:
725
+ del self.triton_id_to_req_id[triton_id]
726
+ del self.req_id_to_response_sender[req_id]
727
+ # Remove local reference so response_sender can be cleaned properly.
728
+ del response_sender
729
+
730
+ def cancellation_loop(self):
731
+ """Checks if any pending requests have been cancelled."""
732
+ while self.running:
733
+ time.sleep(self.cancellation_check_period_ms / 1000.0)
734
+ with self.lock:
735
+ for req_id, (triton_id, response_sender
736
+ ) in self.req_id_to_response_sender.items():
737
+ if response_sender.is_cancelled():
738
+ self.executor.cancel_request(req_id)
739
+ # Remove local reference so response_sender can be cleaned properly.
740
+ del response_sender
741
+
742
+ def metrics_loop(self):
743
+ """Updates triton metrics using stats from the executor."""
744
+ while self.running:
745
+ time.sleep(self.stats_check_period_ms / 1000.0)
746
+ for stat in self.executor.get_latest_iteration_stats():
747
+ try:
748
+ for key, metric in self.all_metrics.items():
749
+ value = None
750
+ if hasattr(stat, key):
751
+ value = getattr(stat, key)
752
+ elif stat.kv_cache_stats is not None and hasattr(
753
+ stat.kv_cache_stats, key):
754
+ value = getattr(stat.kv_cache_stats, key)
755
+ elif stat.static_batching_stats is not None and hasattr(
756
+ stat.static_batching_stats, key):
757
+ value = getattr(stat.static_batching_stats, key)
758
+ elif stat.inflight_batching_stats is not None and hasattr(
759
+ stat.inflight_batching_stats, key):
760
+ value = getattr(stat.inflight_batching_stats, key)
761
+ if value is not None:
762
+ if key == "timestamp":
763
+ value = convert_timestamp_to_seconds(value)
764
+ metric.set(value)
765
+ else:
766
+ pb_utils.Logger.log_warn(
767
+ f"Metric \"{key}\" not found.")
768
+ except Exception as e:
769
+ pb_utils.Logger.log_warn(
770
+ f"Error while processing metrics: {e}")
771
+
772
+ def finalize(self):
773
+ """`finalize` is called only once when the model is being unloaded.
774
+ Implementing `finalize` function is optional. This function allows
775
+ the model to perform any necessary clean ups before exit.
776
+ """
777
+ if self.executor.can_enqueue_requests():
778
+ self.running = False
779
+ self.awaiter_thread.join()
780
+ self.cancellation_thread.join()
781
+ self.metrics_thread.join()
782
+ self.executor.shutdown()
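Note (not part of the uploaded files): a minimal sketch of the flattened stop/bad word-list layout that convert_word_list() in the model.py above expects. The token ids are made up for illustration; row 0 holds the concatenated word token ids and row 1 holds cumulative end offsets padded with -1.

import numpy as np

# Two stop words, [10, 11, 12] and [42], in the [1, 2, N] layout.
stop_words_list = np.array([[[10, 11, 12, 42],
                             [3, 4, -1, -1]]], dtype=np.int32)

# Same parsing as convert_word_list(), written with slices for brevity.
words, offsets = stop_words_list.tolist()[0]
result, cur = [], 0
for off in offsets:
    if off == -1:
        continue
    result.append(words[cur:off])
    cur = off
print(result)  # [[10, 11, 12], [42]]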
tensorrt_llm/1/rank0.engine ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c545694cbc76c5a65d4650a2d7897cc98ab2382bce3d198acfa97c003bfea6c
3
+ size 47006220780
tensorrt_llm/config.pbtxt ADDED
@@ -0,0 +1,537 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "tensorrt_llm"
28
+ backend: "tensorrtllm"
29
+ max_batch_size: 16
30
+
31
+ model_transaction_policy {
32
+ decoupled: true
33
+ }
34
+
35
+
36
+ input [
37
+ {
38
+ name: "input_ids"
39
+ data_type: TYPE_INT32
40
+ dims: [ -1 ]
41
+ allow_ragged_batch: true
42
+ },
43
+ {
44
+ name: "input_lengths"
45
+ data_type: TYPE_INT32
46
+ dims: [ 1 ]
47
+ reshape: { shape: [ ] }
48
+ },
49
+ {
50
+ name: "request_output_len"
51
+ data_type: TYPE_INT32
52
+ dims: [ 1 ]
53
+ },
54
+ {
55
+ name: "draft_input_ids"
56
+ data_type: TYPE_INT32
57
+ dims: [ -1 ]
58
+ optional: true
59
+ allow_ragged_batch: true
60
+ },
61
+ {
62
+ name: "decoder_input_ids"
63
+ data_type: TYPE_INT32
64
+ dims: [ -1 ]
65
+ optional: true
66
+ allow_ragged_batch: true
67
+ },
68
+ {
69
+ name: "decoder_input_lengths"
70
+ data_type: TYPE_INT32
71
+ dims: [ 1 ]
72
+ optional: true
73
+ reshape: { shape: [ ] }
74
+ },
75
+ {
76
+ name: "draft_logits"
77
+ data_type: TYPE_FP32
78
+ dims: [ -1, -1 ]
79
+ optional: true
80
+ allow_ragged_batch: true
81
+ },
82
+ {
83
+ name: "draft_acceptance_threshold"
84
+ data_type: TYPE_FP32
85
+ dims: [ 1 ]
86
+ reshape: { shape: [ ] }
87
+ optional: true
88
+ },
89
+ {
90
+ name: "end_id"
91
+ data_type: TYPE_INT32
92
+ dims: [ 1 ]
93
+ reshape: { shape: [ ] }
94
+ optional: true
95
+ },
96
+ {
97
+ name: "pad_id"
98
+ data_type: TYPE_INT32
99
+ dims: [ 1 ]
100
+ reshape: { shape: [ ] }
101
+ optional: true
102
+ },
103
+ {
104
+ name: "stop_words_list"
105
+ data_type: TYPE_INT32
106
+ dims: [ 2, -1 ]
107
+ optional: true
108
+ allow_ragged_batch: true
109
+ },
110
+ {
111
+ name: "bad_words_list"
112
+ data_type: TYPE_INT32
113
+ dims: [ 2, -1 ]
114
+ optional: true
115
+ allow_ragged_batch: true
116
+ },
117
+ {
118
+ name: "embedding_bias"
119
+ data_type: TYPE_FP32
120
+ dims: [ -1 ]
121
+ optional: true
122
+ allow_ragged_batch: true
123
+ },
124
+ {
125
+ name: "beam_width"
126
+ data_type: TYPE_INT32
127
+ dims: [ 1 ]
128
+ reshape: { shape: [ ] }
129
+ optional: true
130
+ },
131
+ {
132
+ name: "temperature"
133
+ data_type: TYPE_FP32
134
+ dims: [ 1 ]
135
+ reshape: { shape: [ ] }
136
+ optional: true
137
+ },
138
+ {
139
+ name: "runtime_top_k"
140
+ data_type: TYPE_INT32
141
+ dims: [ 1 ]
142
+ reshape: { shape: [ ] }
143
+ optional: true
144
+ },
145
+ {
146
+ name: "runtime_top_p"
147
+ data_type: TYPE_FP32
148
+ dims: [ 1 ]
149
+ reshape: { shape: [ ] }
150
+ optional: true
151
+ },
152
+ {
153
+ name: "runtime_top_p_min"
154
+ data_type: TYPE_FP32
155
+ dims: [ 1 ]
156
+ reshape: { shape: [ ] }
157
+ optional: true
158
+ },
159
+ {
160
+ name: "runtime_top_p_decay"
161
+ data_type: TYPE_FP32
162
+ dims: [ 1 ]
163
+ reshape: { shape: [ ] }
164
+ optional: true
165
+ },
166
+ {
167
+ name: "runtime_top_p_reset_ids"
168
+ data_type: TYPE_INT32
169
+ dims: [ 1 ]
170
+ reshape: { shape: [ ] }
171
+ optional: true
172
+ },
173
+ {
174
+ name: "len_penalty"
175
+ data_type: TYPE_FP32
176
+ dims: [ 1 ]
177
+ reshape: { shape: [ ] }
178
+ optional: true
179
+ },
180
+ {
181
+ name: "early_stopping"
182
+ data_type: TYPE_BOOL
183
+ dims: [ 1 ]
184
+ reshape: { shape: [ ] }
185
+ optional: true
186
+ },
187
+ {
188
+ name: "repetition_penalty"
189
+ data_type: TYPE_FP32
190
+ dims: [ 1 ]
191
+ reshape: { shape: [ ] }
192
+ optional: true
193
+ },
194
+ {
195
+ name: "min_length"
196
+ data_type: TYPE_INT32
197
+ dims: [ 1 ]
198
+ reshape: { shape: [ ] }
199
+ optional: true
200
+ },
201
+ {
202
+ name: "beam_search_diversity_rate"
203
+ data_type: TYPE_FP32
204
+ dims: [ 1 ]
205
+ reshape: { shape: [ ] }
206
+ optional: true
207
+ },
208
+ {
209
+ name: "presence_penalty"
210
+ data_type: TYPE_FP32
211
+ dims: [ 1 ]
212
+ reshape: { shape: [ ] }
213
+ optional: true
214
+ },
215
+ {
216
+ name: "frequency_penalty"
217
+ data_type: TYPE_FP32
218
+ dims: [ 1 ]
219
+ reshape: { shape: [ ] }
220
+ optional: true
221
+ },
222
+ {
223
+ name: "random_seed"
224
+ data_type: TYPE_UINT64
225
+ dims: [ 1 ]
226
+ reshape: { shape: [ ] }
227
+ optional: true
228
+ },
229
+ {
230
+ name: "return_log_probs"
231
+ data_type: TYPE_BOOL
232
+ dims: [ 1 ]
233
+ reshape: { shape: [ ] }
234
+ optional: true
235
+ },
236
+ {
237
+ name: "return_context_logits"
238
+ data_type: TYPE_BOOL
239
+ dims: [ 1 ]
240
+ reshape: { shape: [ ] }
241
+ optional: true
242
+ },
243
+ {
244
+ name: "return_generation_logits"
245
+ data_type: TYPE_BOOL
246
+ dims: [ 1 ]
247
+ reshape: { shape: [ ] }
248
+ optional: true
249
+ },
250
+ {
251
+ name: "stop"
252
+ data_type: TYPE_BOOL
253
+ dims: [ 1 ]
254
+ optional: true
255
+ },
256
+ {
257
+ name: "streaming"
258
+ data_type: TYPE_BOOL
259
+ dims: [ 1 ]
260
+ optional: true
261
+ },
262
+ {
263
+ name: "prompt_embedding_table"
264
+ data_type: TYPE_FP16
265
+ dims: [ -1, -1 ]
266
+ optional: true
267
+ allow_ragged_batch: true
268
+ },
269
+ {
270
+ name: "prompt_vocab_size"
271
+ data_type: TYPE_INT32
272
+ dims: [ 1 ]
273
+ reshape: { shape: [ ] }
274
+ optional: true
275
+ },
276
+ # the unique task ID for the given LoRA.
277
+ # To perform inference with a specific LoRA for the first time, `lora_task_id`, `lora_weights` and `lora_config` must all be given.
278
+ # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
279
+ # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
280
+ {
281
+ name: "lora_task_id"
282
+ data_type: TYPE_UINT64
283
+ dims: [ 1 ]
284
+ reshape: { shape: [ ] }
285
+ optional: true
286
+ },
287
+ # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
288
+ # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
289
+ # each of the in / out tensors is first flattened and then concatenated together in the format above.
290
+ # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
291
+ {
292
+ name: "lora_weights"
293
+ data_type: TYPE_FP16
294
+ dims: [ -1, -1 ]
295
+ optional: true
296
+ allow_ragged_batch: true
297
+ },
298
+ # module identifier (same size as first dimension of lora_weights)
299
+ # See LoraModule::ModuleType for module id mapping
300
+ #
301
+ # "attn_qkv": 0 # compbined qkv adapter
302
+ # "attn_q": 1 # q adapter
303
+ # "attn_k": 2 # k adapter
304
+ # "attn_v": 3 # v adapter
305
+ # "attn_dense": 4 # adapter for the dense layer in attention
306
+ # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
307
+ # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
308
+ # "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate
309
+ #
310
+ # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
311
+ {
312
+ name: "lora_config"
313
+ data_type: TYPE_INT32
314
+ dims: [ -1, 3 ]
315
+ optional: true
316
+ allow_ragged_batch: true
317
+ }
318
+ ]
319
+ output [
320
+ {
321
+ name: "output_ids"
322
+ data_type: TYPE_INT32
323
+ dims: [ -1, -1 ]
324
+ },
325
+ {
326
+ name: "sequence_length"
327
+ data_type: TYPE_INT32
328
+ dims: [ -1 ]
329
+ },
330
+ {
331
+ name: "cum_log_probs"
332
+ data_type: TYPE_FP32
333
+ dims: [ -1 ]
334
+ },
335
+ {
336
+ name: "output_log_probs"
337
+ data_type: TYPE_FP32
338
+ dims: [ -1, -1 ]
339
+ },
340
+ {
341
+ name: "context_logits"
342
+ data_type: TYPE_FP32
343
+ dims: [ -1, -1 ]
344
+ },
345
+ {
346
+ name: "generation_logits"
347
+ data_type: TYPE_FP32
348
+ dims: [ -1, -1, -1 ]
349
+ }
350
+ ]
351
+ instance_group [
352
+ {
353
+ count: 1
354
+ kind : KIND_CPU
355
+ }
356
+ ]
357
+ parameters: {
358
+ key: "max_beam_width"
359
+ value: {
360
+ string_value: "1"
361
+ }
362
+ }
363
+ parameters: {
364
+ key: "FORCE_CPU_ONLY_INPUT_TENSORS"
365
+ value: {
366
+ string_value: "no"
367
+ }
368
+ }
369
+ parameters: {
370
+ key: "gpt_model_type"
371
+ value: {
372
+ string_value: "inflight_batching"
373
+ }
374
+ }
375
+ parameters: {
376
+ key: "gpt_model_path"
377
+ value: {
378
+ string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1"
379
+ }
380
+ }
381
+ parameters: {
382
+ key: "encoder_model_path"
383
+ value: {
384
+ string_value: "${encoder_engine_dir}"
385
+ }
386
+ }
387
+ parameters: {
388
+ key: "max_tokens_in_paged_kv_cache"
389
+ value: {
390
+ string_value: "${max_tokens_in_paged_kv_cache}"
391
+ }
392
+ }
393
+ parameters: {
394
+ key: "max_attention_window_size"
395
+ value: {
396
+ string_value: "${max_attention_window_size}"
397
+ }
398
+ }
399
+ parameters: {
400
+ key: "sink_token_length"
401
+ value: {
402
+ string_value: "${sink_token_length}"
403
+ }
404
+ }
405
+ parameters: {
406
+ key: "batch_scheduler_policy"
407
+ value: {
408
+ string_value: "guaranteed_no_evict"
409
+ }
410
+ }
411
+ parameters: {
412
+ key: "kv_cache_free_gpu_mem_fraction"
413
+ value: {
414
+ string_value: "0.8"
415
+ }
416
+ }
417
+ parameters: {
418
+ key: "kv_cache_host_memory_bytes"
419
+ value: {
420
+ string_value: "${kv_cache_host_memory_bytes}"
421
+ }
422
+ }
423
+ parameters: {
424
+ key: "kv_cache_onboard_blocks"
425
+ value: {
426
+ string_value: "${kv_cache_onboard_blocks}"
427
+ }
428
+ }
429
+ # enable_trt_overlap is deprecated and doesn't have any effect on the runtime
430
+ # parameters: {
431
+ # key: "enable_trt_overlap"
432
+ # value: {
433
+ # string_value: "${enable_trt_overlap}"
434
+ # }
435
+ # }
436
+ parameters: {
437
+ key: "exclude_input_in_output"
438
+ value: {
439
+ string_value: "true"
440
+ }
441
+ }
442
+ parameters: {
443
+ key: "cancellation_check_period_ms"
444
+ value: {
445
+ string_value: "${cancellation_check_period_ms}"
446
+ }
447
+ }
448
+ parameters: {
449
+ key: "stats_check_period_ms"
450
+ value: {
451
+ string_value: "${stats_check_period_ms}"
452
+ }
453
+ }
454
+ parameters: {
455
+ key: "iter_stats_max_iterations"
456
+ value: {
457
+ string_value: "${iter_stats_max_iterations}"
458
+ }
459
+ }
460
+ parameters: {
461
+ key: "request_stats_max_iterations"
462
+ value: {
463
+ string_value: "${request_stats_max_iterations}"
464
+ }
465
+ }
466
+ parameters: {
467
+ key: "enable_kv_cache_reuse"
468
+ value: {
469
+ string_value: "${enable_kv_cache_reuse}"
470
+ }
471
+ }
472
+ parameters: {
473
+ key: "normalize_log_probs"
474
+ value: {
475
+ string_value: "${normalize_log_probs}"
476
+ }
477
+ }
478
+ parameters: {
479
+ key: "enable_chunked_context"
480
+ value: {
481
+ string_value: "${enable_chunked_context}"
482
+ }
483
+ }
484
+ parameters: {
485
+ key: "gpu_device_ids"
486
+ value: {
487
+ string_value: "${gpu_device_ids}"
488
+ }
489
+ }
490
+ parameters: {
491
+ key: "lora_cache_optimal_adapter_size"
492
+ value: {
493
+ string_value: "${lora_cache_optimal_adapter_size}"
494
+ }
495
+ }
496
+ parameters: {
497
+ key: "lora_cache_max_adapter_size"
498
+ value: {
499
+ string_value: "${lora_cache_max_adapter_size}"
500
+ }
501
+ }
502
+ parameters: {
503
+ key: "lora_cache_gpu_memory_fraction"
504
+ value: {
505
+ string_value: "${lora_cache_gpu_memory_fraction}"
506
+ }
507
+ }
508
+ parameters: {
509
+ key: "lora_cache_host_memory_bytes"
510
+ value: {
511
+ string_value: "${lora_cache_host_memory_bytes}"
512
+ }
513
+ }
514
+ parameters: {
515
+ key: "decoding_mode"
516
+ value: {
517
+ string_value: "${decoding_mode}"
518
+ }
519
+ }
520
+ parameters: {
521
+ key: "executor_worker_path"
522
+ value: {
523
+ string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
524
+ }
525
+ }
526
+ parameters: {
527
+ key: "medusa_choices"
528
+ value: {
529
+ string_value: "${medusa_choices}"
530
+ }
531
+ }
532
+ parameters: {
533
+ key: "gpu_weights_percent"
534
+ value: {
535
+ string_value: "${gpu_weights_percent}"
536
+ }
537
+ }
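Note (not part of the uploaded files): a minimal sketch of the LoRA input shapes documented in the comments of the config.pbtxt above. The module count, adapter size D and hidden sizes below are made-up example values, not values read from this engine.

import numpy as np

D, Hi, Ho = 8, 4096, 4096   # adapter_size and hidden sizes (assumed)
num_modules_layers = 2      # e.g. attn_qkv (module id 0) on layers 0 and 1

# lora_weights: [ num_lora_modules_layers, D x Hi + Ho x D ], TYPE_FP16
lora_weights = np.zeros((num_modules_layers, D * Hi + Ho * D), dtype=np.float16)

# lora_config: [ num_lora_modules_layers, 3 ] = [ module_id, layer_idx, adapter_size ]
lora_config = np.array([[0, 0, D],
                        [0, 1, D]], dtype=np.int32)

print(lora_weights.shape, lora_config.shape)  # (2, 65536) (2, 3)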
tensorrt_llm_bls/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (2.72 kB).
 
tensorrt_llm_bls/1/lib/__pycache__/decode.cpython-310.pyc ADDED
Binary file (9.05 kB).
 
tensorrt_llm_bls/1/lib/__pycache__/triton_decoder.cpython-310.pyc ADDED
Binary file (9.73 kB).
 
tensorrt_llm_bls/1/lib/decode.py ADDED
@@ -0,0 +1,333 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ from collections.abc import Generator
28
+ from dataclasses import dataclass
29
+ from typing import Optional
30
+
31
+ import numpy as np
32
+
33
+
34
+ class RequestValidationError(Exception):
35
+ pass
36
+
37
+
38
+ def _validate_that(condition: bool, msg: str):
39
+ if not condition:
40
+ raise RequestValidationError(msg)
41
+
42
+
43
+ def _validate_non_empty(data, msg: str):
44
+ _validate_that(data is not None and data.size > 0, msg)
45
+
46
+
47
+ def _validate_single_gt_0(data, msg: str):
48
+ _validate_non_empty(data, msg)
49
+ _validate_that(data.flatten()[0] > 0, msg)
50
+
51
+
52
+ def _single_value(data: Optional[np.ndarray]):
53
+ if data is None:
54
+ return None
55
+ return data.flatten()[0]
56
+
57
+
58
+ @dataclass
59
+ class Request:
60
+ text_input: np.ndarray = np.array([])
61
+ decoder_text_input: np.ndarray = None
62
+ max_tokens: np.ndarray = np.array([])
63
+ bad_words: Optional[np.ndarray] = None
64
+ stop_words: Optional[np.ndarray] = None
65
+ end_id: Optional[np.ndarray] = None
66
+ pad_id: Optional[np.ndarray] = None
67
+ top_k: Optional[np.ndarray] = None
68
+ top_p: Optional[np.ndarray] = None
69
+ temperature: Optional[np.ndarray] = None
70
+ length_penalty: Optional[np.ndarray] = None
71
+ repetition_penalty: Optional[np.ndarray] = None
72
+ min_length: Optional[np.ndarray] = None
73
+ return_log_probs: Optional[np.ndarray] = None
74
+ prompt_embedding_table: Optional[np.ndarray] = None
75
+ prompt_vocab_size: Optional[np.ndarray] = None
76
+ embedding_bias_words: Optional[np.ndarray] = None
77
+ embedding_bias_weights: Optional[np.ndarray] = None
78
+ num_draft_tokens: Optional[np.ndarray] = None
79
+ use_draft_logits: Optional[np.ndarray] = None
80
+ stream: Optional[np.ndarray] = None
81
+ beam_width: Optional[np.ndarray] = None
82
+ return_context_logits: Optional[np.ndarray] = None
83
+ return_generation_logits: Optional[np.ndarray] = None
84
+ random_seed: Optional[np.ndarray] = None
85
+ presence_penalty: Optional[np.ndarray] = None
86
+ frequency_penalty: Optional[np.ndarray] = None
87
+
88
+ def validate(self):
89
+ _validate_non_empty(self.text_input, "text_input is required")
90
+ _validate_single_gt_0(self.max_tokens,
91
+ "max_tokens must be a single value > 0")
92
+
93
+ num_draft_tokens = _single_value(self.num_draft_tokens)
94
+ stream = _single_value(self.stream)
95
+ _single_value(self.return_generation_logits)
96
+ context_logits = _single_value(self.return_context_logits)
97
+
98
+ if num_draft_tokens:
99
+ _validate_that(
100
+ not stream,
101
+ "streaming is not supported with speculative decoding")
102
+ _validate_that(
103
+ not context_logits,
104
+ "context logits are not supported with speculative decoding")
105
+
106
+
107
+ @dataclass
108
+ class DraftRequest:
109
+ draft_input_ids: Optional[np.ndarray] = None
110
+ draft_logits: Optional[np.ndarray] = None
111
+
112
+
113
+ @dataclass
114
+ class PreprocResponse:
115
+ input_ids: np.ndarray = np.array([])
116
+ decoder_input_ids: np.ndarray = None
117
+ input_lengths: np.ndarray = np.array([])
118
+ decoder_input_lengths: np.ndarray = None
119
+ bad_words_list: Optional[np.ndarray] = None
120
+ stop_words_list: Optional[np.ndarray] = None
121
+ embedding_bias: Optional[np.ndarray] = None
122
+ end_id: Optional[np.ndarray] = None
123
+ pad_id: Optional[np.ndarray] = None
124
+
125
+ @classmethod
126
+ def with_new_inputs(cls,
127
+ other,
128
+ input_ids: Optional[np.ndarray] = None,
129
+ input_lengths: Optional[np.ndarray] = None):
130
+ return cls(
131
+ input_ids=(input_ids
132
+ if input_ids is not None else other.input_ids),
133
+ input_lengths=(input_lengths if input_lengths is not None else
134
+ other.input_lengths),
135
+ decoder_input_ids=other.decoder_input_ids,
136
+ decoder_input_lengths=other.decoder_input_lengths,
137
+ bad_words_list=other.bad_words_list,
138
+ stop_words_list=other.stop_words_list,
139
+ end_id=other.end_id,
140
+ pad_id=other.pad_id,
141
+ )
142
+
143
+
144
+ @dataclass
145
+ class GenerationResponse:
146
+ output_ids: np.ndarray = np.array([])
147
+ sequence_length: np.ndarray = np.array([])
148
+ cum_log_probs: Optional[np.ndarray] = None
149
+ output_log_probs: Optional[np.ndarray] = None
150
+ context_logits: Optional[np.ndarray] = None
151
+ generation_logits: Optional[np.ndarray] = None
152
+
153
+
154
+ @dataclass
155
+ class Response:
156
+ text_output: np.ndarray = np.array([])
157
+ cum_log_probs: Optional[np.ndarray] = None
158
+ output_log_probs: Optional[np.ndarray] = None
159
+ context_logits: Optional[np.ndarray] = None
160
+ generation_logits: Optional[np.ndarray] = None
161
+
162
+ def __eq__(self, o) -> bool:
163
+ """Just for testing"""
164
+ if not isinstance(o, Response):
165
+ return False
166
+ return (np.array_equal(self.text_output, o.text_output)
167
+ and np.array_equal(self.cum_log_probs, o.cum_log_probs)
168
+ and np.array_equal(self.output_log_probs, o.output_log_probs)
169
+ and np.array_equal(self.context_logits, o.context_logits) and
170
+ np.array_equal(self.generation_logits, o.generation_logits))
171
+
172
+
173
+ class Decoder:
174
+
175
+ def __init__(self, streaming=False, accumulate=False):
176
+ self._streaming = streaming
177
+ self._accumulate = accumulate
178
+
179
+ self._accumulated_tokens = None
180
+
181
+ def decode(self,
182
+ request: Request,
183
+ speculative_decoding=False) -> Generator[Response, None, None]:
184
+ preproc_response = self.preprocess(request)
185
+
186
+ if speculative_decoding:
187
+ for gen_response in self._spec_generate(preproc_response, request):
188
+ yield self.postprocess(gen_response)
189
+ else:
190
+ if not self._streaming:
191
+ gen_response = self._generate_non_streaming(
192
+ preproc_response, request)
193
+ yield self.postprocess(gen_response)
194
+ else:
195
+ for gen_response in self._generate(preproc_response, request):
196
+ yield self.postprocess(gen_response)
197
+
198
+ def encountered_stop_words(self, input_ids, stop_words_ids):
199
+ for stop_word_ids in stop_words_ids:
200
+ if np.array_equal(input_ids[-len(stop_word_ids):], stop_word_ids):
201
+ return True
202
+ return False
203
+
204
+ def _spec_generate(
205
+ self, preproc: PreprocResponse,
206
+ request: Request) -> Generator[GenerationResponse, None, None]:
207
+
208
+ prompt_input_ids: np.ndarray = preproc.input_ids[0]
209
+ input_ids: np.ndarray = prompt_input_ids
210
+ output_len: int = request.max_tokens[0][0]
211
+ last_input_ids: np.ndarray = None
212
+ draft_output_ids: np.ndarray = None
213
+ draft_logits: np.ndarray = None
214
+
215
+ target_response: GenerationResponse = None
216
+
217
+ cur_preproc = preproc
218
+
219
+ counter = 0
220
+ while True:
221
+ counter += 1
222
+ num_draft_tokens = min(
223
+ request.num_draft_tokens[0][0],
224
+ len(prompt_input_ids) + output_len - len(input_ids) - 1)
225
+
226
+ draft_request = None
227
+ if num_draft_tokens > 0:
228
+ draft_response: GenerationResponse = self._draft_generate_non_streaming(
229
+ cur_preproc, request, num_draft_tokens)
230
+ seq_len: int = draft_response.sequence_length[0][0]
231
+ # [1, beamWidth, outputLength] -> [outputLen]
232
+ draft_output_ids = draft_response.output_ids[0][0]
233
+ # [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded]
234
+                 if request.use_draft_logits is not None and request.use_draft_logits[
+                         0]:
+                     if draft_response.generation_logits is not None:
+                         draft_logits = draft_response.generation_logits[0][0]
+
+                 input_draft_tokens = draft_output_ids[len(input_ids):seq_len]
+                 draft_request = DraftRequest(
+                     draft_input_ids=np.expand_dims(input_draft_tokens, 0))
+                 if request.use_draft_logits is not None and request.use_draft_logits[
+                         0]:
+                     draft_request.draft_logits = np.expand_dims(
+                         draft_logits[-len(input_draft_tokens):], 0)
+             else:
+                 draft_request = DraftRequest()
+             target_response = self._generate_non_streaming(
+                 cur_preproc, request, draft_request)
+             last_input_ids = input_ids
+             input_ids = target_response.output_ids[0][0]
+             cur_preproc = PreprocResponse.with_new_inputs(
+                 cur_preproc, np.expand_dims(input_ids, 0),
+                 np.array([[len(input_ids)]], dtype=np.int32))
+
+             # Evaluate criteria to stop the generation loop.
+             # If we've hit or exceeded the max output length, we should stop.
+             length_stop = (len(input_ids) >=
+                            len(prompt_input_ids) + output_len)
+             if length_stop:
+                 break
+             # If draft and target produced identical outputs, we should stop.
+             # Normally the target returns one more token; if the lengths match,
+             # they should differ at the last token.
+             target_draft_equal = draft_output_ids is not None and np.array_equal(
+                 draft_output_ids, input_ids)
+             if target_draft_equal:
+                 break
+             # If the tokens no longer change, we should stop: early stopping was hit.
+             last_current_equal = np.array_equal(last_input_ids, input_ids)
+             if last_current_equal:
+                 break
+             # Check whether any stop words were encountered.
+             hit_stop_words = self.encountered_stop_words(
+                 input_ids, preproc.stop_words_list[0])
+             if hit_stop_words:
+                 break
+
+         yield target_response
+
+     def _draft_generate_non_streaming(
+             self, preproc: PreprocResponse, request: Request,
+             num_draft_tokens: int) -> GenerationResponse:
+         raise NotImplementedError()
+
+     def _generate(
+         self,
+         preproc: PreprocResponse,
+         request: Request,
+         draft_request: Optional[DraftRequest] = None
+     ) -> Generator[GenerationResponse, None, None]:
+         raise NotImplementedError()
+
+     def _generate_non_streaming(
+         self,
+         preproc: PreprocResponse,
+         request: Request,
+         draft_request: Optional[DraftRequest] = None
+     ) -> GenerationResponse:
+         raise NotImplementedError()
+
+     def postprocess(self, gen_response: GenerationResponse) -> Response:
+         if self._accumulate and self._streaming:
+             new_tokens: np.ndarray = gen_response.output_ids
+             if new_tokens.ndim != 3:
+                 raise Exception("Expected output_ids tensor to have 3 dims.")
+             if new_tokens.shape[0] != 1:
+                 raise Exception("Expected batch size of 1")
+             if new_tokens.shape[1] != 1:
+                 raise Exception(
+                     "Accumulation of tokens is only implemented for beam width = 1"
+                 )
+
+             self._accumulated_tokens = new_tokens if (
+                 self._accumulated_tokens is None) else np.concatenate(
+                     (self._accumulated_tokens, new_tokens), axis=2)
+             sequence_lengths = np.array([[self._accumulated_tokens.shape[2]]],
+                                         dtype=np.int32)
+             return self._postprocess(self._accumulated_tokens,
+                                      sequence_lengths, gen_response)
+         else:
+             return self._postprocess(gen_response.output_ids, None,
+                                      gen_response)
+
+     def _postprocess(self, tokens: np.ndarray,
+                      sequence_lengths: Optional[np.ndarray],
+                      gen_response: GenerationResponse) -> Response:
+         raise NotImplementedError()
+
+     def preprocess(self, request: Request) -> PreprocResponse:
+         raise NotImplementedError()
+
+     def reset_decoder(self):
+         self._accumulated_tokens = None
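The accumulate-and-stream branch above assumes `output_ids` arrives as a `[batch, beam, tokens]` tensor with batch and beam both equal to 1, so each streamed chunk can simply be appended along the token axis. A minimal standalone sketch of that bookkeeping (plain NumPy, outside Triton; the chunk values are invented for illustration):

import numpy as np

# Streamed chunks of shape [1, 1, new_tokens]; the token ids are arbitrary examples.
chunks = [np.array([[[11, 12]]]), np.array([[[13]]]), np.array([[[14, 15]]])]

accumulated = None
for new_tokens in chunks:
    # Same rule as Decoder.postprocess: keep the first chunk as-is,
    # then grow the buffer along axis 2 (the token axis).
    accumulated = new_tokens if accumulated is None else np.concatenate(
        (accumulated, new_tokens), axis=2)
sequence_lengths = np.array([[accumulated.shape[2]]], dtype=np.int32)

print(accumulated.tolist())       # [[[11, 12, 13, 14, 15]]]
print(sequence_lengths.tolist())  # [[5]]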
tensorrt_llm_bls/1/lib/triton_decoder.py ADDED
@@ -0,0 +1,440 @@
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions
+ # are met:
+ #  * Redistributions of source code must retain the above copyright
+ #    notice, this list of conditions and the following disclaimer.
+ #  * Redistributions in binary form must reproduce the above copyright
+ #    notice, this list of conditions and the following disclaimer in the
+ #    documentation and/or other materials provided with the distribution.
+ #  * Neither the name of NVIDIA CORPORATION nor the names of its
+ #    contributors may be used to endorse or promote products derived
+ #    from this software without specific prior written permission.
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ from collections.abc import Callable
+ from typing import Dict, Optional
+
+ import numpy as np
+ import triton_python_backend_utils as pb_utils
+ from lib.decode import *
+ from typing_extensions import override
+
+
+ class TritonDecoder(Decoder):
+
+     def __init__(self,
+                  streaming=False,
+                  accumulate=False,
+                  preproc_model_name="preprocessing",
+                  postproc_model_name="postprocessing",
+                  llm_model_name="tensorrt_llm",
+                  draft_llm_model_name: Optional[str] = None):
+         super().__init__(streaming=streaming, accumulate=accumulate)
+         self.preproc_model_name = preproc_model_name
+         self.postproc_model_name = postproc_model_name
+         self.llm_model_name = llm_model_name
+         self.draft_llm_model_name = draft_llm_model_name
+
+         self._preproc_outputs = [
+             "INPUT_ID",
+             "DECODER_INPUT_ID",
+             "REQUEST_INPUT_LEN",
+             "REQUEST_DECODER_INPUT_LEN",
+             "BAD_WORDS_IDS",
+             "STOP_WORDS_IDS",
+             "EMBEDDING_BIAS",
+             "OUT_PAD_ID",
+             "OUT_END_ID",
+         ]
+
+         self._llm_outputs = [
+             "output_ids",
+             "sequence_length",
+             "cum_log_probs",
+             "output_log_probs",
+             "context_logits",
+             "generation_logits",
+         ]
+
+         self._postproc_outputs = [
+             "OUTPUT",
+         ]
+
+         self.input_names = [
+             "text_input",
+             "decoder_text_input",
+             "max_tokens",
+             "bad_words",
+             "stop_words",
+             "end_id",
+             "pad_id",
+             "top_k",
+             "top_p",
+             "temperature",
+             "length_penalty",
+             "repetition_penalty",
+             "min_length",
+             "presence_penalty",
+             "frequency_penalty",
+             "random_seed",
+             "return_log_probs",
+             "return_context_logits",
+             "return_generation_logits",
+             "beam_width",
+             "stream",
+             "prompt_embedding_table",
+             "prompt_vocab_size",
+             "embedding_bias_words",
+             "embedding_bias_weights",
+             "num_draft_tokens",
+             "use_draft_logits",
+         ]
+
+         self.__undo_reshape_whitelist = {
+             "max_tokens",
+             "end_id",
+             "pad_id",
+             "top_k",
+             "top_p",
+             "temperature",
+             "length_penalty",
+             "repetition_penalty",
+             "min_length",
+             "presence_penalty",
+             "frequency_penalty",
+             "random_seed",
+             "return_log_probs",
+             "return_context_logits",
+             "return_generation_logits",
+             "beam_width",
+             "stream",
+             "prompt_vocab_size",
+             "num_draft_tokens",
+             "use_draft_logits",
+         }
+
+     def _exec_triton_request(self, request):
+         responses = request.exec(decoupled=True)
+         for r in responses:
+             if r.has_error():
+                 raise pb_utils.TritonModelException(r.error().message())
+             yield r
+
+     def _exec_triton_request_single(self, request):
+         responses = request.exec(decoupled=False)
+         if responses.has_error():
+             raise pb_utils.TritonModelException(responses.error().message())
+         return responses
+
+     def create_triton_response(self, response: Response):
+         name_map = {
+             "text_output": "text_output",
+             "cum_log_probs": "cum_log_probs",
+             "output_log_probs": "output_log_probs",
+             "context_logits": "context_logits",
+             "generation_logits": "generation_logits"
+         }
+         tensors = self.create_triton_tensors(response, name_map)
+         return pb_utils.InferenceResponse(output_tensors=tensors)
+
+     def convert_triton_request(self, triton_request) -> Request:
+         request = Request()
+         for triton_name in self.input_names:
+             tensor = pb_utils.get_input_tensor_by_name(triton_request,
+                                                        triton_name)
+             target_name = triton_name
+             if tensor is None:
+                 continue
+             if not hasattr(request, target_name):
+                 raise AttributeError(
+                     f"Request has no attribute '{target_name}'")
+             setattr(request, target_name, tensor.as_numpy())
+         return request
+
+     def convert_triton_response(self,
+                                 triton_response,
+                                 response_factory: Callable,
+                                 name_map=None):
+         response = response_factory()
+         for tensor in triton_response.output_tensors():
+             if tensor is None:
+                 continue
+             triton_name = tensor.name()
+             value = tensor.as_numpy()
+             target_name = triton_name
+             if name_map and triton_name in name_map:
+                 target_name = name_map[triton_name]
+             if name_map and triton_name not in name_map:
+                 continue
+             if target_name is None:
+                 # Explicitly ignore this Triton output tensor.
+                 continue
+             if not hasattr(response, target_name):
+                 raise AttributeError(
+                     f"Response object has no attribute '{target_name}'")
+             setattr(response, target_name, value)
+         return response
+
+     def __undo_reshape(self, x, name):
+         if name in self.__undo_reshape_whitelist and len(x.shape) == 1:
+             # Handle reshapes: restore the batch dimension dropped by `reshape`.
+             return np.expand_dims(x, 0)
+         else:
+             return x
+
+     def create_triton_tensors(self, obj, name_map: dict):
+         tensors = []
+         for name, triton_name in name_map.items():
+             if triton_name is None:
+                 continue
+             value = getattr(obj, name)
+             if value is None:
+                 continue
+             t = pb_utils.Tensor(triton_name, self.__undo_reshape(value, name))
+             tensors.append(t)
+         return tensors
+
+     @override
+     def preprocess(self, request: Request) -> PreprocResponse:
+         input_tensors = self._get_preproc_tensors(request)
+         triton_req = pb_utils.InferenceRequest(
+             model_name=self.preproc_model_name,
+             inputs=input_tensors,
+             requested_output_names=self._preproc_outputs)
+         triton_output = self._exec_triton_request_single(triton_req)
+         return self._get_preproc_response(triton_output)
+
+     def _get_preproc_tensors(self, request: Request):
+         name_map = {
+             "text_input": "QUERY",
+             "decoder_text_input": "DECODER_QUERY",
+             "max_tokens": "REQUEST_OUTPUT_LEN",
+             "bad_words": "BAD_WORDS_DICT",
+             "stop_words": "STOP_WORDS_DICT",
+             "embedding_bias_words": "EMBEDDING_BIAS_WORDS",
+             "embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS",
+             "pad_id": "PAD_ID",
+             "end_id": "END_ID",
+         }
+         return self.create_triton_tensors(request, name_map)
+
+     def _get_preproc_response(self, triton_output):
+         name_map = {
+             "INPUT_ID": "input_ids",
+             "DECODER_INPUT_ID": "decoder_input_ids",
+             "REQUEST_INPUT_LEN": "input_lengths",
+             "REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths",
+             "BAD_WORDS_IDS": "bad_words_list",
+             "STOP_WORDS_IDS": "stop_words_list",
+             "EMBEDDING_BIAS": "embedding_bias",
+             "OUT_PAD_ID": "pad_id",
+             "OUT_END_ID": "end_id",
+         }
+         return self.convert_triton_response(triton_output, PreprocResponse,
+                                             name_map)
+
+     @override
+     def _draft_generate_non_streaming(
+             self, preproc: PreprocResponse, request: Request,
+             num_draft_tokens: int) -> GenerationResponse:
+         input_tensors = self._get_llm_tensors(preproc, request,
+                                               num_draft_tokens, None, True)
+         triton_req = pb_utils.InferenceRequest(
+             model_name=self.draft_llm_model_name,
+             inputs=input_tensors,
+             requested_output_names=self._llm_outputs)
+         triton_response = self._exec_triton_request_single(triton_req)
+         llm_response = self._get_llm_response(triton_response)
+         return llm_response
+
+     @override
+     def _generate(
+         self,
+         preproc: PreprocResponse,
+         request: Request,
+         draft_request: Optional[DraftRequest] = None
+     ) -> Generator[GenerationResponse, None, None]:
+         input_tensors = self._get_llm_tensors(preproc, request, None,
+                                               draft_request)
+         triton_req = pb_utils.InferenceRequest(
+             model_name=self.llm_model_name,
+             inputs=input_tensors,
+             requested_output_names=self._llm_outputs)
+         for r in self._exec_triton_request(triton_req):
+             yield self._get_llm_response(r)
+
+     @override
+     def _generate_non_streaming(
+         self,
+         preproc: PreprocResponse,
+         request: Request,
+         draft_request: Optional[DraftRequest] = None
+     ) -> GenerationResponse:
+         input_tensors = self._get_llm_tensors(preproc, request, None,
+                                               draft_request)
+         triton_req = pb_utils.InferenceRequest(
+             model_name=self.llm_model_name,
+             inputs=input_tensors,
+             requested_output_names=self._llm_outputs)
+         r = self._exec_triton_request_single(triton_req)
+         return self._get_llm_response(r)
+
+     def _get_llm_tensors(self,
+                          preproc: PreprocResponse,
+                          request: Request,
+                          num_output_tokens: Optional[int] = None,
+                          draft_request: Optional[DraftRequest] = None,
+                          is_draft_model_request: bool = False):
+         tensors = []
+         tensors.extend(self._get_tensors_from_preproc(preproc))
+         tensors.extend(
+             self._get_llm_tensors_from_request(request, num_output_tokens,
+                                                draft_request,
+                                                is_draft_model_request))
+         return tensors
+
+     def _get_tensors_from_preproc(self, preproc: PreprocResponse):
+         name_map = {
+             "input_ids": "input_ids",
+             "decoder_input_ids": "decoder_input_ids",
+             "input_lengths": "input_lengths",
+             "bad_words_list": "bad_words_list",
+             "stop_words_list": "stop_words_list",
+             "embedding_bias": "embedding_bias",
+             "pad_id": "pad_id",
+             "end_id": "end_id",
+         }
+         return self.create_triton_tensors(preproc, name_map)
+
+     def _get_llm_tensors_from_request(
+             self,
+             request: Request,
+             num_output_tokens: Optional[int] = None,
+             draft_request: Optional[DraftRequest] = None,
+             is_draft_model_request: bool = False):
+         name_map: Dict[str, Optional[str]] = {
+             "beam_width": "beam_width",
+             "top_k": "runtime_top_k",
+             "top_p": "runtime_top_p",
+             "length_penalty": "len_penalty",
+             "repetition_penalty": "repetition_penalty",
+             "min_length": "min_length",
+             "presence_penalty": "presence_penalty",
+             "frequency_penalty": "frequency_penalty",
+             "random_seed": "random_seed",
+             "return_log_probs": "return_log_probs",
+             "stream": "streaming",
+             "prompt_embedding_table": "prompt_embedding_table",
+             "prompt_vocab_size": "prompt_vocab_size",
+         }
+         tensors = self.create_triton_tensors(request, name_map)
+
+         out_len = request.max_tokens[0][0] if request.max_tokens else None
+         if num_output_tokens is not None:
+             out_len = num_output_tokens
+         elif draft_request:
+             if draft_request.draft_input_ids is not None:
+                 out_len = len(draft_request.draft_input_ids[0]) + 1
+             else:
+                 out_len = 1
+
+         if out_len is None:
+             raise Exception("Could not determine request_output_len")
+         else:
+             tensors.append(
+                 pb_utils.Tensor("request_output_len",
+                                 np.array([[out_len]], dtype=np.int32)))
+
+         if draft_request:
+             if draft_request.draft_input_ids is not None:
+                 tensors.append(
+                     pb_utils.Tensor("draft_input_ids",
+                                     draft_request.draft_input_ids))
+                 if draft_request.draft_logits is not None and request.use_draft_logits is not None and request.use_draft_logits[
+                         0]:
+                     tensors.append(
+                         pb_utils.Tensor("draft_logits",
+                                         draft_request.draft_logits))
+
+         return_context_logits = False
+         return_generation_logits = False
+         if draft_request is None:
+             if is_draft_model_request:
+                 return_generation_logits = request.use_draft_logits[
+                     0] if request.use_draft_logits is not None else False
+             else:
+                 return_context_logits = request.return_context_logits[
+                     0] if request.return_context_logits is not None else False
+                 return_generation_logits = request.return_generation_logits[
+                     0] if request.return_generation_logits is not None else False
+
+         tensors.append(
+             pb_utils.Tensor("return_context_logits",
+                             np.array([[return_context_logits]])))
+         tensors.append(
+             pb_utils.Tensor("return_generation_logits",
+                             np.array([[return_generation_logits]])))
+         return tensors
+
+     def _get_llm_response(self, triton_output):
+         name_map = {
+             "output_ids": "output_ids",
+             "sequence_length": "sequence_length",
+             "cum_log_probs": "cum_log_probs",
+             "output_log_probs": "output_log_probs",
+             "context_logits": "context_logits",
+             "generation_logits": "generation_logits",
+         }
+         return self.convert_triton_response(triton_output, GenerationResponse,
+                                             name_map)
+
+     def _postprocess(self, tokens: np.ndarray,
+                      sequence_lengths: Optional[np.ndarray],
+                      gen_response: GenerationResponse) -> Response:
+         input_tensors = self._get_postproc_tensors(tokens, sequence_lengths,
+                                                    gen_response)
+         triton_req = pb_utils.InferenceRequest(
+             model_name=self.postproc_model_name,
+             inputs=input_tensors,
+             requested_output_names=self._postproc_outputs)
+         r = self._exec_triton_request_single(triton_req)
+         response = self._get_response(r, gen_response)
+         return response
+
+     def _get_postproc_tensors(self, tokens: np.ndarray,
+                               sequence_lengths: Optional[np.ndarray],
+                               gen_response: GenerationResponse):
+         tensors = [
+             pb_utils.Tensor("TOKENS_BATCH", tokens),
+             pb_utils.Tensor(
+                 "SEQUENCE_LENGTH", sequence_lengths
+                 if sequence_lengths is not None else gen_response.sequence_length)
+         ]
+         return tensors
+
+     def _get_response(self, triton_output, gen_res: GenerationResponse):
+         tensors = triton_output.output_tensors()
+         t_map = {}
+         for named_t in tensors:
+             name = named_t.name()
+             t = named_t.as_numpy()
+             t_map[name] = t
+         response = Response(text_output=t_map["OUTPUT"],
+                             cum_log_probs=gen_res.cum_log_probs,
+                             output_log_probs=gen_res.output_log_probs,
+                             context_logits=gen_res.context_logits,
+                             generation_logits=gen_res.generation_logits)
+         return response
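One detail worth calling out is how `request_output_len` is resolved in `_get_llm_tensors_from_request`: an explicit draft-token count (draft-model pass) wins, then a pending draft request forces "accepted drafts + 1" (or a single token when the draft request is empty), and only otherwise does the request's `max_tokens` apply. A small self-contained sketch of that precedence; the function name and scalar arguments are illustrative, not part of the file above:

from typing import Optional


def resolve_request_output_len(max_tokens: Optional[int],
                               num_output_tokens: Optional[int],
                               num_draft_tokens: Optional[int],
                               has_draft_request: bool) -> int:
    out_len = max_tokens
    if num_output_tokens is not None:
        # Draft-model pass: generate exactly the requested number of draft tokens.
        out_len = num_output_tokens
    elif has_draft_request:
        # Target pass: verify the drafts and emit at most one extra token.
        out_len = num_draft_tokens + 1 if num_draft_tokens is not None else 1
    if out_len is None:
        raise ValueError("Could not determine request_output_len")
    return out_len


assert resolve_request_output_len(64, None, None, False) == 64  # plain request
assert resolve_request_output_len(64, 5, None, False) == 5      # draft-model step
assert resolve_request_output_len(64, None, 4, True) == 5       # target verification step
assert resolve_request_output_len(64, None, None, True) == 1    # empty draft request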
tensorrt_llm_bls/1/model.py ADDED
@@ -0,0 +1,131 @@
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions
+ # are met:
+ #  * Redistributions of source code must retain the above copyright
+ #    notice, this list of conditions and the following disclaimer.
+ #  * Redistributions in binary form must reproduce the above copyright
+ #    notice, this list of conditions and the following disclaimer in the
+ #    documentation and/or other materials provided with the distribution.
+ #  * Neither the name of NVIDIA CORPORATION nor the names of its
+ #    contributors may be used to endorse or promote products derived
+ #    from this software without specific prior written permission.
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ import json
+ import traceback
+
+ import triton_python_backend_utils as pb_utils
+ from lib.triton_decoder import TritonDecoder
+
+
+ class TritonPythonModel:
+
+     def initialize(self, args):
+
+         # Parse model configs
+         model_config = json.loads(args['model_config'])
+
+         params = model_config['parameters']
+
+         accumulate_tokens_str = ''
+         if 'accumulate_tokens' in params:
+             accumulate_tokens_str = params['accumulate_tokens']['string_value']
+
+         self.accumulate_tokens = accumulate_tokens_str.lower() in [
+             'true', 'yes', '1', 't'
+         ]
+
+         self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
+             model_config)
+
+         self.logger = pb_utils.Logger
+
+         self.llm_model_name = "tensorrt_llm"
+         if "tensorrt_llm_model_name" in params:
+             self.llm_model_name = params["tensorrt_llm_model_name"][
+                 "string_value"]
+         self.draft_llm_model_name = None
+         if "tensorrt_llm_draft_model_name" in params:
+             self.draft_llm_model_name = params[
+                 "tensorrt_llm_draft_model_name"]["string_value"]
+
+         self.decoder = TritonDecoder(
+             streaming=self.decoupled,
+             accumulate=self.accumulate_tokens,
+             preproc_model_name="preprocessing",
+             postproc_model_name="postprocessing",
+             llm_model_name=self.llm_model_name,
+             draft_llm_model_name=self.draft_llm_model_name)
+
+     def execute(self, requests):
+
+         responses = []
+
+         for request in requests:
+             if self.decoupled:
+                 response_sender = request.get_response_sender()
+             try:
+
+                 req = self.decoder.convert_triton_request(request)
+                 req.validate()
+                 speculative_decode = (req.num_draft_tokens is not None
+                                       and req.num_draft_tokens[0][0] > 0)
+                 if speculative_decode and (self.draft_llm_model_name is None
+                                            or self.draft_llm_model_name == ""):
+                     raise Exception(
+                         "cannot perform speculative decoding without draft model"
+                     )
+                 res_gen = self.decoder.decode(
+                     req, speculative_decoding=speculative_decode)
+
+                 for res in res_gen:
+                     triton_response = self.decoder.create_triton_response(res)
+                     if self.decoupled:
+                         response_sender.send(triton_response)
+                     else:
+                         responses.append(triton_response)
+
+                 if self.decoupled:
+                     response_sender.send(
+                         flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
+
+             except Exception:
+                 self.logger.log_error(traceback.format_exc())
+                 # If encountering an error, send a response with err msg
+                 error_response = pb_utils.InferenceResponse(
+                     output_tensors=[],
+                     error=pb_utils.TritonError(traceback.format_exc()))
+
+                 if self.decoupled:
+                     response_sender.send(error_response)
+                     response_sender.send(
+                         flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
+                 else:
+                     responses.append(error_response)
+
+             self.decoder.reset_decoder()
+         if self.decoupled:
+             return None
+         else:
+             assert len(responses) == len(requests)
+             return responses
+
+     def finalize(self):
+         """`finalize` is called only once when the model is being unloaded.
+         Implementing `finalize` function is optional. This function allows
+         the model to perform any necessary clean ups before exit.
+         """
+         print('Cleaning up...')
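Because the BLS model is deployed with a decoupled transaction policy (see the config that follows), a client has to use Triton's gRPC streaming API rather than a single HTTP infer call. A minimal client-side sketch, not part of this repository: it assumes a Triton server listening on localhost:8001 with this model repository loaded, and the prompt and token budget are arbitrary examples.

import numpy as np
import tritonclient.grpc as grpcclient


def on_response(result, error):
    # Each streamed response carries one "text_output" chunk (or an error).
    if error is not None:
        print("error:", error)
    else:
        print(result.as_numpy("text_output"))


client = grpcclient.InferenceServerClient("localhost:8001")

text = np.array([["What is speculative decoding?"]], dtype=object)
max_tokens = np.array([[64]], dtype=np.int32)
stream_flag = np.array([[True]])

inputs = [
    grpcclient.InferInput("text_input", list(text.shape), "BYTES"),
    grpcclient.InferInput("max_tokens", list(max_tokens.shape), "INT32"),
    grpcclient.InferInput("stream", list(stream_flag.shape), "BOOL"),
]
inputs[0].set_data_from_numpy(text)
inputs[1].set_data_from_numpy(max_tokens)
inputs[2].set_data_from_numpy(stream_flag)

client.start_stream(callback=on_response)
client.async_stream_infer(model_name="tensorrt_llm_bls", inputs=inputs)
client.stop_stream()  # close the stream once the responses have been consumed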
tensorrt_llm_bls/config.pbtxt ADDED
@@ -0,0 +1,253 @@
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions
+ # are met:
+ #  * Redistributions of source code must retain the above copyright
+ #    notice, this list of conditions and the following disclaimer.
+ #  * Redistributions in binary form must reproduce the above copyright
+ #    notice, this list of conditions and the following disclaimer in the
+ #    documentation and/or other materials provided with the distribution.
+ #  * Neither the name of NVIDIA CORPORATION nor the names of its
+ #    contributors may be used to endorse or promote products derived
+ #    from this software without specific prior written permission.
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ name: "tensorrt_llm_bls"
+ backend: "python"
+ max_batch_size: 16
+
+ model_transaction_policy {
+   decoupled: true
+ }
+
+ input [
+   {
+     name: "text_input"
+     data_type: TYPE_STRING
+     dims: [ -1 ]
+   },
+   {
+     name: "decoder_text_input"
+     data_type: TYPE_STRING
+     dims: [ -1 ]
+     optional: true
+   },
+   {
+     name: "max_tokens"
+     data_type: TYPE_INT32
+     dims: [ -1 ]
+   },
+   {
+     name: "bad_words"
+     data_type: TYPE_STRING
+     dims: [ -1 ]
+     optional: true
+   },
+   {
+     name: "stop_words"
+     data_type: TYPE_STRING
+     dims: [ -1 ]
+     optional: true
+   },
+   {
+     name: "end_id"
+     data_type: TYPE_INT32
+     dims: [ 1 ]
+     optional: true
+   },
+   {
+     name: "pad_id"
+     data_type: TYPE_INT32
+     dims: [ 1 ]
+     optional: true
+   },
+   {
+     name: "top_k"
+     data_type: TYPE_INT32
+     dims: [ 1 ]
+     optional: true
+   },
+   {
+     name: "top_p"
+     data_type: TYPE_FP32
+     dims: [ 1 ]
+     optional: true
+   },
+   {
+     name: "temperature"
+     data_type: TYPE_FP32
+     dims: [ 1 ]
+     optional: true
+   },
+   {
+     name: "length_penalty"
+     data_type: TYPE_FP32
+     dims: [ 1 ]
+     optional: true
+   },
+   {
+     name: "repetition_penalty"
+     data_type: TYPE_FP32
+     dims: [ 1 ]
+     optional: true
+   },
+   {
+     name: "min_length"
+     data_type: TYPE_INT32
+     dims: [ 1 ]
+     optional: true
+   },
+   {
+     name: "presence_penalty"
+     data_type: TYPE_FP32
+     dims: [ 1 ]
+     optional: true
+   },
+   {
+     name: "frequency_penalty"
+     data_type: TYPE_FP32
+     dims: [ 1 ]
+     optional: true
+   },
+   {
+     name: "random_seed"
+     data_type: TYPE_UINT64
+     dims: [ 1 ]
+     optional: true
+   },
+   {
+     name: "return_log_probs"
+     data_type: TYPE_BOOL
+     dims: [ 1 ]
+     reshape: { shape: [ ] }
+     optional: true
+   },
+   {
+     name: "return_context_logits"
+     data_type: TYPE_BOOL
+     dims: [ 1 ]
+     reshape: { shape: [ ] }
+     optional: true
+   },
+   {
+     name: "return_generation_logits"
+     data_type: TYPE_BOOL
+     dims: [ 1 ]
+     reshape: { shape: [ ] }
+     optional: true
+   },
+   {
+     name: "beam_width"
+     data_type: TYPE_INT32
+     dims: [ 1 ]
+     optional: true
+   },
+   {
+     name: "stream"
+     data_type: TYPE_BOOL
+     dims: [ 1 ]
+     optional: true
+   },
+   {
+     name: "prompt_embedding_table"
+     data_type: TYPE_FP16
+     dims: [ -1, -1 ]
+     optional: true
+   },
+   {
+     name: "prompt_vocab_size"
+     data_type: TYPE_INT32
+     dims: [ 1 ]
+     optional: true
+   },
+   {
+     name: "embedding_bias_words"
+     data_type: TYPE_STRING
+     dims: [ -1 ]
+     optional: true
+   },
+   {
+     name: "embedding_bias_weights"
+     data_type: TYPE_FP32
+     dims: [ -1 ]
+     optional: true
+   },
+   {
+     name: "num_draft_tokens",
+     data_type: TYPE_INT32,
+     dims: [ 1 ]
+     optional: true
+   },
+   {
+     name: "use_draft_logits",
+     data_type: TYPE_BOOL,
+     dims: [ 1 ]
+     reshape: { shape: [ ] }
+     optional: true
+   }
+ ]
+ output [
+   {
+     name: "text_output"
+     data_type: TYPE_STRING
+     dims: [ -1 ]
+   },
+   {
+     name: "cum_log_probs"
+     data_type: TYPE_FP32
+     dims: [ -1 ]
+   },
+   {
+     name: "output_log_probs"
+     data_type: TYPE_FP32
+     dims: [ -1, -1 ]
+   },
+   {
+     name: "context_logits"
+     data_type: TYPE_FP32
+     dims: [ -1, -1 ]
+   },
+   {
+     name: "generation_logits"
+     data_type: TYPE_FP32
+     dims: [ -1, -1, -1 ]
+   }
+ ]
+
+ parameters: {
+   key: "accumulate_tokens"
+   value: {
+     string_value: "${accumulate_tokens}"
+   }
+ }
+ parameters: {
+   key: "tensorrt_llm_model_name"
+   value: {
+     string_value: "${tensorrt_llm_model_name}"
+   }
+ }
+ parameters: {
+   key: "tensorrt_llm_draft_model_name"
+   value: {
+     string_value: "${tensorrt_llm_draft_model_name}"
+   }
+ }
+
+ instance_group [
+   {
+     count: 1
+     kind: KIND_CPU
+   }
+ ]
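The three `${...}` entries in the parameters section are template placeholders that have to be substituted before Triton loads this repository (the upstream tensorrtllm_backend repository ships a tools/fill_template.py helper for this). A rough, equivalent sketch using Python's string.Template; the chosen values and the relative path are examples only, not recommended defaults:

from pathlib import Path
from string import Template

config_path = Path("tensorrt_llm_bls/config.pbtxt")  # assumed path inside the model repository
values = {
    "accumulate_tokens": "false",
    "tensorrt_llm_model_name": "tensorrt_llm",
    "tensorrt_llm_draft_model_name": "",
}
# safe_substitute leaves any unknown ${...} placeholders untouched.
config_path.write_text(Template(config_path.read_text()).safe_substitute(values))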