Devops-hestabit
committed on
Upload folder using huggingface_hub
- .gitattributes +2 -0
- ensemble/1/.tmp +0 -0
- ensemble/config.pbtxt +470 -0
- postprocessing/1/__pycache__/model.cpython-310.pyc +0 -0
- postprocessing/1/model.py +231 -0
- postprocessing/1/special_tokens_map.json +5 -0
- postprocessing/1/tokenizer.json +0 -0
- postprocessing/1/tokenizer.model +3 -0
- postprocessing/1/tokenizer_config.json +43 -0
- postprocessing/config.pbtxt +113 -0
- preprocessing/1/__pycache__/model.cpython-310.pyc +0 -0
- preprocessing/1/model.py +373 -0
- preprocessing/1/special_tokens_map.json +5 -0
- preprocessing/1/tokenizer.json +0 -0
- preprocessing/1/tokenizer.model +3 -0
- preprocessing/1/tokenizer_config.json +43 -0
- preprocessing/config.pbtxt +156 -0
- tensorrt_llm/1/.gitkeep +0 -0
- tensorrt_llm/1/config.json +148 -0
- tensorrt_llm/1/model.py +782 -0
- tensorrt_llm/1/rank0.engine +3 -0
- tensorrt_llm/config.pbtxt +537 -0
- tensorrt_llm_bls/1/__pycache__/model.cpython-310.pyc +0 -0
- tensorrt_llm_bls/1/lib/__pycache__/decode.cpython-310.pyc +0 -0
- tensorrt_llm_bls/1/lib/__pycache__/triton_decoder.cpython-310.pyc +0 -0
- tensorrt_llm_bls/1/lib/decode.py +333 -0
- tensorrt_llm_bls/1/lib/triton_decoder.py +440 -0
- tensorrt_llm_bls/1/model.py +131 -0
- tensorrt_llm_bls/config.pbtxt +253 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tensorrt_llm/1/rank0.engine filter=lfs diff=lfs merge=lfs -text
+tensorrt_llm/1/rank1.engine filter=lfs diff=lfs merge=lfs -text
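The two added rules mean the TensorRT engine files are stored as Git LFS objects, so a plain clone only contains small pointer stubs until `git lfs pull` runs. Below is a minimal sketch (not part of the commit) for checking whether a local checkout materialized the real binary; the path is an example and should be adjusted to your clone.

# Sketch: detect whether an LFS-tracked file is still a pointer stub.
# A pointer stub is a tiny text file starting with the LFS spec header,
# while the real rank0.engine is a large binary.
from pathlib import Path

def is_lfs_pointer(path: str) -> bool:
    p = Path(path)
    if p.stat().st_size > 1024:  # pointer stubs are only ~130 bytes
        return False
    return p.read_bytes().startswith(b"version https://git-lfs.github.com/spec/v1")

print(is_lfs_pointer("tensorrt_llm/1/rank0.engine"))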
ensemble/1/.tmp
ADDED
File without changes
ensemble/config.pbtxt
ADDED
@@ -0,0 +1,470 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

name: "ensemble"
platform: "ensemble"
max_batch_size: 16
input [
  {
    name: "text_input"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "decoder_text_input"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "max_tokens"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "bad_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "stop_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "end_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "pad_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_k"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "length_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "min_length"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "presence_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "frequency_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    optional: true
  },
  {
    name: "return_log_probs"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "return_context_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "return_generation_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "stream"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "prompt_embedding_table"
    data_type: TYPE_FP16
    dims: [ -1, -1 ]
    optional: true
  },
  {
    name: "prompt_vocab_size"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "embedding_bias_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "embedding_bias_weights"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  }
]
output [
  {
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "cum_log_probs"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "output_log_probs"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "context_logits"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "generation_logits"
    data_type: TYPE_FP32
    dims: [ -1, -1, -1 ]
  }
]
ensemble_scheduling {
  step [
    {
      model_name: "preprocessing"
      model_version: -1
      input_map {
        key: "QUERY"
        value: "text_input"
      }
      input_map {
        key: "DECODER_QUERY"
        value: "decoder_text_input"
      }
      input_map {
        key: "REQUEST_OUTPUT_LEN"
        value: "max_tokens"
      }
      input_map {
        key: "BAD_WORDS_DICT"
        value: "bad_words"
      }
      input_map {
        key: "STOP_WORDS_DICT"
        value: "stop_words"
      }
      input_map {
        key: "EMBEDDING_BIAS_WORDS"
        value: "embedding_bias_words"
      }
      input_map {
        key: "EMBEDDING_BIAS_WEIGHTS"
        value: "embedding_bias_weights"
      }
      input_map {
        key: "END_ID"
        value: "end_id"
      }
      input_map {
        key: "PAD_ID"
        value: "pad_id"
      }
      output_map {
        key: "REQUEST_INPUT_LEN"
        value: "_REQUEST_INPUT_LEN"
      }
      output_map {
        key: "INPUT_ID"
        value: "_INPUT_ID"
      }
      output_map {
        key: "REQUEST_DECODER_INPUT_LEN"
        value: "_REQUEST_DECODER_INPUT_LEN"
      }
      output_map {
        key: "DECODER_INPUT_ID"
        value: "_DECODER_INPUT_ID"
      }
      output_map {
        key: "REQUEST_OUTPUT_LEN"
        value: "_REQUEST_OUTPUT_LEN"
      }
      output_map {
        key: "STOP_WORDS_IDS"
        value: "_STOP_WORDS_IDS"
      }
      output_map {
        key: "BAD_WORDS_IDS"
        value: "_BAD_WORDS_IDS"
      }
      output_map {
        key: "EMBEDDING_BIAS"
        value: "_EMBEDDING_BIAS"
      }
      output_map {
        key: "OUT_END_ID"
        value: "_PREPROCESSOR_END_ID"
      }
      output_map {
        key: "OUT_PAD_ID"
        value: "_PREPROCESSOR_PAD_ID"
      }
    },
    {
      model_name: "tensorrt_llm"
      model_version: -1
      input_map {
        key: "input_ids"
        value: "_INPUT_ID"
      }
      input_map {
        key: "decoder_input_ids"
        value: "_DECODER_INPUT_ID"
      }
      input_map {
        key: "input_lengths"
        value: "_REQUEST_INPUT_LEN"
      }
      input_map {
        key: "decoder_input_lengths"
        value: "_REQUEST_DECODER_INPUT_LEN"
      }
      input_map {
        key: "request_output_len"
        value: "_REQUEST_OUTPUT_LEN"
      }
      input_map {
        key: "end_id"
        value: "_PREPROCESSOR_END_ID"
      }
      input_map {
        key: "pad_id"
        value: "_PREPROCESSOR_PAD_ID"
      }
      input_map {
        key: "embedding_bias"
        value: "_EMBEDDING_BIAS"
      }
      input_map {
        key: "runtime_top_k"
        value: "top_k"
      }
      input_map {
        key: "runtime_top_p"
        value: "top_p"
      }
      input_map {
        key: "temperature"
        value: "temperature"
      }
      input_map {
        key: "len_penalty"
        value: "length_penalty"
      }
      input_map {
        key: "repetition_penalty"
        value: "repetition_penalty"
      }
      input_map {
        key: "min_length"
        value: "min_length"
      }
      input_map {
        key: "presence_penalty"
        value: "presence_penalty"
      }
      input_map {
        key: "frequency_penalty"
        value: "frequency_penalty"
      }
      input_map {
        key: "random_seed"
        value: "random_seed"
      }
      input_map {
        key: "return_log_probs"
        value: "return_log_probs"
      }
      input_map {
        key: "return_context_logits"
        value: "return_context_logits"
      }
      input_map {
        key: "return_generation_logits"
        value: "return_generation_logits"
      }
      input_map {
        key: "beam_width"
        value: "beam_width"
      }
      input_map {
        key: "streaming"
        value: "stream"
      }
      input_map {
        key: "prompt_embedding_table"
        value: "prompt_embedding_table"
      }
      input_map {
        key: "prompt_vocab_size"
        value: "prompt_vocab_size"
      }
      input_map {
        key: "stop_words_list"
        value: "_STOP_WORDS_IDS"
      }
      input_map {
        key: "bad_words_list"
        value: "_BAD_WORDS_IDS"
      }
      output_map {
        key: "output_ids"
        value: "_TOKENS_BATCH"
      }
      output_map {
        key: "sequence_length"
        value: "_SEQUENCE_LENGTH"
      },
      output_map {
        key: "cum_log_probs"
        value: "_CUM_LOG_PROBS"
      }
      output_map {
        key: "output_log_probs"
        value: "_OUTPUT_LOG_PROBS"
      },
      output_map {
        key: "context_logits"
        value: "_CONTEXT_LOGITS"
      },
      output_map {
        key: "generation_logits"
        value: "_GENERATION_LOGITS"
      }
    },
    {
      model_name: "postprocessing"
      model_version: -1
      input_map {
        key: "TOKENS_BATCH"
        value: "_TOKENS_BATCH"
      }
      input_map {
        key: "CUM_LOG_PROBS"
        value: "_CUM_LOG_PROBS"
      }
      input_map {
        key: "OUTPUT_LOG_PROBS"
        value: "_OUTPUT_LOG_PROBS"
      }
      input_map {
        key: "CONTEXT_LOGITS"
        value: "_CONTEXT_LOGITS"
      }
      input_map {
        key: "GENERATION_LOGITS"
        value: "_GENERATION_LOGITS"
      }
      input_map {
        key: "SEQUENCE_LENGTH"
        value: "_SEQUENCE_LENGTH"
      }
      output_map {
        key: "OUTPUT"
        value: "text_output"
      }
      output_map {
        key: "OUT_OUTPUT_LOG_PROBS"
        value: "output_log_probs"
      }
      output_map {
        key: "OUT_CUM_LOG_PROBS"
        value: "cum_log_probs"
      }
      output_map {
        key: "OUT_CONTEXT_LOGITS"
        value: "context_logits"
      }
      output_map {
        key: "OUT_GENERATION_LOGITS"
        value: "generation_logits"
      }
    }
  ]
}
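The ensemble wires text_input through preprocessing (tokenize), tensorrt_llm (generate), and postprocessing (detokenize), exposing only text_input/max_tokens and text_output to clients. A minimal hedged sketch of calling it with Triton's HTTP client follows; the server URL and prompt are assumptions, not part of this commit.

# Sketch: minimal client request against the "ensemble" model defined above.
# Assumes a running Triton server on localhost:8000 and tritonclient[http].
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# TYPE_STRING inputs travel as BYTES object arrays; shapes include the batch
# dimension because max_batch_size is 16.
text_input = httpclient.InferInput("text_input", [1, 1], "BYTES")
text_input.set_data_from_numpy(
    np.array([["What is Triton Inference Server?"]], dtype=object))

max_tokens = httpclient.InferInput("max_tokens", [1, 1], "INT32")
max_tokens.set_data_from_numpy(np.array([[64]], dtype=np.int32))

result = client.infer(
    "ensemble",
    inputs=[text_input, max_tokens],
    outputs=[httpclient.InferRequestedOutput("text_output")])

print(result.as_numpy("text_output"))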
postprocessing/1/__pycache__/model.cpython-310.pyc
ADDED
Binary file (5.33 kB).
postprocessing/1/model.py
ADDED
@@ -0,0 +1,231 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json

import numpy as np
import triton_python_backend_utils as pb_utils
from transformers import AutoTokenizer


class TritonPythonModel:
    """Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.
        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        # Parse model configs
        model_config = json.loads(args['model_config'])
        tokenizer_dir = model_config['parameters']['tokenizer_dir'][
            'string_value']

        skip_special_tokens = model_config['parameters'].get(
            'skip_special_tokens')
        if skip_special_tokens is not None:
            skip_special_tokens_str = skip_special_tokens[
                'string_value'].lower()
            if skip_special_tokens_str in [
                    'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
            ]:
                self.skip_special_tokens = skip_special_tokens_str in [
                    'true', '1', 't', 'y', 'yes'
                ]
            else:
                print(
                    f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default."
                )
                self.skip_special_tokens = True
        else:
            print(
                f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default."
            )
            self.skip_special_tokens = True

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                                       legacy=False,
                                                       padding_side='left',
                                                       trust_remote_code=True)
        if not self.tokenizer.pad_token:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Parse model output configs
        output_config = pb_utils.get_output_config_by_name(
            model_config, "OUTPUT")

        # Convert Triton types to numpy types
        self.output_dtype = pb_utils.triton_string_to_numpy(
            output_config['data_type'])

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model. Depending on the batching configuration (e.g. Dynamic
        Batching) used, `requests` may contain multiple requests. Every
        Python model, must create one pb_utils.InferenceResponse for every
        pb_utils.InferenceRequest in `requests`. If there is an error, you can
        set the error argument when creating a pb_utils.InferenceResponse.
        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest
        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """

        responses = []

        # Every Python backend must iterate over everyone of the requests
        # and create a pb_utils.InferenceResponse for each of them.
        for idx, request in enumerate(requests):
            # Get input tensors
            tokens_batch = pb_utils.get_input_tensor_by_name(
                request, 'TOKENS_BATCH').as_numpy()

            # Get sequence length
            sequence_lengths = pb_utils.get_input_tensor_by_name(
                request, 'SEQUENCE_LENGTH').as_numpy()

            # Get cum log probs
            cum_log_probs = pb_utils.get_input_tensor_by_name(
                request, 'CUM_LOG_PROBS')

            # Get sequence length
            output_log_probs = pb_utils.get_input_tensor_by_name(
                request, 'OUTPUT_LOG_PROBS')

            # Get context logits
            context_logits = pb_utils.get_input_tensor_by_name(
                request, 'CONTEXT_LOGITS')

            # Get generation logits
            generation_logits = pb_utils.get_input_tensor_by_name(
                request, 'GENERATION_LOGITS')

            # Reshape Input
            # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]])
            # tokens_batch = tokens_batch.T

            # Postprocessing output data.
            outputs = self._postprocessing(tokens_batch, sequence_lengths)

            # Create output tensors. You need pb_utils.Tensor
            # objects to create pb_utils.InferenceResponse.
            output_tensor = pb_utils.Tensor(
                'OUTPUT',
                np.array(outputs).astype(self.output_dtype))

            outputs = []
            outputs.append(output_tensor)

            if cum_log_probs:
                out_cum_log_probs = pb_utils.Tensor('OUT_CUM_LOG_PROBS',
                                                    cum_log_probs.as_numpy())
                outputs.append(out_cum_log_probs)
            else:
                out_cum_log_probs = pb_utils.Tensor(
                    'OUT_CUM_LOG_PROBS', np.array([[0.0]], dtype=np.float32))
                outputs.append(out_cum_log_probs)

            if output_log_probs:
                out_output_log_probs = pb_utils.Tensor(
                    'OUT_OUTPUT_LOG_PROBS', output_log_probs.as_numpy())
                outputs.append(out_output_log_probs)
            else:
                out_output_log_probs = pb_utils.Tensor(
                    'OUT_OUTPUT_LOG_PROBS',
                    np.array([[[0.0]]], dtype=np.float32))
                outputs.append(out_output_log_probs)

            if context_logits:
                out_context_logits = pb_utils.Tensor('OUT_CONTEXT_LOGITS',
                                                     context_logits.as_numpy())
                outputs.append(out_context_logits)
            else:
                out_context_logits = pb_utils.Tensor(
                    'OUT_CONTEXT_LOGITS', np.array([[[0.0]]],
                                                   dtype=np.float32))
                outputs.append(out_context_logits)

            if generation_logits:
                out_generation_logits = pb_utils.Tensor(
                    'OUT_GENERATION_LOGITS', generation_logits.as_numpy())
                outputs.append(out_generation_logits)
            else:
                out_generation_logits = pb_utils.Tensor(
                    'OUT_GENERATION_LOGITS',
                    np.array([[[[0.0]]]], dtype=np.float32))
                outputs.append(out_generation_logits)

            # Create InferenceResponse. You can set an error here in case
            # there was a problem with handling this inference request.
            # Below is an example of how you can set errors in inference
            # response:
            #
            # pb_utils.InferenceResponse(
            #    output_tensors=..., TritonError("An error occurred"))
            inference_response = pb_utils.InferenceResponse(
                output_tensors=outputs)
            responses.append(inference_response)

        # You should return a list of pb_utils.InferenceResponse. Length
        # of this list must match the length of `requests` list.
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')

    def _postprocessing(self, tokens_batch, sequence_lengths):
        outputs = []
        for batch_idx, beam_tokens in enumerate(tokens_batch):
            for beam_idx, tokens in enumerate(beam_tokens):
                seq_len = sequence_lengths[batch_idx][beam_idx]
                output = self.tokenizer.decode(
                    tokens[:seq_len],
                    skip_special_tokens=self.skip_special_tokens)
                outputs.append(output.encode('utf8'))
        return outputs
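The decode loop above walks a TOKENS_BATCH of shape [batch, beams, tokens] and trims each beam to its SEQUENCE_LENGTH before detokenizing. A small hedged sketch for exercising the same logic outside Triton, useful for sanity-checking the tokenizer files shipped in postprocessing/1 (the directory path is an assumption about your local checkout):

# Sketch: replay the _postprocessing loop without Triton to verify the
# tokenizer shipped alongside this model.py decodes as expected.
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("postprocessing/1", legacy=False)

# Fake a TOKENS_BATCH [batch=1, beams=1, tokens] and matching SEQUENCE_LENGTH,
# the way the tensorrt_llm step would hand them over.
ids = tokenizer.encode("Hello from the ensemble", add_special_tokens=False)
tokens_batch = np.array([[ids]], dtype=np.int32)
sequence_lengths = np.array([[len(ids)]], dtype=np.int32)

for beam_tokens, beam_lens in zip(tokens_batch, sequence_lengths):
    for tokens, seq_len in zip(beam_tokens, beam_lens):
        print(tokenizer.decode(tokens[:seq_len], skip_special_tokens=True))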
postprocessing/1/special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
{
  "bos_token": "<s>",
  "eos_token": "</s>",
  "unk_token": "<unk>"
}
postprocessing/1/tokenizer.json
ADDED
The diff for this file is too large to render.
postprocessing/1/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
size 493443
postprocessing/1/tokenizer_config.json
ADDED
@@ -0,0 +1,43 @@
{
  "add_bos_token": true,
  "add_eos_token": false,
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [],
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "legacy": true,
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": null,
  "sp_model_kwargs": {},
  "spaces_between_special_tokens": false,
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": "<unk>",
  "use_default_system_prompt": false,
  "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
}
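This tokenizer_config.json ships a Llama/Mistral-style [INST] chat_template, so multi-turn conversations can be rendered into the flat prompt string the ensemble's text_input expects. A hedged sketch of that rendering with transformers (the directory path and messages are examples, not part of the commit):

# Sketch: render the chat_template above into a flat prompt for text_input.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("postprocessing/1")

messages = [
    {"role": "user", "content": "Summarize what this model repository does."},
]
# tokenize=False returns the rendered string, e.g. "<s>[INST] ... [/INST]".
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt)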
postprocessing/config.pbtxt
ADDED
@@ -0,0 +1,113 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

name: "postprocessing"
backend: "python"
max_batch_size: 16
input [
  {
    name: "TOKENS_BATCH"
    data_type: TYPE_INT32
    dims: [ -1, -1 ]
  },
  {
    name: "SEQUENCE_LENGTH"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "CUM_LOG_PROBS"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "OUTPUT_LOG_PROBS"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
    optional: true
  },
  {
    name: "CONTEXT_LOGITS"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
    optional: true
  },
  {
    name: "GENERATION_LOGITS"
    data_type: TYPE_FP32
    dims: [ -1, -1, -1 ]
    optional: true
  }
]
output [
  {
    name: "OUTPUT"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "OUT_CUM_LOG_PROBS"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "OUT_OUTPUT_LOG_PROBS"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "OUT_CONTEXT_LOGITS"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "OUT_GENERATION_LOGITS"
    data_type: TYPE_FP32
    dims: [ -1, -1, -1 ]
  }
]

parameters {
  key: "tokenizer_dir"
  value: {
    string_value: "/all_models/inflight_batcher_llm/postprocessing/1"
  }
}

parameters {
  key: "skip_special_tokens"
  value: {
    string_value: "${skip_special_tokens}"
  }
}

instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
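Note that "${skip_special_tokens}" above is a template placeholder rather than a usable value; it is normally filled in before the repository is served (the upstream tensorrtllm_backend repo includes a fill_template.py tool for this). A minimal hedged sketch of the same substitution in plain Python, with the file path and value as examples:

# Sketch: fill ${...} placeholders left in config.pbtxt before serving.
from string import Template

values = {"skip_special_tokens": "true"}  # example value

with open("postprocessing/config.pbtxt") as f:
    text = f.read()

# safe_substitute leaves any unknown ${...} markers untouched.
filled = Template(text).safe_substitute(values)

with open("postprocessing/config.pbtxt", "w") as f:
    f.write(filled)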
preprocessing/1/__pycache__/model.cpython-310.pyc
ADDED
Binary file (9.56 kB).
preprocessing/1/model.py
ADDED
@@ -0,0 +1,373 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json
from typing import List

import numpy as np
import triton_python_backend_utils as pb_utils
from transformers import AutoTokenizer, T5Tokenizer


class TritonPythonModel:
    """Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.
        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        # Parse model configs
        model_config = json.loads(args['model_config'])
        tokenizer_dir = model_config['parameters']['tokenizer_dir'][
            'string_value']

        add_special_tokens = model_config['parameters'].get(
            'add_special_tokens')
        if add_special_tokens is not None:
            add_special_tokens_str = add_special_tokens['string_value'].lower()
            if add_special_tokens_str in [
                    'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no'
            ]:
                self.add_special_tokens = add_special_tokens_str in [
                    'true', '1', 't', 'y', 'yes'
                ]
            else:
                print(
                    f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {add_special_tokens['string_value']}). Set it as True by default."
                )
                self.add_special_tokens = True
        else:
            print(
                f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default."
            )
            self.add_special_tokens = True

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                                       legacy=False,
                                                       padding_side='left',
                                                       trust_remote_code=True)
        if isinstance(self.tokenizer, T5Tokenizer):
            self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id()

        if not self.tokenizer.pad_token:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.tokenizer_end_id = self.tokenizer.encode(
            self.tokenizer.eos_token, add_special_tokens=False)[0]
        self.tokenizer_pad_id = self.tokenizer.encode(
            self.tokenizer.pad_token, add_special_tokens=False)[0]

        # Parse model output configs and convert Triton types to numpy types
        output_names = [
            "INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN",
            "REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS",
            "OUT_END_ID", "OUT_PAD_ID"
        ]
        input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"]
        for input_name in input_names:
            setattr(
                self,
                input_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_input_config_by_name(
                        model_config, input_name)['data_type']))

        for output_name in output_names:
            setattr(
                self,
                output_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(
                        model_config, output_name)['data_type']))

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model. Depending on the batching configuration (e.g. Dynamic
        Batching) used, `requests` may contain multiple requests. Every
        Python model, must create one pb_utils.InferenceResponse for every
        pb_utils.InferenceRequest in `requests`. If there is an error, you can
        set the error argument when creating a pb_utils.InferenceResponse.
        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest
        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """

        responses = []

        # Every Python backend must iterate over everyone of the requests
        # and create a pb_utils.InferenceResponse for each of them.
        logger = pb_utils.Logger
        for idx, request in enumerate(requests):
            # Get input tensors
            query = pb_utils.get_input_tensor_by_name(request,
                                                      'QUERY').as_numpy()
            decoder_query = pb_utils.get_input_tensor_by_name(
                request, 'DECODER_QUERY')
            if decoder_query is not None:
                decoder_query = decoder_query.as_numpy()

            batch_dim = query.shape[0]
            if batch_dim != 1:

                err_str = "Inflight batching backend expects requests with batch size of 1."
                logger.log_error(err_str)
                responses.append(
                    pb_utils.InferenceResponse(
                        output_tensors=[],
                        error=pb_utils.TritonError(err_str)))
                continue

            request_output_len = pb_utils.get_input_tensor_by_name(
                request, 'REQUEST_OUTPUT_LEN').as_numpy()

            bad_words_dict = pb_utils.get_input_tensor_by_name(
                request, 'BAD_WORDS_DICT')
            if bad_words_dict is not None:
                bad_words_dict = bad_words_dict.as_numpy()

            stop_words_dict = pb_utils.get_input_tensor_by_name(
                request, 'STOP_WORDS_DICT')
            if stop_words_dict is not None:
                stop_words_dict = stop_words_dict.as_numpy()

            embedding_bias_words = pb_utils.get_input_tensor_by_name(
                request, 'EMBEDDING_BIAS_WORDS')
            if embedding_bias_words is not None:
                embedding_bias_words = embedding_bias_words.as_numpy()

            embedding_bias_weights = pb_utils.get_input_tensor_by_name(
                request, 'EMBEDDING_BIAS_WEIGHTS')
            if embedding_bias_weights is not None:
                embedding_bias_weights = embedding_bias_weights.as_numpy()

            # Take the end_id from the input tensors
            # If not specified, use tokenizer to get end_id
            end_id = pb_utils.get_input_tensor_by_name(request, 'END_ID')
            if end_id is not None:
                end_id = end_id.as_numpy()
            else:
                end_id = [[self.tokenizer_end_id]]

            # Take the pad_id from the input tensors
            # If not specified, use tokenizer to get pad_id
            pad_id = pb_utils.get_input_tensor_by_name(request, 'PAD_ID')
            if pad_id is not None:
                pad_id = pad_id.as_numpy()
            else:
                pad_id = [[self.tokenizer_pad_id]]

            # Preprocessing input data.
            input_id, request_input_len = self._create_request(query)
            print(input_id)
            print(request_input_len)
            if decoder_query is not None:
                decoder_input_id, request_decoder_input_len = self._create_request(
                    decoder_query)
            else:
                decoder_input_id = pad_id * np.ones((1, 1), np.int32)
                request_decoder_input_len = 1 * np.ones((1, 1), np.int32)

            bad_words = self._to_word_list_format(bad_words_dict)
            stop_words = self._to_word_list_format(stop_words_dict)

            embedding_bias = self._get_embedding_bias(
                embedding_bias_words, embedding_bias_weights,
                self.embedding_bias_weights_dtype)

            # Create output tensors. You need pb_utils.Tensor
            # objects to create pb_utils.InferenceResponse.
            input_id_tensor = pb_utils.Tensor(
                'INPUT_ID', input_id.astype(self.input_id_dtype))
            request_input_len_tensor = pb_utils.Tensor(
                'REQUEST_INPUT_LEN',
                request_input_len.astype(self.request_input_len_dtype))
            decoder_input_id_tensor = pb_utils.Tensor(
                'DECODER_INPUT_ID',
                decoder_input_id.astype(self.decoder_input_id_dtype))
            request_decoder_input_len_tensor = pb_utils.Tensor(
                'REQUEST_DECODER_INPUT_LEN',
                request_decoder_input_len.astype(
                    self.request_decoder_input_len_dtype))
            request_output_len_tensor = pb_utils.Tensor(
                'REQUEST_OUTPUT_LEN', request_output_len)
            bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
            stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS',
                                                    stop_words)
            embedding_bias_tensor = pb_utils.Tensor('EMBEDDING_BIAS',
                                                    embedding_bias)
            end_id_tensor = pb_utils.Tensor('OUT_END_ID',
                                            np.array(end_id, dtype=np.int32))
            pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID',
                                            np.array(pad_id, dtype=np.int32))

            inference_response = pb_utils.InferenceResponse(output_tensors=[
                input_id_tensor, decoder_input_id_tensor, bad_words_ids_tensor,
                stop_words_ids_tensor, request_input_len_tensor,
                request_decoder_input_len_tensor, request_output_len_tensor,
                embedding_bias_tensor, end_id_tensor, pad_id_tensor
            ])
            responses.append(inference_response)

        # You should return a list of pb_utils.InferenceResponse. Length
        # of this list must match the length of `requests` list.
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')

    def _create_request(self, query):
        """
        query : batch string (2D numpy array)
        """
        if isinstance(self.tokenizer, T5Tokenizer):
            start_ids = [
                np.array([self.tokenizer_bos_id] + self.tokenizer.encode(
                    s[0].decode(), add_special_tokens=self.add_special_tokens)
                         ).astype(int) for s in query
            ]
        else:
            start_ids = [
                np.array(
                    self.tokenizer.encode(
                        s[0].decode(),
                        add_special_tokens=self.add_special_tokens)).astype(
                            int) for s in query
            ]
        start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int)

        max_len = 0
        for seq in start_ids:
            max_len = max(max_len, seq.shape[0])
        start_ids = np.stack([
            np.pad(seq, (0, max_len - seq.shape[0]),
                   'constant',
                   constant_values=(0, self.tokenizer_pad_id))
            for seq in start_ids
        ])

        return start_ids, start_lengths

    def _to_word_list_format(self, word_lists: List[List[str | bytes]]):
        '''
        word_lists format:
            len(word_lists) == batch_size
            word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum".
        '''
        assert self.tokenizer != None, "need to set tokenizer"

        if word_lists is None:
            # Return an empty array of shape (1,2,0)
            return np.empty([1, 2, 0], dtype="int32")

        flat_ids = []
        offsets = []
        for word_list in word_lists:
            item_flat_ids = []
            item_offsets = []

            for word in word_list:
                if isinstance(word, bytes):
                    word = word.decode()

                ids = self.tokenizer.encode(word, add_special_tokens=False)
                if len(ids) == 0:
                    continue

                item_flat_ids += ids
                item_offsets.append(len(ids))

            flat_ids.append(np.array(item_flat_ids))
            offsets.append(np.cumsum(np.array(item_offsets)))

        pad_to = max(1, max(len(ids) for ids in flat_ids))

        for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
            flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)),
                                 constant_values=0)
            offsets[i] = np.pad(offs, (0, pad_to - len(offs)),
                                constant_values=-1)

        return np.array([flat_ids, offsets], dtype="int32").transpose(
            (1, 0, 2))

    def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights,
                            bias_dtype):

        assert self.tokenizer != None, "need to set tokenizer"

        if embedding_bias_words is None or embedding_bias_weights is None:
            return np.empty([1, 0], dtype=self.embedding_bias_weights_dtype)

        batch_embedding_bias = []
        for words, weights in zip(embedding_bias_words,
                                  embedding_bias_weights):

            vocab_size = self.tokenizer.vocab_size
            embedding_bias = [0.] * vocab_size

            assert len(words) == len(
                weights
            ), "Embedding bias words must have same dimension as embedding bias weights"

            for word, weight in zip(words, weights):
                if isinstance(word, bytes):
                    word = word.decode()
                ids = self.tokenizer.encode(word)

                if len(ids) == 0:
                    continue

                for id in ids:
                    embedding_bias[id] += weight

            batch_embedding_bias.append(np.array(embedding_bias))

        return np.array(batch_embedding_bias, dtype=bias_dtype)
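_to_word_list_format above packs stop/bad words into the [batch, 2, max_len] layout the TensorRT-LLM runtime expects: row 0 holds all word token ids concatenated, row 1 holds each word's cumulative end offset, padded with -1. A small hedged sketch of that layout for one batch item with two stop words; the token ids are invented for illustration, the real ids come from the shipped Llama tokenizer.

# Sketch: the STOP_WORDS_IDS layout (dims [2, -1] per batch item) built by
# _to_word_list_format, shown with hand-picked example token ids.
import numpy as np

# Suppose "</s>" -> [2] and "User:" -> [4911, 29901] after tokenization.
flat_ids = [2, 4911, 29901]   # all stop-word ids, concatenated
offsets = [1, 3]              # cumulative end offset of each word

pad_to = max(1, len(flat_ids))
flat_ids = np.pad(flat_ids, (0, pad_to - len(flat_ids)), constant_values=0)
offsets = np.pad(offsets, (0, pad_to - len(offsets)), constant_values=-1)

stop_words_ids = np.array([[flat_ids, offsets]], dtype="int32")
print(stop_words_ids.shape)   # (1, 2, 3)
print(stop_words_ids)         # [[[2 4911 29901], [1 3 -1]]]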
preprocessing/1/special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
{
  "bos_token": "<s>",
  "eos_token": "</s>",
  "unk_token": "<unk>"
}
preprocessing/1/tokenizer.json
ADDED
The diff for this file is too large to render.
preprocessing/1/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
size 493443
preprocessing/1/tokenizer_config.json
ADDED
@@ -0,0 +1,43 @@
{
  "add_bos_token": true,
  "add_eos_token": false,
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [],
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "legacy": true,
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": null,
  "sp_model_kwargs": {},
  "spaces_between_special_tokens": false,
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": "<unk>",
  "use_default_system_prompt": false,
  "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
}
preprocessing/config.pbtxt
ADDED
@@ -0,0 +1,156 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "preprocessing"
+backend: "python"
+max_batch_size: 16
+input [
+    {
+        name: "QUERY"
+        data_type: TYPE_STRING
+        dims: [ -1 ]
+    },
+    {
+        name: "DECODER_QUERY"
+        data_type: TYPE_STRING
+        dims: [ -1 ]
+        optional: true
+    },
+    {
+        name: "REQUEST_OUTPUT_LEN"
+        data_type: TYPE_INT32
+        dims: [ -1 ]
+    },
+    {
+        name: "BAD_WORDS_DICT"
+        data_type: TYPE_STRING
+        dims: [ -1 ]
+        optional: true
+    },
+    {
+        name: "STOP_WORDS_DICT"
+        data_type: TYPE_STRING
+        dims: [ -1 ]
+        optional: true
+    },
+    {
+        name: "EMBEDDING_BIAS_WORDS"
+        data_type: TYPE_STRING
+        dims: [ -1 ]
+        optional: true
+    },
+    {
+        name: "EMBEDDING_BIAS_WEIGHTS"
+        data_type: TYPE_FP32
+        dims: [ -1 ]
+        optional: true
+    },
+    {
+        name: "END_ID"
+        data_type: TYPE_INT32
+        dims: [ -1 ]
+        optional: true
+    },
+    {
+        name: "PAD_ID"
+        data_type: TYPE_INT32
+        dims: [ -1 ]
+        optional: true
+    }
+]
+output [
+    {
+        name: "INPUT_ID"
+        data_type: TYPE_INT32
+        dims: [ -1 ]
+    },
+    {
+        name: "REQUEST_INPUT_LEN"
+        data_type: TYPE_INT32
+        dims: [ 1 ]
+    },
+    {
+        name: "DECODER_INPUT_ID"
+        data_type: TYPE_INT32
+        dims: [ -1 ]
+    },
+    {
+        name: "REQUEST_DECODER_INPUT_LEN"
+        data_type: TYPE_INT32
+        dims: [ 1 ]
+    },
+    {
+        name: "BAD_WORDS_IDS"
+        data_type: TYPE_INT32
+        dims: [ 2, -1 ]
+    },
+    {
+        name: "STOP_WORDS_IDS"
+        data_type: TYPE_INT32
+        dims: [ 2, -1 ]
+    },
+    {
+        name: "EMBEDDING_BIAS"
+        data_type: TYPE_FP32
+        dims: [ -1 ]
+    },
+    {
+        name: "REQUEST_OUTPUT_LEN"
+        data_type: TYPE_INT32
+        dims: [ -1 ]
+    },
+    {
+        name: "OUT_END_ID"
+        data_type: TYPE_INT32
+        dims: [ -1 ]
+    },
+    {
+        name: "OUT_PAD_ID"
+        data_type: TYPE_INT32
+        dims: [ -1 ]
+    }
+]
+
+parameters {
+    key: "tokenizer_dir"
+    value: {
+        string_value: "/all_models/inflight_batcher_llm/postprocessing/1"
+    }
+}
+
+parameters {
+    key: "add_special_tokens"
+    value: {
+        string_value: "${add_special_tokens}"
+    }
+}
+
+instance_group [
+    {
+        count: 1
+        kind: KIND_CPU
+    }
+]
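
The preprocessing model wraps the tokenizer: QUERY (and optionally DECODER_QUERY) arrive as strings, and the outputs are the corresponding token ids, their lengths, encoded bad/stop word lists, an optional embedding bias, and pass-through END_ID/PAD_ID values. A minimal sketch of calling it directly over HTTP, assuming a Triton server is running with this repository and reachable on localhost:8000; only the two required inputs are set:

# Sketch only: tokenize a prompt by calling the "preprocessing" model directly.
import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import np_to_triton_dtype

client = httpclient.InferenceServerClient("localhost:8000")

query = np.array([["What is TensorRT-LLM?"]], dtype=object)  # QUERY, shape [batch, 1]
out_len = np.array([[64]], dtype=np.int32)                   # REQUEST_OUTPUT_LEN

inputs = []
for name, value in [("QUERY", query), ("REQUEST_OUTPUT_LEN", out_len)]:
    tensor = httpclient.InferInput(name, value.shape, np_to_triton_dtype(value.dtype))
    tensor.set_data_from_numpy(value)
    inputs.append(tensor)

result = client.infer("preprocessing", inputs)
print(result.as_numpy("INPUT_ID"))           # token ids produced by the tokenizer
print(result.as_numpy("REQUEST_INPUT_LEN"))  # number of input tokens
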
tensorrt_llm/1/.gitkeep
ADDED
File without changes
|
tensorrt_llm/1/config.json
ADDED
@@ -0,0 +1,148 @@
+{
+    "version": "0.11.0.dev2024062500",
+    "pretrained_config": {
+        "mlp_bias": false,
+        "attn_bias": false,
+        "rotary_base": 1000000.0,
+        "rotary_scaling": null,
+        "residual_mlp": false,
+        "disable_weight_only_quant_plugin": false,
+        "moe": {
+            "num_experts": 8,
+            "top_k": 2,
+            "normalization_mode": 1
+        },
+        "architecture": "LlamaForCausalLM",
+        "dtype": "float16",
+        "vocab_size": 32000,
+        "hidden_size": 4096,
+        "num_hidden_layers": 32,
+        "num_attention_heads": 32,
+        "hidden_act": "swiglu",
+        "logits_dtype": "float32",
+        "norm_epsilon": 1e-05,
+        "position_embedding_type": "rope_gpt_neox",
+        "max_position_embeddings": 32768,
+        "num_key_value_heads": 8,
+        "intermediate_size": 14336,
+        "mapping": {
+            "world_size": 1,
+            "gpus_per_node": 8,
+            "tp_size": 1,
+            "pp_size": 1,
+            "moe_tp_size": 1,
+            "moe_ep_size": 1
+        },
+        "quantization": {
+            "quant_algo": "W8A16",
+            "kv_cache_quant_algo": null,
+            "group_size": 128,
+            "smoothquant_val": null,
+            "has_zero_point": false,
+            "pre_quant_scale": false,
+            "exclude_modules": null
+        },
+        "use_parallel_embedding": false,
+        "embedding_sharding_dim": 0,
+        "share_embedding_table": false,
+        "head_size": 128,
+        "qk_layernorm": false
+    },
+    "build_config": {
+        "max_input_len": 28000,
+        "max_seq_len": 32000,
+        "opt_batch_size": null,
+        "max_batch_size": 16,
+        "max_beam_width": 1,
+        "max_num_tokens": 32000,
+        "opt_num_tokens": 16,
+        "max_prompt_embedding_table_size": 0,
+        "gather_context_logits": false,
+        "gather_generation_logits": false,
+        "strongly_typed": true,
+        "builder_opt": null,
+        "profiling_verbosity": "layer_names_only",
+        "enable_debug_output": false,
+        "max_draft_len": 0,
+        "speculative_decoding_mode": 1,
+        "use_refit": false,
+        "input_timing_cache": null,
+        "output_timing_cache": "model.cache",
+        "lora_config": {
+            "lora_dir": [],
+            "lora_ckpt_source": "hf",
+            "max_lora_rank": 64,
+            "lora_target_modules": [],
+            "trtllm_modules_to_hf_modules": {}
+        },
+        "auto_parallel_config": {
+            "world_size": 1,
+            "gpus_per_node": 8,
+            "cluster_key": "A100-SXM-80GB",
+            "cluster_info": null,
+            "sharding_cost_model": "alpha_beta",
+            "comm_cost_model": "alpha_beta",
+            "enable_pipeline_parallelism": false,
+            "enable_shard_unbalanced_shape": false,
+            "enable_shard_dynamic_shape": false,
+            "enable_reduce_scatter": true,
+            "builder_flags": null,
+            "debug_mode": false,
+            "infer_shape": true,
+            "validation_mode": false,
+            "same_buffer_io": {
+                "past_key_value_(\\d+)": "present_key_value_\\1"
+            },
+            "same_spec_io": {},
+            "sharded_io_allowlist": [
+                "past_key_value_\\d+",
+                "present_key_value_\\d*"
+            ],
+            "fill_weights": false,
+            "parallel_config_cache": null,
+            "profile_cache": null,
+            "dump_path": null,
+            "debug_outputs": []
+        },
+        "weight_sparsity": false,
+        "weight_streaming": false,
+        "plugin_config": {
+            "dtype": "float16",
+            "bert_attention_plugin": "auto",
+            "gpt_attention_plugin": "auto",
+            "gemm_plugin": "float16",
+            "gemm_swiglu_plugin": null,
+            "smooth_quant_gemm_plugin": null,
+            "identity_plugin": null,
+            "layernorm_quantization_plugin": null,
+            "rmsnorm_quantization_plugin": null,
+            "nccl_plugin": null,
+            "lookup_plugin": null,
+            "lora_plugin": null,
+            "weight_only_groupwise_quant_matmul_plugin": null,
+            "weight_only_quant_matmul_plugin": "float16",
+            "quantize_per_token_plugin": false,
+            "quantize_tensor_plugin": false,
+            "moe_plugin": "auto",
+            "mamba_conv1d_plugin": "auto",
+            "context_fmha": true,
+            "context_fmha_fp32_acc": false,
+            "paged_kv_cache": true,
+            "remove_input_padding": true,
+            "use_custom_all_reduce": true,
+            "reduce_fusion": false,
+            "multi_block_mode": false,
+            "enable_xqa": true,
+            "attention_qk_half_accumulation": false,
+            "tokens_per_block": 64,
+            "use_paged_context_fmha": false,
+            "use_fp8_context_fmha": false,
+            "multiple_profiles": false,
+            "paged_state": true,
+            "streamingllm": false
+        },
+        "use_strip_plan": false,
+        "max_encoder_input_len": 1024,
+        "use_fused_mlp": false
+    }
+}
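
This config.json records both the checkpoint description (a 32-layer LLaMA-style model with grouped-query attention and W8A16 weight-only quantization) and the limits the engine was built with; requests that exceed max_input_len, max_seq_len, or max_batch_size will be rejected at runtime. A small sketch for sanity-checking a request against those limits, assuming the file is read from tensorrt_llm/1/config.json:

# Sketch only: read the engine build limits from the generated config.json.
import json

with open("tensorrt_llm/1/config.json") as f:
    engine_cfg = json.load(f)

build = engine_cfg["build_config"]
print(build["max_input_len"], build["max_seq_len"], build["max_batch_size"])  # 28000 32000 16

def fits(prompt_tokens: int, new_tokens: int) -> bool:
    # True if a single request stays within the engine's sequence budget.
    return (prompt_tokens <= build["max_input_len"]
            and prompt_tokens + new_tokens <= build["max_seq_len"])
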
tensorrt_llm/1/model.py
ADDED
@@ -0,0 +1,782 @@
1 |
+
import datetime
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
import time
|
5 |
+
from threading import Lock, Thread
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
import triton_python_backend_utils as pb_utils
|
9 |
+
from torch import from_numpy
|
10 |
+
|
11 |
+
import tensorrt_llm.bindings.executor as trtllm
|
12 |
+
|
13 |
+
|
14 |
+
def get_input_tensor_by_name(request, name):
|
15 |
+
tensor = pb_utils.get_input_tensor_by_name(request, name)
|
16 |
+
if tensor is None:
|
17 |
+
return None
|
18 |
+
return tensor.as_numpy()
|
19 |
+
|
20 |
+
|
21 |
+
def get_input_scalar_by_name(request, name):
|
22 |
+
tensor = get_input_tensor_by_name(request, name)
|
23 |
+
if tensor is None:
|
24 |
+
return None
|
25 |
+
if tensor.size != 1:
|
26 |
+
raise pb_utils.TritonModelException(
|
27 |
+
f"Expected a single value for {name}")
|
28 |
+
return tensor.item()
|
29 |
+
|
30 |
+
|
31 |
+
def read_parameter_as_type(value, name, pytype=str):
|
32 |
+
if value == "":
|
33 |
+
return None
|
34 |
+
if value.startswith("${") and value.endswith("}"):
|
35 |
+
return None
|
36 |
+
if pytype is bool:
|
37 |
+
return value.lower() in ["1", "true"]
|
38 |
+
try:
|
39 |
+
result = pytype(value)
|
40 |
+
return result
|
41 |
+
except:
|
42 |
+
pb_utils.Logger.log_warning(
|
43 |
+
f"Could not read parameter '{name}' with value '{value}', will use default."
|
44 |
+
)
|
45 |
+
return None
|
46 |
+
|
47 |
+
|
48 |
+
def get_parameter(model_config, name, pytype=str):
|
49 |
+
if name not in model_config['parameters']:
|
50 |
+
return None
|
51 |
+
return read_parameter_as_type(
|
52 |
+
model_config['parameters'][name]['string_value'], name, pytype)
|
53 |
+
|
54 |
+
|
55 |
+
def convert_word_list(word_list):
|
56 |
+
if word_list is None:
|
57 |
+
return None
|
58 |
+
word_list = word_list.tolist()
|
59 |
+
if len(word_list) == 0 or len(word_list[0]) != 2:
|
60 |
+
raise pb_utils.TritonModelException(f"Invalid format for word list.")
|
61 |
+
words, indices = word_list[0]
|
62 |
+
result = []
|
63 |
+
current_index = 0
|
64 |
+
for i in indices:
|
65 |
+
if i == -1:
|
66 |
+
continue
|
67 |
+
if i > len(words):
|
68 |
+
raise pb_utils.TritonModelException(
|
69 |
+
f"Invalid format for word list.")
|
70 |
+
current_word = []
|
71 |
+
while current_index < i:
|
72 |
+
current_word.append(words[current_index])
|
73 |
+
current_index += 1
|
74 |
+
result.append(current_word)
|
75 |
+
return result
|
76 |
+
|
77 |
+
|
78 |
+
def parse_medusa_choices(medusa_choices):
|
79 |
+
if medusa_choices is None:
|
80 |
+
return None
|
81 |
+
try:
|
82 |
+
result = json.loads(
|
83 |
+
"[" + medusa_choices.replace("{", "[").replace("}", "]") + "]")
|
84 |
+
assert isinstance(result, list) and len(result) > 0
|
85 |
+
assert all([isinstance(x, list) for x in result])
|
86 |
+
assert all([isinstance(y, int) for x in result for y in x])
|
87 |
+
except Exception:
|
88 |
+
raise pb_utils.TritonModelException(
|
89 |
+
"Invalid format for medusa_choices")
|
90 |
+
return result
|
91 |
+
|
92 |
+
|
93 |
+
def get_sampling_config_from_request(request):
|
94 |
+
kwargs = {}
|
95 |
+
kwargs['beam_width'] = get_input_scalar_by_name(request, 'beam_width') or 1
|
96 |
+
kwargs['top_k'] = get_input_scalar_by_name(request, 'runtime_top_k')
|
97 |
+
kwargs['top_p'] = get_input_scalar_by_name(request, 'runtime_top_p')
|
98 |
+
kwargs['top_p'] = None if kwargs['top_p'] is None or kwargs[
|
99 |
+
'top_p'] <= 0 else kwargs['top_p']
|
100 |
+
kwargs['random_seed'] = get_input_scalar_by_name(request, 'random_seed')
|
101 |
+
kwargs['temperature'] = get_input_scalar_by_name(request, 'temperature')
|
102 |
+
kwargs['min_length'] = get_input_scalar_by_name(request, 'min_length')
|
103 |
+
kwargs['repetition_penalty'] = get_input_scalar_by_name(
|
104 |
+
request, 'repetition_penalty')
|
105 |
+
kwargs['presence_penalty'] = get_input_scalar_by_name(
|
106 |
+
request, 'presence_penalty')
|
107 |
+
kwargs['frequency_penalty'] = get_input_scalar_by_name(
|
108 |
+
request, 'frequency_penalty')
|
109 |
+
kwargs['length_penalty'] = get_input_scalar_by_name(request, 'len_penalty')
|
110 |
+
kwargs['top_p_min'] = get_input_scalar_by_name(request,
|
111 |
+
'runtime_top_p_min')
|
112 |
+
kwargs['top_p_reset_ids'] = get_input_scalar_by_name(
|
113 |
+
request, 'runtime_top_p_reset_ids')
|
114 |
+
kwargs['top_p_decay'] = get_input_scalar_by_name(request,
|
115 |
+
'runtime_top_p_decay')
|
116 |
+
kwargs['beam_search_diversity_rate'] = get_input_scalar_by_name(
|
117 |
+
request, 'beam_search_diversity_rate')
|
118 |
+
kwargs['early_stopping'] = get_input_scalar_by_name(
|
119 |
+
request, 'early_stopping')
|
120 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
121 |
+
return trtllm.SamplingConfig(**kwargs)
|
122 |
+
|
123 |
+
|
124 |
+
def get_output_config_from_request(request, exclude_input_from_output):
|
125 |
+
kwargs = {}
|
126 |
+
kwargs["return_log_probs"] = get_input_scalar_by_name(
|
127 |
+
request, 'return_log_probs')
|
128 |
+
kwargs["return_context_logits"] = get_input_scalar_by_name(
|
129 |
+
request, 'return_context_logits')
|
130 |
+
kwargs["return_generation_logits"] = get_input_scalar_by_name(
|
131 |
+
request, 'return_generation_logits')
|
132 |
+
kwargs["exclude_input_from_output"] = exclude_input_from_output
|
133 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
134 |
+
return trtllm.OutputConfig(**kwargs)
|
135 |
+
|
136 |
+
|
137 |
+
def get_external_draft_tokens_config_from_request(request):
|
138 |
+
kwargs = {}
|
139 |
+
draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids')
|
140 |
+
if draft_input_ids is not None:
|
141 |
+
kwargs['tokens'] = draft_input_ids.tolist()
|
142 |
+
draft_logits = get_input_tensor_by_name(request, 'draft_logits')
|
143 |
+
if draft_logits is not None:
|
144 |
+
kwargs['logits'] = from_numpy(draft_logits)
|
145 |
+
kwargs['acceptance_threshold'] = get_input_scalar_by_name(
|
146 |
+
request, 'draft_acceptance_threshold')
|
147 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
148 |
+
if len(kwargs) > 0:
|
149 |
+
return trtllm.ExternalDraftTokensConfig(**kwargs)
|
150 |
+
return None
|
151 |
+
|
152 |
+
|
153 |
+
def get_prompt_tuning_config_from_request(request):
|
154 |
+
# prompt_vocab_size is unused by executor.
|
155 |
+
kwargs = {}
|
156 |
+
prompt_embedding_table = get_input_tensor_by_name(
|
157 |
+
request, 'prompt_embedding_table')
|
158 |
+
if prompt_embedding_table is not None:
|
159 |
+
kwargs["embedding_table"] = from_numpy(prompt_embedding_table)
|
160 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
161 |
+
if len(kwargs) > 0:
|
162 |
+
return trtllm.PromptTuningConfig(**kwargs)
|
163 |
+
return None
|
164 |
+
|
165 |
+
|
166 |
+
def get_lora_config_from_request(request):
|
167 |
+
kwargs = {}
|
168 |
+
kwargs["task_id"] = get_input_scalar_by_name(request, 'lora_task_id')
|
169 |
+
lora_weights = get_input_tensor_by_name(request, 'lora_weights')
|
170 |
+
if lora_weights is not None:
|
171 |
+
kwargs["weights"] = from_numpy(lora_weights)
|
172 |
+
lora_config = get_input_tensor_by_name(request, 'lora_config')
|
173 |
+
if lora_config is not None:
|
174 |
+
kwargs["config"] = from_numpy(lora_config)
|
175 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
176 |
+
if len(kwargs) > 0:
|
177 |
+
return trtllm.LoraConfig(**kwargs)
|
178 |
+
return None
|
179 |
+
|
180 |
+
|
181 |
+
def convert_request(request, exclude_input_from_output, decoupled):
|
182 |
+
inputs = {}
|
183 |
+
input_token_ids = get_input_tensor_by_name(request, 'input_ids')
|
184 |
+
if input_token_ids is None:
|
185 |
+
raise pb_utils.TritonModelException(
|
186 |
+
"A value is required for input_ids")
|
187 |
+
input_token_ids = input_token_ids.tolist()
|
188 |
+
if len(input_token_ids) == 0:
|
189 |
+
raise pb_utils.TritonModelException(f"Invalid format for input_ids")
|
190 |
+
inputs['input_token_ids'] = input_token_ids[0]
|
191 |
+
# input_lengths is not used by the executor.
|
192 |
+
inputs['max_new_tokens'] = get_input_scalar_by_name(
|
193 |
+
request, 'request_output_len')
|
194 |
+
if inputs['max_new_tokens'] is None:
|
195 |
+
raise pb_utils.TritonModelException(
|
196 |
+
"A value is required for request_output_len")
|
197 |
+
inputs['streaming'] = get_input_scalar_by_name(request, 'streaming')
|
198 |
+
if inputs['streaming'] and not decoupled:
|
199 |
+
raise pb_utils.TritonModelException(
|
200 |
+
"Streaming is only supported in decoupled mode.")
|
201 |
+
inputs['end_id'] = get_input_scalar_by_name(request, 'end_id')
|
202 |
+
inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id')
|
203 |
+
inputs['stop_words'] = convert_word_list(
|
204 |
+
get_input_tensor_by_name(request, 'stop_words_list'))
|
205 |
+
inputs['bad_words'] = convert_word_list(
|
206 |
+
get_input_tensor_by_name(request, 'bad_words_list'))
|
207 |
+
embedding_bias = get_input_tensor_by_name(request, 'embedding_bias')
|
208 |
+
if embedding_bias is not None and embedding_bias.size != 0:
|
209 |
+
inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze()
|
210 |
+
|
211 |
+
sampling_config = get_sampling_config_from_request(request)
|
212 |
+
output_config = get_output_config_from_request(request,
|
213 |
+
exclude_input_from_output)
|
214 |
+
external_draft_tokens_config = get_external_draft_tokens_config_from_request(
|
215 |
+
request)
|
216 |
+
prompt_tuning_config = get_prompt_tuning_config_from_request(request)
|
217 |
+
lora_config = get_lora_config_from_request(request)
|
218 |
+
|
219 |
+
return trtllm.Request(
|
220 |
+
**inputs,
|
221 |
+
sampling_config=sampling_config,
|
222 |
+
output_config=output_config,
|
223 |
+
external_draft_tokens_config=external_draft_tokens_config,
|
224 |
+
prompt_tuning_config=prompt_tuning_config,
|
225 |
+
lora_config=lora_config,
|
226 |
+
)
|
227 |
+
|
228 |
+
|
229 |
+
def convert_response(response):
|
230 |
+
if response.has_error():
|
231 |
+
return pb_utils.InferenceResponse(output_tensors=[],
|
232 |
+
error=pb_utils.TritonError(
|
233 |
+
response.error_msg)), True
|
234 |
+
result = response.result
|
235 |
+
beam_lengths = np.expand_dims(
|
236 |
+
np.array([len(beam) for beam in result.output_token_ids], np.int32), 0)
|
237 |
+
max_beam_length = max([len(beam) for beam in result.output_token_ids])
|
238 |
+
output_ids = np.full((1, len(result.output_token_ids), max_beam_length),
|
239 |
+
-1, np.int32)
|
240 |
+
for idx, beam in enumerate(result.output_token_ids):
|
241 |
+
output_ids[0, idx, :len(beam)] = beam
|
242 |
+
output_tensors = [
|
243 |
+
pb_utils.Tensor("output_ids", output_ids),
|
244 |
+
pb_utils.Tensor("sequence_length", beam_lengths),
|
245 |
+
]
|
246 |
+
output_tensors.append(
|
247 |
+
pb_utils.Tensor(
|
248 |
+
"cum_log_probs",
|
249 |
+
np.expand_dims(np.array(result.cum_log_probs, np.float32), 0)
|
250 |
+
if result.cum_log_probs is not None else np.zeros(
|
251 |
+
(1, 1), np.float32)))
|
252 |
+
output_tensors.append(
|
253 |
+
pb_utils.Tensor(
|
254 |
+
"output_log_probs",
|
255 |
+
np.expand_dims(np.array(result.log_probs, np.float32), 0) if
|
256 |
+
result.log_probs is not None else np.zeros((1, 1, 1), np.float32)))
|
257 |
+
output_tensors.append(
|
258 |
+
pb_utils.Tensor(
|
259 |
+
"context_logits",
|
260 |
+
np.expand_dims(np.array(result.context_logits, np.float32), 0)
|
261 |
+
if result.context_logits is not None else np.zeros(
|
262 |
+
(1, 1, 1), np.float32)))
|
263 |
+
output_tensors.append(
|
264 |
+
pb_utils.Tensor(
|
265 |
+
"generation_logits",
|
266 |
+
np.expand_dims(np.array(result.generation_logits, np.float32), 0)
|
267 |
+
if result.generation_logits is not None else np.zeros(
|
268 |
+
(1, 1, 1, 1), np.float32)))
|
269 |
+
return pb_utils.InferenceResponse(output_tensors), result.is_final
|
270 |
+
|
271 |
+
|
272 |
+
def convert_scheduler_policy(batch_scheduler_policy: str):
|
273 |
+
if batch_scheduler_policy.lower() == "max_utilization":
|
274 |
+
return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION
|
275 |
+
elif batch_scheduler_policy.lower() == "guaranteed_no_evict":
|
276 |
+
return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
|
277 |
+
raise pb_utils.TritonModelException(
|
278 |
+
f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported."
|
279 |
+
)
|
280 |
+
|
281 |
+
|
282 |
+
def convert_batching_type(gpt_model_type: str):
|
283 |
+
if gpt_model_type is None:
|
284 |
+
return None
|
285 |
+
if gpt_model_type.lower(
|
286 |
+
) == "inflight_fused_batching" or gpt_model_type.lower(
|
287 |
+
) == "inflight_batching":
|
288 |
+
return trtllm.BatchingType.INFLIGHT
|
289 |
+
elif gpt_model_type.lower() == "v1":
|
290 |
+
return trtllm.BatchingType.STATIC
|
291 |
+
raise pb_utils.TritonModelException(
|
292 |
+
f"gpt_model_type value of '{gpt_model_type}' is not supported.")
|
293 |
+
|
294 |
+
|
295 |
+
def convert_decoding_mode(decoding_mode: str):
|
296 |
+
if decoding_mode is None:
|
297 |
+
return None
|
298 |
+
elif decoding_mode == "auto":
|
299 |
+
return trtllm.DecodingMode.Auto()
|
300 |
+
elif decoding_mode == "top_k":
|
301 |
+
return trtllm.DecodingMode.TopK()
|
302 |
+
elif decoding_mode == "top_p":
|
303 |
+
return trtllm.DecodingMode.TopP()
|
304 |
+
elif decoding_mode == "top_k_top_p":
|
305 |
+
return trtllm.DecodingMode.TopKTopP()
|
306 |
+
elif decoding_mode == "beam_search":
|
307 |
+
return trtllm.DecodingMode.BeamSearch()
|
308 |
+
elif decoding_mode == "medusa":
|
309 |
+
return trtllm.DecodingMode.Medusa()
|
310 |
+
raise pb_utils.TritonModelException(
|
311 |
+
f"decoding_mode value of '{decoding_mode}' is not supported.")
|
312 |
+
|
313 |
+
|
314 |
+
def convert_timestamp_to_seconds(timestamp: str):
|
315 |
+
return int(
|
316 |
+
datetime.datetime.strptime(timestamp, "%m-%d-%Y %H:%M:%S").timestamp())
|
317 |
+
|
318 |
+
|
319 |
+
class TritonPythonModel:
|
320 |
+
"""Your Python model must use the same class name. Every Python model
|
321 |
+
that is created must have "TritonPythonModel" as the class name.
|
322 |
+
"""
|
323 |
+
|
324 |
+
def get_scheduler_config(self, model_config):
|
325 |
+
batch_scheduler_policy = get_parameter(model_config,
|
326 |
+
"batch_scheduler_policy")
|
327 |
+
if batch_scheduler_policy is None:
|
328 |
+
return trtllm.SchedulerConfig()
|
329 |
+
return trtllm.SchedulerConfig(
|
330 |
+
convert_scheduler_policy(batch_scheduler_policy))
|
331 |
+
|
332 |
+
def get_kv_cache_config(self, model_config):
|
333 |
+
kwargs = {
|
334 |
+
"enable_block_reuse":
|
335 |
+
get_parameter(model_config, "enable_kv_cache_reuse", bool),
|
336 |
+
"max_tokens":
|
337 |
+
get_parameter(model_config, "max_tokens_in_paged_kv_cache", int),
|
338 |
+
"sink_token_length":
|
339 |
+
get_parameter(model_config, "sink_token_length", int),
|
340 |
+
"max_attention_window":
|
341 |
+
get_parameter(model_config, "max_attention_window_size", int),
|
342 |
+
"free_gpu_memory_fraction":
|
343 |
+
get_parameter(model_config, "kv_cache_free_gpu_mem_fraction",
|
344 |
+
float),
|
345 |
+
"host_cache_size":
|
346 |
+
get_parameter(model_config, "kv_cache_host_memory_bytes", int),
|
347 |
+
"onboard_blocks":
|
348 |
+
get_parameter(model_config, "kv_cache_onboard_blocks", bool),
|
349 |
+
}
|
350 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
351 |
+
return trtllm.KvCacheConfig(**kwargs)
|
352 |
+
|
353 |
+
def get_parallel_config(self, model_config):
|
354 |
+
kwargs = {}
|
355 |
+
gpu_device_ids = get_parameter(model_config, "gpu_device_ids")
|
356 |
+
if gpu_device_ids:
|
357 |
+
kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")]
|
358 |
+
self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR",
|
359 |
+
"0") == "1"
|
360 |
+
if self.use_orchestrator_mode:
|
361 |
+
kwargs[
|
362 |
+
"communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR
|
363 |
+
worker_path = get_parameter(model_config, "worker_path")
|
364 |
+
if worker_path is not None:
|
365 |
+
raise pb_utils.TritonModelException(
|
366 |
+
"worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable."
|
367 |
+
)
|
368 |
+
executor_worker_path = get_parameter(model_config,
|
369 |
+
"executor_worker_path")
|
370 |
+
kwargs["orchestrator_config"] = trtllm.OrchestratorConfig(
|
371 |
+
True, executor_worker_path)
|
372 |
+
if len(kwargs) > 0:
|
373 |
+
return trtllm.ParallelConfig(**kwargs)
|
374 |
+
return None
|
375 |
+
|
376 |
+
def get_peft_cache_config(self, model_config):
|
377 |
+
kwargs = {
|
378 |
+
"optimal_adapter_size":
|
379 |
+
get_parameter(model_config, "lora_cache_optimal_adapter_size",
|
380 |
+
int),
|
381 |
+
"max_adapter_size":
|
382 |
+
get_parameter(model_config, "lora_cache_max_adapter_size", int),
|
383 |
+
"device_cache_percent":
|
384 |
+
get_parameter(model_config, "lora_cache_gpu_memory_fraction",
|
385 |
+
float),
|
386 |
+
"host_cache_size":
|
387 |
+
get_parameter(model_config, "lora_cache_host_memory_bytes", int),
|
388 |
+
}
|
389 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
390 |
+
return trtllm.PeftCacheConfig(**kwargs)
|
391 |
+
|
392 |
+
def get_decoding_config(self, model_config):
|
393 |
+
kwargs = {
|
394 |
+
"medusa_choices":
|
395 |
+
parse_medusa_choices(get_parameter(model_config,
|
396 |
+
"medusa_choices")),
|
397 |
+
"decoding_mode":
|
398 |
+
convert_decoding_mode(get_parameter(model_config,
|
399 |
+
"decoding_mode")),
|
400 |
+
}
|
401 |
+
print(kwargs)
|
402 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
403 |
+
return trtllm.DecodingConfig(**kwargs)
|
404 |
+
|
405 |
+
def get_executor_config(self, model_config):
|
406 |
+
kwargs = {
|
407 |
+
"max_beam_width":
|
408 |
+
get_parameter(model_config, "max_beam_width", int),
|
409 |
+
"scheduler_config":
|
410 |
+
self.get_scheduler_config(model_config),
|
411 |
+
"kv_cache_config":
|
412 |
+
self.get_kv_cache_config(model_config),
|
413 |
+
"enable_chunked_context":
|
414 |
+
get_parameter(model_config, "enable_chunked_context", bool),
|
415 |
+
"normalize_log_probs":
|
416 |
+
get_parameter(model_config, "normalize_log_probs", bool),
|
417 |
+
"batching_type":
|
418 |
+
convert_batching_type(get_parameter(model_config,
|
419 |
+
"gpt_model_type")),
|
420 |
+
"parallel_config":
|
421 |
+
self.get_parallel_config(model_config),
|
422 |
+
"peft_cache_config":
|
423 |
+
self.get_peft_cache_config(model_config),
|
424 |
+
"decoding_config":
|
425 |
+
self.get_decoding_config(model_config),
|
426 |
+
}
|
427 |
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
428 |
+
return trtllm.ExecutorConfig(**kwargs)
|
429 |
+
|
430 |
+
def create_metrics(self, model: str, version: str, is_v1_model: bool):
|
431 |
+
self.request_metric_family = pb_utils.MetricFamily(
|
432 |
+
name="nv_trt_llm_request_metrics",
|
433 |
+
description="TRT LLM request metrics",
|
434 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
435 |
+
)
|
436 |
+
self.runtime_memory_metric_family = pb_utils.MetricFamily(
|
437 |
+
name="nv_trt_llm_runtime_memory_metrics",
|
438 |
+
description="TRT LLM runtime memory metrics",
|
439 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
440 |
+
)
|
441 |
+
self.kv_cache_metric_family = pb_utils.MetricFamily(
|
442 |
+
name="nv_trt_llm_kv_cache_block_metrics",
|
443 |
+
description="TRT LLM KV cache block metrics",
|
444 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
445 |
+
)
|
446 |
+
model_type = "v1" if is_v1_model else "inflight_batcher"
|
447 |
+
self.model_type_metric_family = pb_utils.MetricFamily(
|
448 |
+
name=f"nv_trt_llm_{model_type}_metrics",
|
449 |
+
description=f"TRT LLM {model_type}-specific metrics",
|
450 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
451 |
+
)
|
452 |
+
self.general_metric_family = pb_utils.MetricFamily(
|
453 |
+
name="nv_trt_llm_general_metrics",
|
454 |
+
description="General TRT LLM metrics",
|
455 |
+
kind=pb_utils.MetricFamily.GAUGE,
|
456 |
+
)
|
457 |
+
common_labels = {"model": model, "version": version}
|
458 |
+
self.all_metrics = {
|
459 |
+
# Request metrics
|
460 |
+
"num_active_requests":
|
461 |
+
self.request_metric_family.Metric(labels={
|
462 |
+
"request_type": "active",
|
463 |
+
**common_labels
|
464 |
+
}),
|
465 |
+
"max_num_active_requests":
|
466 |
+
self.request_metric_family.Metric(labels={
|
467 |
+
"request_type": "max",
|
468 |
+
**common_labels
|
469 |
+
}),
|
470 |
+
"num_scheduled_requests":
|
471 |
+
self.request_metric_family.Metric(labels={
|
472 |
+
"request_type": "scheduled",
|
473 |
+
**common_labels
|
474 |
+
}),
|
475 |
+
"num_context_requests":
|
476 |
+
self.request_metric_family.Metric(labels={
|
477 |
+
"request_type": "context",
|
478 |
+
**common_labels
|
479 |
+
}),
|
480 |
+
# Runtime metrics
|
481 |
+
"cpu_mem_usage":
|
482 |
+
self.runtime_memory_metric_family.Metric(labels={
|
483 |
+
"memory_type": "cpu",
|
484 |
+
**common_labels
|
485 |
+
}),
|
486 |
+
"gpu_mem_usage":
|
487 |
+
self.runtime_memory_metric_family.Metric(labels={
|
488 |
+
"memory_type": "gpu",
|
489 |
+
**common_labels
|
490 |
+
}),
|
491 |
+
"pinned_mem_usage":
|
492 |
+
self.runtime_memory_metric_family.Metric(labels={
|
493 |
+
"memory_type": "pinned",
|
494 |
+
**common_labels
|
495 |
+
}),
|
496 |
+
# KV cache metrics
|
497 |
+
"max_num_blocks":
|
498 |
+
self.kv_cache_metric_family.Metric(labels={
|
499 |
+
"kv_cache_block_type": "max",
|
500 |
+
**common_labels
|
501 |
+
}),
|
502 |
+
"free_num_blocks":
|
503 |
+
self.kv_cache_metric_family.Metric(labels={
|
504 |
+
"kv_cache_block_type": "free",
|
505 |
+
**common_labels
|
506 |
+
}),
|
507 |
+
"used_num_blocks":
|
508 |
+
self.kv_cache_metric_family.Metric(labels={
|
509 |
+
"kv_cache_block_type": "used",
|
510 |
+
**common_labels
|
511 |
+
}),
|
512 |
+
"tokens_per_block":
|
513 |
+
self.kv_cache_metric_family.Metric(labels={
|
514 |
+
"kv_cache_block_type": "tokens_per",
|
515 |
+
**common_labels
|
516 |
+
}),
|
517 |
+
# General metrics
|
518 |
+
"timestamp":
|
519 |
+
self.general_metric_family.Metric(labels={
|
520 |
+
"general_type": "timestamp",
|
521 |
+
**common_labels
|
522 |
+
}),
|
523 |
+
"iter":
|
524 |
+
self.general_metric_family.Metric(labels={
|
525 |
+
"general_type": "iteration_counter",
|
526 |
+
**common_labels
|
527 |
+
}),
|
528 |
+
}
|
529 |
+
if is_v1_model:
|
530 |
+
self.all_metrics.update({
|
531 |
+
"num_ctx_tokens":
|
532 |
+
self.model_type_metric_family.Metric(labels={
|
533 |
+
"v1_specific_metric": "total_context_tokens",
|
534 |
+
**common_labels
|
535 |
+
}),
|
536 |
+
"num_gen_tokens":
|
537 |
+
self.model_type_metric_family.Metric(
|
538 |
+
labels={
|
539 |
+
"v1_specific_metric": "total_generation_tokens",
|
540 |
+
**common_labels
|
541 |
+
}),
|
542 |
+
"empty_gen_slots":
|
543 |
+
self.model_type_metric_family.Metric(
|
544 |
+
labels={
|
545 |
+
"v1_specific_metric": "empty_generation_slots",
|
546 |
+
**common_labels
|
547 |
+
}),
|
548 |
+
})
|
549 |
+
else:
|
550 |
+
self.all_metrics.update({
|
551 |
+
"num_ctx_tokens":
|
552 |
+
self.model_type_metric_family.Metric(
|
553 |
+
labels={
|
554 |
+
"inflight_batcher_specific_metric":
|
555 |
+
"total_context_tokens",
|
556 |
+
**common_labels
|
557 |
+
}),
|
558 |
+
"num_gen_requests":
|
559 |
+
self.model_type_metric_family.Metric(
|
560 |
+
labels={
|
561 |
+
"inflight_batcher_specific_metric":
|
562 |
+
"generation_requests",
|
563 |
+
**common_labels
|
564 |
+
}),
|
565 |
+
"micro_batch_id":
|
566 |
+
self.model_type_metric_family.Metric(
|
567 |
+
labels={
|
568 |
+
"inflight_batcher_specific_metric": "micro_batch_id",
|
569 |
+
**common_labels
|
570 |
+
}),
|
571 |
+
"num_paused_requests":
|
572 |
+
self.model_type_metric_family.Metric(
|
573 |
+
labels={
|
574 |
+
"inflight_batcher_specific_metric": "paused_requests",
|
575 |
+
**common_labels
|
576 |
+
}),
|
577 |
+
})
|
578 |
+
|
579 |
+
def initialize(self, args):
|
580 |
+
"""`initialize` is called only once when the model is being loaded.
|
581 |
+
Implementing `initialize` function is optional. This function allows
|
582 |
+
the model to initialize any state associated with this model.
|
583 |
+
|
584 |
+
Parameters
|
585 |
+
----------
|
586 |
+
args : dict
|
587 |
+
Both keys and values are strings. The dictionary keys and values are:
|
588 |
+
* model_config: A JSON string containing the model configuration
|
589 |
+
* model_instance_kind: A string containing model instance kind
|
590 |
+
* model_instance_device_id: A string containing model instance device ID
|
591 |
+
* model_repository: Model repository path
|
592 |
+
* model_version: Model version
|
593 |
+
* model_name: Model name
|
594 |
+
"""
|
595 |
+
model_config = json.loads(args['model_config'])
|
596 |
+
gpt_model_path = get_parameter(model_config, "gpt_model_path")
|
597 |
+
if get_parameter(model_config, "enable_trt_overlap", bool):
|
598 |
+
raise pb_utils.TritonModelException(
|
599 |
+
f"enable_trt_overlap=true is not supported.")
|
600 |
+
self.exclude_input_from_output = get_parameter(
|
601 |
+
model_config, "exclude_input_in_output", bool)
|
602 |
+
executor_config = self.get_executor_config(model_config)
|
603 |
+
self.executor = trtllm.Executor(gpt_model_path,
|
604 |
+
trtllm.ModelType.DECODER_ONLY,
|
605 |
+
executor_config)
|
606 |
+
self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
|
607 |
+
model_config)
|
608 |
+
self.cancellation_check_period_ms = get_parameter(
|
609 |
+
model_config, "cancellation_check_period_ms", int) or 100
|
610 |
+
self.stats_check_period_ms = get_parameter(
|
611 |
+
model_config, "stats_check_period_ms", int) or 100
|
612 |
+
|
613 |
+
if not self.decoupled:
|
614 |
+
raise pb_utils.TritonModelException(
|
615 |
+
"Please enable decoupled transaction policy in the model configuration to serve this model"
|
616 |
+
)
|
617 |
+
|
618 |
+
self.create_metrics(args["model_name"],
|
619 |
+
args["model_version"],
|
620 |
+
is_v1_model=executor_config.batching_type ==
|
621 |
+
trtllm.BatchingType.STATIC)
|
622 |
+
self.triton_id_to_req_id = {}
|
623 |
+
self.req_id_to_response_sender = {}
|
624 |
+
self.lock = Lock()
|
625 |
+
self.running = False
|
626 |
+
self.awaiter_thread = Thread(target=self.awaiter_loop)
|
627 |
+
self.cancellation_thread = Thread(target=self.cancellation_loop)
|
628 |
+
self.metrics_thread = Thread(target=self.metrics_loop)
|
629 |
+
if self.executor.can_enqueue_requests():
|
630 |
+
self.running = True
|
631 |
+
self.awaiter_thread.start()
|
632 |
+
self.cancellation_thread.start()
|
633 |
+
self.metrics_thread.start()
|
634 |
+
else:
|
635 |
+
# In leader mode, worker ranks will wait here until leader is done.
|
636 |
+
self.executor.shutdown()
|
637 |
+
|
638 |
+
def handle_stop_request(self, triton_id, response_sender):
|
639 |
+
if triton_id is None or triton_id == "":
|
640 |
+
response_sender.send(
|
641 |
+
pb_utils.InferenceResponse(error=pb_utils.TritonError(
|
642 |
+
"A request id must be provided for request cancellation")),
|
643 |
+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
|
644 |
+
return
|
645 |
+
|
646 |
+
if triton_id in self.triton_id_to_req_id:
|
647 |
+
req_id = self.triton_id_to_req_id[triton_id]
|
648 |
+
self.executor.cancel_request(req_id)
|
649 |
+
|
650 |
+
response_sender.send(
|
651 |
+
pb_utils.InferenceResponse(),
|
652 |
+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
|
653 |
+
|
654 |
+
def execute(self, requests):
|
655 |
+
"""`execute` must be implemented in every Python model. `execute`
|
656 |
+
function receives a list of pb_utils.InferenceRequest as the only
|
657 |
+
argument. This function is called when an inference is requested
|
658 |
+
for this model.
|
659 |
+
|
660 |
+
Parameters
|
661 |
+
----------
|
662 |
+
requests : list
|
663 |
+
A list of pb_utils.InferenceRequest
|
664 |
+
|
665 |
+
Returns
|
666 |
+
-------
|
667 |
+
list
|
668 |
+
A list of pb_utils.InferenceResponse. The length of this list must
|
669 |
+
be the same as `requests`
|
670 |
+
"""
|
671 |
+
if not self.executor.can_enqueue_requests():
|
672 |
+
return
|
673 |
+
|
674 |
+
# Convert to executor requests.
|
675 |
+
triton_requests = []
|
676 |
+
executor_requests = []
|
677 |
+
for request in requests:
|
678 |
+
response_sender = request.get_response_sender()
|
679 |
+
if get_input_scalar_by_name(request, 'stop'):
|
680 |
+
self.handle_stop_request(request.request_id(), response_sender)
|
681 |
+
else:
|
682 |
+
try:
|
683 |
+
converted = convert_request(request,
|
684 |
+
self.exclude_input_from_output,
|
685 |
+
self.decoupled)
|
686 |
+
except Exception as e:
|
687 |
+
response_sender.send(
|
688 |
+
pb_utils.InferenceResponse(error=pb_utils.TritonError(
|
689 |
+
f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'"
|
690 |
+
)),
|
691 |
+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
|
692 |
+
else:
|
693 |
+
triton_requests.append(request)
|
694 |
+
executor_requests.append(converted)
|
695 |
+
|
696 |
+
with self.lock:
|
697 |
+
request_ids = self.executor.enqueue_requests(executor_requests)
|
698 |
+
for req_id, request in zip(request_ids, triton_requests):
|
699 |
+
triton_id = request.request_id()
|
700 |
+
self.req_id_to_response_sender[
|
701 |
+
req_id] = triton_id, request.get_response_sender()
|
702 |
+
self.triton_id_to_req_id[triton_id] = req_id
|
703 |
+
return None
|
704 |
+
|
705 |
+
def awaiter_loop(self):
|
706 |
+
"""Gets responses from executor and returns the results."""
|
707 |
+
while self.running:
|
708 |
+
for response in self.executor.await_responses(
|
709 |
+
timeout=datetime.timedelta(milliseconds=1)):
|
710 |
+
req_id = response.request_id
|
711 |
+
with self.lock:
|
712 |
+
if req_id not in self.req_id_to_response_sender:
|
713 |
+
continue
|
714 |
+
triton_id, response_sender = self.req_id_to_response_sender[
|
715 |
+
req_id]
|
716 |
+
|
717 |
+
triton_response, is_final = convert_response(response)
|
718 |
+
response_sender.send(
|
719 |
+
triton_response,
|
720 |
+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
|
721 |
+
if is_final else 0)
|
722 |
+
|
723 |
+
if is_final:
|
724 |
+
with self.lock:
|
725 |
+
del self.triton_id_to_req_id[triton_id]
|
726 |
+
del self.req_id_to_response_sender[req_id]
|
727 |
+
# Remove local reference so response_sender can be cleaned properly.
|
728 |
+
del response_sender
|
729 |
+
|
730 |
+
def cancellation_loop(self):
|
731 |
+
"""Checks if any pending requests have been cancelled."""
|
732 |
+
while self.running:
|
733 |
+
time.sleep(self.cancellation_check_period_ms / 1000.0)
|
734 |
+
with self.lock:
|
735 |
+
for req_id, (triton_id, response_sender
|
736 |
+
) in self.req_id_to_response_sender.items():
|
737 |
+
if response_sender.is_cancelled():
|
738 |
+
self.executor.cancel_request(req_id)
|
739 |
+
# Remove local reference so response_sender can be cleaned properly.
|
740 |
+
del response_sender
|
741 |
+
|
742 |
+
def metrics_loop(self):
|
743 |
+
"""Updates triton metrics using stats from the executor."""
|
744 |
+
while self.running:
|
745 |
+
time.sleep(self.stats_check_period_ms / 1000.0)
|
746 |
+
for stat in self.executor.get_latest_iteration_stats():
|
747 |
+
try:
|
748 |
+
for key, metric in self.all_metrics.items():
|
749 |
+
value = None
|
750 |
+
if hasattr(stat, key):
|
751 |
+
value = getattr(stat, key)
|
752 |
+
elif stat.kv_cache_stats is not None and hasattr(
|
753 |
+
stat.kv_cache_stats, key):
|
754 |
+
value = getattr(stat.kv_cache_stats, key)
|
755 |
+
elif stat.static_batching_stats is not None and hasattr(
|
756 |
+
stat.static_batching_stats, key):
|
757 |
+
value = getattr(stat.static_batching_stats, key)
|
758 |
+
elif stat.inflight_batching_stats is not None and hasattr(
|
759 |
+
stat.inflight_batching_stats, key):
|
760 |
+
value = getattr(stat.inflight_batching_stats, key)
|
761 |
+
if value is not None:
|
762 |
+
if key == "timestamp":
|
763 |
+
value = convert_timestamp_to_seconds(value)
|
764 |
+
metric.set(value)
|
765 |
+
else:
|
766 |
+
pb_utils.Logger.log_warn(
|
767 |
+
f"Metric \"{key}\" not found.")
|
768 |
+
except Exception as e:
|
769 |
+
pb_utils.Logger.log_warn(
|
770 |
+
f"Error while processing metrics: {e}")
|
771 |
+
|
772 |
+
def finalize(self):
|
773 |
+
"""`finalize` is called only once when the model is being unloaded.
|
774 |
+
Implementing `finalize` function is optional. This function allows
|
775 |
+
the model to perform any necessary clean ups before exit.
|
776 |
+
"""
|
777 |
+
if self.executor.can_enqueue_requests():
|
778 |
+
self.running = False
|
779 |
+
self.awaiter_thread.join()
|
780 |
+
self.cancellation_thread.join()
|
781 |
+
self.metrics_thread.join()
|
782 |
+
self.executor.shutdown()
|
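
model.py drives the TensorRT-LLM executor API: each Triton request is converted into a trtllm.Request, enqueued, and answered asynchronously from the awaiter thread, which is why the model requires the decoupled transaction policy. A minimal sketch of a streaming gRPC client for this model, assuming the server is reachable on localhost:8001 and that the token ids were already produced by the preprocessing model (the ids below are illustrative only):

# Sketch only: stream tokens from the decoupled "tensorrt_llm" model over gRPC.
import queue
import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype

responses = queue.Queue()

def on_response(result, error):
    # Called once per partial response because the model is decoupled.
    responses.put(error if error else result.as_numpy("output_ids"))

client = grpcclient.InferenceServerClient("localhost:8001")

tensors = []
for name, value in [
    ("input_ids", np.array([[1, 733, 16289, 28793]], dtype=np.int32)),  # illustrative ids
    ("input_lengths", np.array([[4]], dtype=np.int32)),
    ("request_output_len", np.array([[64]], dtype=np.int32)),
    ("streaming", np.array([[True]], dtype=bool)),
]:
    t = grpcclient.InferInput(name, value.shape, np_to_triton_dtype(value.dtype))
    t.set_data_from_numpy(value)
    tensors.append(t)

client.start_stream(callback=on_response)
client.async_stream_infer("tensorrt_llm", tensors, request_id="1")
client.stop_stream()  # blocks until the stream is drained

while not responses.empty():
    print(responses.get())
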
tensorrt_llm/1/rank0.engine
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c545694cbc76c5a65d4650a2d7897cc98ab2382bce3d198acfa97c003bfea6c
+size 47006220780
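
rank0.engine is stored through Git LFS, so the repository itself only carries this pointer (a SHA-256 digest plus a size of roughly 47 GB). A small sketch for checking that a fetched engine matches the pointer, assuming it was pulled to tensorrt_llm/1/rank0.engine:

# Sketch only: verify the LFS-tracked engine against the pointer's oid and size.
import hashlib
import os

path = "tensorrt_llm/1/rank0.engine"
expected_oid = "9c545694cbc76c5a65d4650a2d7897cc98ab2382bce3d198acfa97c003bfea6c"
expected_size = 47006220780

assert os.path.getsize(path) == expected_size, "size mismatch (file may still be an LFS pointer)"

digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)
assert digest.hexdigest() == expected_oid, "sha256 mismatch"
print("rank0.engine matches its LFS pointer")
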
tensorrt_llm/config.pbtxt
ADDED
@@ -0,0 +1,537 @@
1 |
+
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
name: "tensorrt_llm"
|
28 |
+
backend: "tensorrtllm"
|
29 |
+
max_batch_size: 16
|
30 |
+
|
31 |
+
model_transaction_policy {
|
32 |
+
decoupled: true
|
33 |
+
}
|
34 |
+
|
35 |
+
|
36 |
+
input [
|
37 |
+
{
|
38 |
+
name: "input_ids"
|
39 |
+
data_type: TYPE_INT32
|
40 |
+
dims: [ -1 ]
|
41 |
+
allow_ragged_batch: true
|
42 |
+
},
|
43 |
+
{
|
44 |
+
name: "input_lengths"
|
45 |
+
data_type: TYPE_INT32
|
46 |
+
dims: [ 1 ]
|
47 |
+
reshape: { shape: [ ] }
|
48 |
+
},
|
49 |
+
{
|
50 |
+
name: "request_output_len"
|
51 |
+
data_type: TYPE_INT32
|
52 |
+
dims: [ 1 ]
|
53 |
+
},
|
54 |
+
{
|
55 |
+
name: "draft_input_ids"
|
56 |
+
data_type: TYPE_INT32
|
57 |
+
dims: [ -1 ]
|
58 |
+
optional: true
|
59 |
+
allow_ragged_batch: true
|
60 |
+
},
|
61 |
+
{
|
62 |
+
name: "decoder_input_ids"
|
63 |
+
data_type: TYPE_INT32
|
64 |
+
dims: [ -1 ]
|
65 |
+
optional: true
|
66 |
+
allow_ragged_batch: true
|
67 |
+
},
|
68 |
+
{
|
69 |
+
name: "decoder_input_lengths"
|
70 |
+
data_type: TYPE_INT32
|
71 |
+
dims: [ 1 ]
|
72 |
+
optional: true
|
73 |
+
reshape: { shape: [ ] }
|
74 |
+
},
|
75 |
+
{
|
76 |
+
name: "draft_logits"
|
77 |
+
data_type: TYPE_FP32
|
78 |
+
dims: [ -1, -1 ]
|
79 |
+
optional: true
|
80 |
+
allow_ragged_batch: true
|
81 |
+
},
|
82 |
+
{
|
83 |
+
name: "draft_acceptance_threshold"
|
84 |
+
data_type: TYPE_FP32
|
85 |
+
dims: [ 1 ]
|
86 |
+
reshape: { shape: [ ] }
|
87 |
+
optional: true
|
88 |
+
},
|
89 |
+
{
|
90 |
+
name: "end_id"
|
91 |
+
data_type: TYPE_INT32
|
92 |
+
dims: [ 1 ]
|
93 |
+
reshape: { shape: [ ] }
|
94 |
+
optional: true
|
95 |
+
},
|
96 |
+
{
|
97 |
+
name: "pad_id"
|
98 |
+
data_type: TYPE_INT32
|
99 |
+
dims: [ 1 ]
|
100 |
+
reshape: { shape: [ ] }
|
101 |
+
optional: true
|
102 |
+
},
|
103 |
+
{
|
104 |
+
name: "stop_words_list"
|
105 |
+
data_type: TYPE_INT32
|
106 |
+
dims: [ 2, -1 ]
|
107 |
+
optional: true
|
108 |
+
allow_ragged_batch: true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
name: "bad_words_list"
|
112 |
+
data_type: TYPE_INT32
|
113 |
+
dims: [ 2, -1 ]
|
114 |
+
optional: true
|
115 |
+
allow_ragged_batch: true
|
116 |
+
},
|
117 |
+
{
|
118 |
+
name: "embedding_bias"
|
119 |
+
data_type: TYPE_FP32
|
120 |
+
dims: [ -1 ]
|
121 |
+
optional: true
|
122 |
+
allow_ragged_batch: true
|
123 |
+
},
|
124 |
+
{
|
125 |
+
name: "beam_width"
|
126 |
+
data_type: TYPE_INT32
|
127 |
+
dims: [ 1 ]
|
128 |
+
reshape: { shape: [ ] }
|
129 |
+
optional: true
|
130 |
+
},
|
131 |
+
{
|
132 |
+
name: "temperature"
|
133 |
+
data_type: TYPE_FP32
|
134 |
+
dims: [ 1 ]
|
135 |
+
reshape: { shape: [ ] }
|
136 |
+
optional: true
|
137 |
+
},
|
138 |
+
{
|
139 |
+
name: "runtime_top_k"
|
140 |
+
data_type: TYPE_INT32
|
141 |
+
dims: [ 1 ]
|
142 |
+
reshape: { shape: [ ] }
|
143 |
+
optional: true
|
144 |
+
},
|
145 |
+
{
|
146 |
+
name: "runtime_top_p"
|
147 |
+
data_type: TYPE_FP32
|
148 |
+
dims: [ 1 ]
|
149 |
+
reshape: { shape: [ ] }
|
150 |
+
optional: true
|
151 |
+
},
|
152 |
+
{
|
153 |
+
name: "runtime_top_p_min"
|
154 |
+
data_type: TYPE_FP32
|
155 |
+
dims: [ 1 ]
|
156 |
+
reshape: { shape: [ ] }
|
157 |
+
optional: true
|
158 |
+
},
|
159 |
+
{
|
160 |
+
name: "runtime_top_p_decay"
|
161 |
+
data_type: TYPE_FP32
|
162 |
+
dims: [ 1 ]
|
163 |
+
reshape: { shape: [ ] }
|
164 |
+
optional: true
|
165 |
+
},
|
166 |
+
{
|
167 |
+
name: "runtime_top_p_reset_ids"
|
168 |
+
data_type: TYPE_INT32
|
169 |
+
dims: [ 1 ]
|
170 |
+
reshape: { shape: [ ] }
|
171 |
+
optional: true
|
172 |
+
},
|
173 |
+
{
|
174 |
+
name: "len_penalty"
|
175 |
+
data_type: TYPE_FP32
|
176 |
+
dims: [ 1 ]
|
177 |
+
reshape: { shape: [ ] }
|
178 |
+
optional: true
|
179 |
+
},
|
180 |
+
{
|
181 |
+
name: "early_stopping"
|
182 |
+
data_type: TYPE_BOOL
|
183 |
+
dims: [ 1 ]
|
184 |
+
reshape: { shape: [ ] }
|
185 |
+
optional: true
|
186 |
+
},
|
187 |
+
{
|
188 |
+
name: "repetition_penalty"
|
189 |
+
data_type: TYPE_FP32
|
190 |
+
dims: [ 1 ]
|
191 |
+
reshape: { shape: [ ] }
|
192 |
+
optional: true
|
193 |
+
},
|
194 |
+
{
|
195 |
+
name: "min_length"
|
196 |
+
data_type: TYPE_INT32
|
197 |
+
dims: [ 1 ]
|
198 |
+
reshape: { shape: [ ] }
|
199 |
+
optional: true
|
200 |
+
},
|
201 |
+
{
|
202 |
+
name: "beam_search_diversity_rate"
|
203 |
+
data_type: TYPE_FP32
|
204 |
+
dims: [ 1 ]
|
205 |
+
reshape: { shape: [ ] }
|
206 |
+
optional: true
|
207 |
+
},
|
208 |
+
{
|
209 |
+
name: "presence_penalty"
|
210 |
+
data_type: TYPE_FP32
|
211 |
+
dims: [ 1 ]
|
212 |
+
reshape: { shape: [ ] }
|
213 |
+
optional: true
|
214 |
+
},
|
215 |
+
{
|
216 |
+
name: "frequency_penalty"
|
217 |
+
data_type: TYPE_FP32
|
218 |
+
dims: [ 1 ]
|
219 |
+
reshape: { shape: [ ] }
|
220 |
+
optional: true
|
221 |
+
},
|
222 |
+
{
|
223 |
+
name: "random_seed"
|
224 |
+
data_type: TYPE_UINT64
|
225 |
+
dims: [ 1 ]
|
226 |
+
reshape: { shape: [ ] }
|
227 |
+
optional: true
|
228 |
+
},
|
229 |
+
{
|
230 |
+
name: "return_log_probs"
|
231 |
+
data_type: TYPE_BOOL
|
232 |
+
dims: [ 1 ]
|
233 |
+
reshape: { shape: [ ] }
|
234 |
+
optional: true
|
235 |
+
},
|
236 |
+
{
|
237 |
+
name: "return_context_logits"
|
238 |
+
data_type: TYPE_BOOL
|
239 |
+
dims: [ 1 ]
|
240 |
+
reshape: { shape: [ ] }
|
241 |
+
optional: true
|
242 |
+
},
|
243 |
+
{
|
244 |
+
name: "return_generation_logits"
|
245 |
+
data_type: TYPE_BOOL
|
246 |
+
dims: [ 1 ]
|
247 |
+
reshape: { shape: [ ] }
|
248 |
+
optional: true
|
249 |
+
},
|
250 |
+
{
|
251 |
+
name: "stop"
|
252 |
+
data_type: TYPE_BOOL
|
253 |
+
dims: [ 1 ]
|
254 |
+
optional: true
|
255 |
+
},
|
256 |
+
{
|
257 |
+
name: "streaming"
|
258 |
+
data_type: TYPE_BOOL
|
259 |
+
dims: [ 1 ]
|
260 |
+
optional: true
|
261 |
+
},
|
262 |
+
{
|
263 |
+
name: "prompt_embedding_table"
|
264 |
+
data_type: TYPE_FP16
|
265 |
+
dims: [ -1, -1 ]
|
266 |
+
optional: true
|
267 |
+
allow_ragged_batch: true
|
268 |
+
},
|
269 |
+
{
|
270 |
+
name: "prompt_vocab_size"
|
271 |
+
data_type: TYPE_INT32
|
272 |
+
dims: [ 1 ]
|
273 |
+
reshape: { shape: [ ] }
|
274 |
+
optional: true
|
275 |
+
},
|
276 |
+
# the unique task ID for the given LoRA.
|
277 |
+
# To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given.
|
278 |
+
# The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
|
279 |
+
# If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
|
280 |
+
{
|
281 |
+
name: "lora_task_id"
|
282 |
+
data_type: TYPE_UINT64
|
283 |
+
dims: [ 1 ]
|
284 |
+
reshape: { shape: [ ] }
|
285 |
+
optional: true
|
286 |
+
},
|
287 |
+
# weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
|
288 |
+
# where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
|
289 |
+
# each of the in / out tensors are first flattened and then concatenated together in the format above.
|
290 |
+
# D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
|
291 |
+
{
|
292 |
+
name: "lora_weights"
|
293 |
+
data_type: TYPE_FP16
|
294 |
+
dims: [ -1, -1 ]
|
295 |
+
optional: true
|
296 |
+
allow_ragged_batch: true
|
297 |
+
},
|
298 |
+
# module identifier (same size as the first dimension of lora_weights)
|
299 |
+
# See LoraModule::ModuleType for model id mapping
|
300 |
+
#
|
301 |
+
# "attn_qkv": 0 # compbined qkv adapter
|
302 |
+
# "attn_q": 1 # q adapter
|
303 |
+
# "attn_k": 2 # k adapter
|
304 |
+
# "attn_v": 3 # v adapter
|
305 |
+
# "attn_dense": 4 # adapter for the dense layer in attention
|
306 |
+
# "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
|
307 |
+
# "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
|
308 |
+
# "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate
|
309 |
+
#
|
310 |
+
# last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
|
311 |
+
{
|
312 |
+
name: "lora_config"
|
313 |
+
data_type: TYPE_INT32
|
314 |
+
dims: [ -1, 3 ]
|
315 |
+
optional: true
|
316 |
+
allow_ragged_batch: true
|
317 |
+
}
|
318 |
+
]
|
319 |
+
output [
|
320 |
+
{
|
321 |
+
name: "output_ids"
|
322 |
+
data_type: TYPE_INT32
|
323 |
+
dims: [ -1, -1 ]
|
324 |
+
},
|
325 |
+
{
|
326 |
+
name: "sequence_length"
|
327 |
+
data_type: TYPE_INT32
|
328 |
+
dims: [ -1 ]
|
329 |
+
},
|
330 |
+
{
|
331 |
+
name: "cum_log_probs"
|
332 |
+
data_type: TYPE_FP32
|
333 |
+
dims: [ -1 ]
|
334 |
+
},
|
335 |
+
{
|
336 |
+
name: "output_log_probs"
|
337 |
+
data_type: TYPE_FP32
|
338 |
+
dims: [ -1, -1 ]
|
339 |
+
},
|
340 |
+
{
|
341 |
+
name: "context_logits"
|
342 |
+
data_type: TYPE_FP32
|
343 |
+
dims: [ -1, -1 ]
|
344 |
+
},
|
345 |
+
{
|
346 |
+
name: "generation_logits"
|
347 |
+
data_type: TYPE_FP32
|
348 |
+
dims: [ -1, -1, -1 ]
|
349 |
+
}
|
350 |
+
]
|
351 |
+
instance_group [
|
352 |
+
{
|
353 |
+
count: 1
|
354 |
+
kind : KIND_CPU
|
355 |
+
}
|
356 |
+
]
|
357 |
+
parameters: {
|
358 |
+
key: "max_beam_width"
|
359 |
+
value: {
|
360 |
+
string_value: "1"
|
361 |
+
}
|
362 |
+
}
|
363 |
+
parameters: {
|
364 |
+
key: "FORCE_CPU_ONLY_INPUT_TENSORS"
|
365 |
+
value: {
|
366 |
+
string_value: "no"
|
367 |
+
}
|
368 |
+
}
|
369 |
+
parameters: {
|
370 |
+
key: "gpt_model_type"
|
371 |
+
value: {
|
372 |
+
string_value: "inflight_batching"
|
373 |
+
}
|
374 |
+
}
|
375 |
+
parameters: {
|
376 |
+
key: "gpt_model_path"
|
377 |
+
value: {
|
378 |
+
string_value: "/all_models/inflight_batcher_llm/tensorrt_llm/1"
|
379 |
+
}
|
380 |
+
}
|
381 |
+
parameters: {
|
382 |
+
key: "encoder_model_path"
|
383 |
+
value: {
|
384 |
+
string_value: "${encoder_engine_dir}"
|
385 |
+
}
|
386 |
+
}
|
387 |
+
parameters: {
|
388 |
+
key: "max_tokens_in_paged_kv_cache"
|
389 |
+
value: {
|
390 |
+
string_value: "${max_tokens_in_paged_kv_cache}"
|
391 |
+
}
|
392 |
+
}
|
393 |
+
parameters: {
|
394 |
+
key: "max_attention_window_size"
|
395 |
+
value: {
|
396 |
+
string_value: "${max_attention_window_size}"
|
397 |
+
}
|
398 |
+
}
|
399 |
+
parameters: {
|
400 |
+
key: "sink_token_length"
|
401 |
+
value: {
|
402 |
+
string_value: "${sink_token_length}"
|
403 |
+
}
|
404 |
+
}
|
405 |
+
parameters: {
|
406 |
+
key: "batch_scheduler_policy"
|
407 |
+
value: {
|
408 |
+
string_value: "guaranteed_no_evict"
|
409 |
+
}
|
410 |
+
}
|
411 |
+
parameters: {
|
412 |
+
key: "kv_cache_free_gpu_mem_fraction"
|
413 |
+
value: {
|
414 |
+
string_value: "0.8"
|
415 |
+
}
|
416 |
+
}
|
417 |
+
parameters: {
|
418 |
+
key: "kv_cache_host_memory_bytes"
|
419 |
+
value: {
|
420 |
+
string_value: "${kv_cache_host_memory_bytes}"
|
421 |
+
}
|
422 |
+
}
|
423 |
+
parameters: {
|
424 |
+
key: "kv_cache_onboard_blocks"
|
425 |
+
value: {
|
426 |
+
string_value: "${kv_cache_onboard_blocks}"
|
427 |
+
}
|
428 |
+
}
|
429 |
+
# enable_trt_overlap is deprecated and doesn't have any effect on the runtime
|
430 |
+
# parameters: {
|
431 |
+
# key: "enable_trt_overlap"
|
432 |
+
# value: {
|
433 |
+
# string_value: "${enable_trt_overlap}"
|
434 |
+
# }
|
435 |
+
# }
|
436 |
+
parameters: {
|
437 |
+
key: "exclude_input_in_output"
|
438 |
+
value: {
|
439 |
+
string_value: "true"
|
440 |
+
}
|
441 |
+
}
|
442 |
+
parameters: {
|
443 |
+
key: "cancellation_check_period_ms"
|
444 |
+
value: {
|
445 |
+
string_value: "${cancellation_check_period_ms}"
|
446 |
+
}
|
447 |
+
}
|
448 |
+
parameters: {
|
449 |
+
key: "stats_check_period_ms"
|
450 |
+
value: {
|
451 |
+
string_value: "${stats_check_period_ms}"
|
452 |
+
}
|
453 |
+
}
|
454 |
+
parameters: {
|
455 |
+
key: "iter_stats_max_iterations"
|
456 |
+
value: {
|
457 |
+
string_value: "${iter_stats_max_iterations}"
|
458 |
+
}
|
459 |
+
}
|
460 |
+
parameters: {
|
461 |
+
key: "request_stats_max_iterations"
|
462 |
+
value: {
|
463 |
+
string_value: "${request_stats_max_iterations}"
|
464 |
+
}
|
465 |
+
}
|
466 |
+
parameters: {
|
467 |
+
key: "enable_kv_cache_reuse"
|
468 |
+
value: {
|
469 |
+
string_value: "${enable_kv_cache_reuse}"
|
470 |
+
}
|
471 |
+
}
|
472 |
+
parameters: {
|
473 |
+
key: "normalize_log_probs"
|
474 |
+
value: {
|
475 |
+
string_value: "${normalize_log_probs}"
|
476 |
+
}
|
477 |
+
}
|
478 |
+
parameters: {
|
479 |
+
key: "enable_chunked_context"
|
480 |
+
value: {
|
481 |
+
string_value: "${enable_chunked_context}"
|
482 |
+
}
|
483 |
+
}
|
484 |
+
parameters: {
|
485 |
+
key: "gpu_device_ids"
|
486 |
+
value: {
|
487 |
+
string_value: "${gpu_device_ids}"
|
488 |
+
}
|
489 |
+
}
|
490 |
+
parameters: {
|
491 |
+
key: "lora_cache_optimal_adapter_size"
|
492 |
+
value: {
|
493 |
+
string_value: "${lora_cache_optimal_adapter_size}"
|
494 |
+
}
|
495 |
+
}
|
496 |
+
parameters: {
|
497 |
+
key: "lora_cache_max_adapter_size"
|
498 |
+
value: {
|
499 |
+
string_value: "${lora_cache_max_adapter_size}"
|
500 |
+
}
|
501 |
+
}
|
502 |
+
parameters: {
|
503 |
+
key: "lora_cache_gpu_memory_fraction"
|
504 |
+
value: {
|
505 |
+
string_value: "${lora_cache_gpu_memory_fraction}"
|
506 |
+
}
|
507 |
+
}
|
508 |
+
parameters: {
|
509 |
+
key: "lora_cache_host_memory_bytes"
|
510 |
+
value: {
|
511 |
+
string_value: "${lora_cache_host_memory_bytes}"
|
512 |
+
}
|
513 |
+
}
|
514 |
+
parameters: {
|
515 |
+
key: "decoding_mode"
|
516 |
+
value: {
|
517 |
+
string_value: "${decoding_mode}"
|
518 |
+
}
|
519 |
+
}
|
520 |
+
parameters: {
|
521 |
+
key: "executor_worker_path"
|
522 |
+
value: {
|
523 |
+
string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
|
524 |
+
}
|
525 |
+
}
|
526 |
+
parameters: {
|
527 |
+
key: "medusa_choices"
|
528 |
+
value: {
|
529 |
+
string_value: "${medusa_choices}"
|
530 |
+
}
|
531 |
+
}
|
532 |
+
parameters: {
|
533 |
+
key: "gpu_weights_percent"
|
534 |
+
value: {
|
535 |
+
string_value: "${gpu_weights_percent}"
|
536 |
+
}
|
537 |
+
}
|
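The lora_weights / lora_config layout described in the comments above can be illustrated with a small numpy sketch. All sizes and the single-module choice below are made-up assumptions for illustration, not values taken from this repository:

import numpy as np

num_layers = 2     # model layers carrying an adapter (assumed)
D = 8              # adapter_size (R value), assumed
Hi = Ho = 4096     # hidden_size_in / hidden_size_out, assumed

# lora_config: one row per (module, layer); last dim = [ module_id, layer_idx, adapter_size ].
# module_id 0 corresponds to "attn_qkv" in the mapping quoted above.
lora_config = np.array([[0, layer, D] for layer in range(num_layers)],
                       dtype=np.int32)

# lora_weights: per row, the flattened "in" weights (D x Hi) followed by the
# flattened "out" weights (Ho x D), concatenated along the last dimension.
row_len = D * Hi + Ho * D
lora_weights = np.zeros((num_layers, row_len), dtype=np.float16)

print(lora_config.shape)   # (2, 3)      -> matches dims [ -1, 3 ]
print(lora_weights.shape)  # (2, 65536)  -> matches dims [ -1, -1 ]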
tensorrt_llm_bls/1/__pycache__/model.cpython-310.pyc
ADDED
Binary file (2.72 kB).
tensorrt_llm_bls/1/lib/__pycache__/decode.cpython-310.pyc
ADDED
Binary file (9.05 kB).
tensorrt_llm_bls/1/lib/__pycache__/triton_decoder.cpython-310.pyc
ADDED
Binary file (9.73 kB).
tensorrt_llm_bls/1/lib/decode.py
ADDED
@@ -0,0 +1,333 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from collections.abc import Generator
from dataclasses import dataclass
from typing import Optional

import numpy as np


class RequestValidationError(Exception):
    pass


def _validate_that(condition: bool, msg: str):
    if not condition:
        raise RequestValidationError(msg)


def _validate_non_empty(data, msg: str):
    _validate_that(data is not None and data.size > 0, msg)


def _validate_single_gt_0(data, msg: str):
    _validate_non_empty(data, msg)
    _validate_that(data.flatten()[0] > 0, msg)


def _single_value(data: Optional[np.ndarray]):
    if data is None:
        return None
    return data.flatten()[0]


@dataclass
class Request:
    text_input: np.ndarray = np.array([])
    decoder_text_input: np.ndarray = None
    max_tokens: np.ndarray = np.array([])
    bad_words: Optional[np.ndarray] = None
    stop_words: Optional[np.ndarray] = None
    end_id: Optional[np.ndarray] = None
    pad_id: Optional[np.ndarray] = None
    top_k: Optional[np.ndarray] = None
    top_p: Optional[np.ndarray] = None
    temperature: Optional[np.ndarray] = None
    length_penalty: Optional[np.ndarray] = None
    repetition_penalty: Optional[np.ndarray] = None
    min_length: Optional[np.ndarray] = None
    return_log_probs: Optional[np.ndarray] = None
    prompt_embedding_table: Optional[np.ndarray] = None
    prompt_vocab_size: Optional[np.ndarray] = None
    embedding_bias_words: Optional[np.ndarray] = None
    embedding_bias_weights: Optional[np.ndarray] = None
    num_draft_tokens: Optional[np.ndarray] = None
    use_draft_logits: Optional[np.ndarray] = None
    stream: Optional[np.ndarray] = None
    beam_width: Optional[np.ndarray] = None
    return_context_logits: Optional[np.ndarray] = None
    return_generation_logits: Optional[np.ndarray] = None
    random_seed: Optional[np.ndarray] = None
    presence_penalty: Optional[np.ndarray] = None
    frequency_penalty: Optional[np.ndarray] = None

    def validate(self):
        _validate_non_empty(self.text_input, "text_input is required")
        _validate_single_gt_0(self.max_tokens,
                              "max_tokens must be a single value > 0")

        num_draft_tokens = _single_value(self.num_draft_tokens)
        stream = _single_value(self.stream)
        _single_value(self.return_generation_logits)
        context_logits = _single_value(self.return_context_logits)

        if num_draft_tokens:
            _validate_that(
                not stream,
                "streaming is not supported with speculative decoding")
            _validate_that(
                not context_logits,
                "context logits are not supported with speculative decoding")


@dataclass
class DraftRequest:
    draft_input_ids: Optional[np.ndarray] = None
    draft_logits: Optional[np.ndarray] = None


@dataclass
class PreprocResponse:
    input_ids: np.ndarray = np.array([])
    decoder_input_ids: np.ndarray = None
    input_lengths: np.ndarray = np.array([])
    decoder_input_lengths: np.ndarray = None
    bad_words_list: Optional[np.ndarray] = None
    stop_words_list: Optional[np.ndarray] = None
    embedding_bias: Optional[np.ndarray] = None
    end_id: Optional[np.ndarray] = None
    pad_id: Optional[np.ndarray] = None

    @classmethod
    def with_new_inputs(cls,
                        other,
                        input_ids: Optional[np.ndarray] = None,
                        input_lengths: Optional[np.ndarray] = None):
        return cls(
            input_ids=(input_ids
                       if input_ids is not None else other.input_ids),
            input_lengths=(input_lengths if input_lengths is not None else
                           other.input_lengths),
            decoder_input_ids=other.decoder_input_ids,
            decoder_input_lengths=other.decoder_input_lengths,
            bad_words_list=other.bad_words_list,
            stop_words_list=other.stop_words_list,
            end_id=other.end_id,
            pad_id=other.pad_id,
        )


@dataclass
class GenerationResponse:
    output_ids: np.ndarray = np.array([])
    sequence_length: np.ndarray = np.array([])
    cum_log_probs: Optional[np.ndarray] = None
    output_log_probs: Optional[np.ndarray] = None
    context_logits: Optional[np.ndarray] = None
    generation_logits: Optional[np.ndarray] = None


@dataclass
class Response:
    text_output: np.ndarray = np.array([])
    cum_log_probs: Optional[np.ndarray] = None
    output_log_probs: Optional[np.ndarray] = None
    context_logits: Optional[np.ndarray] = None
    generation_logits: Optional[np.ndarray] = None

    def __eq__(self, o) -> bool:
        """Just for testing"""
        if not isinstance(o, Response):
            return False
        return (np.array_equal(self.text_output, o.text_output)
                and np.array_equal(self.cum_log_probs, o.cum_log_probs)
                and np.array_equal(self.output_log_probs, o.output_log_probs)
                and np.array_equal(self.context_logits, o.context_logits) and
                np.array_equal(self.generation_logits, o.generation_logits))


class Decoder:

    def __init__(self, streaming=False, accumulate=False):
        self._streaming = streaming
        self._accumulate = accumulate

        self._accumulated_tokens = None

    def decode(self,
               request: Request,
               speculative_decoding=False) -> Generator[Response, None, None]:
        preproc_response = self.preprocess(request)

        if speculative_decoding:
            for gen_response in self._spec_generate(preproc_response, request):
                yield self.postprocess(gen_response)
        else:
            if not self._streaming:
                gen_response = self._generate_non_streaming(
                    preproc_response, request)
                yield self.postprocess(gen_response)
            else:
                for gen_response in self._generate(preproc_response, request):
                    yield self.postprocess(gen_response)

    def encountered_stop_words(self, input_ids, stop_words_ids):
        for stop_word_ids in stop_words_ids:
            if np.array_equal(input_ids[-len(stop_word_ids):], stop_word_ids):
                return True
        return False

    def _spec_generate(
            self, preproc: PreprocResponse,
            request: Request) -> Generator[GenerationResponse, None, None]:

        prompt_input_ids: np.ndarray = preproc.input_ids[0]
        input_ids: np.ndarray = prompt_input_ids
        output_len: int = request.max_tokens[0][0]
        last_input_ids: np.ndarray = None
        draft_output_ids: np.ndarray = None
        draft_logits: np.ndarray = None

        target_response: GenerationResponse = None

        cur_preproc = preproc

        counter = 0
        while True:
            counter += 1
            num_draft_tokens = min(
                request.num_draft_tokens[0][0],
                len(prompt_input_ids) + output_len - len(input_ids) - 1)

            draft_request = None
            if num_draft_tokens > 0:
                draft_response: GenerationResponse = self._draft_generate_non_streaming(
                    cur_preproc, request, num_draft_tokens)
                seq_len: int = draft_response.sequence_length[0][0]
                # [1, beamWidth, outputLength] -> [outputLen]
                draft_output_ids = draft_response.output_ids[0][0]
                # [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded]
                if request.use_draft_logits is not None and request.use_draft_logits[
                        0]:
                    if draft_response.generation_logits is not None:
                        draft_logits = draft_response.generation_logits[0][0]

                input_draft_tokens = draft_output_ids[len(input_ids):seq_len]
                draft_request = DraftRequest(
                    draft_input_ids=np.expand_dims(input_draft_tokens, 0))
                if request.use_draft_logits is not None and request.use_draft_logits[
                        0]:
                    draft_request.draft_logits = np.expand_dims(
                        draft_logits[-len(input_draft_tokens):], 0)
            else:
                draft_request = DraftRequest()
            target_response = self._generate_non_streaming(
                cur_preproc, request, draft_request)
            last_input_ids = input_ids
            input_ids = target_response.output_ids[0][0]
            cur_preproc = PreprocResponse.with_new_inputs(
                cur_preproc, np.expand_dims(input_ids, 0),
                np.array([[len(input_ids)]], dtype=np.int32))

            # Evaluate criteria to stop generation loop.
            # If we've hit or exceeded the max output length, should stop
            length_stop = (len(input_ids) >=
                           len(prompt_input_ids) + output_len)
            if length_stop:
                break
            # If draft and target have same outputs, should stop. Normally target should return 1 more token.
            # If they are the same length, they should differ at the last token
            target_draft_equal = draft_output_ids is not None and np.array_equal(
                draft_output_ids, input_ids)
            if target_draft_equal:
                break
            # If tokens no longer change, should stop, means we have hit early stopping
            last_current_equal = np.array_equal(last_input_ids, input_ids)
            if last_current_equal:
                break
            # Need to check if stop words was encountered
            hit_stop_words = self.encountered_stop_words(
                input_ids, preproc.stop_words_list[0])
            if hit_stop_words:
                break

        yield target_response

    def _draft_generate_non_streaming(
            self, preproc: PreprocResponse, request: Request,
            num_draft_tokens: int) -> GenerationResponse:
        raise NotImplementedError()

    def _generate(
        self,
        preproc: PreprocResponse,
        request: Request,
        draft_request: Optional[DraftRequest] = None
    ) -> Generator[GenerationResponse, None, None]:
        raise NotImplementedError()

    def _generate_non_streaming(
        self,
        preproc: PreprocResponse,
        request: Request,
        draft_request: Optional[DraftRequest] = None
    ) -> GenerationResponse:
        raise NotImplementedError()

    def postprocess(self, gen_response: GenerationResponse) -> Response:
        if self._accumulate and self._streaming:
            new_tokens: np.ndarray = gen_response.output_ids
            if new_tokens.ndim != 3:
                raise Exception("Expected output_ids tensor to have 3 dims.")
            if new_tokens.shape[0] != 1:
                raise Exception("Expected batch size of 1")
            if new_tokens.shape[1] != 1:
                raise Exception(
                    "Accumulation of tokens is only implemented for beam width = 1"
                )

            self._accumulated_tokens = new_tokens if (
                self._accumulated_tokens is None) else np.concatenate(
                    (self._accumulated_tokens, new_tokens), axis=2)
            sequence_lengths = np.array([[self._accumulated_tokens.shape[2]]],
                                        dtype=np.int32)
            return self._postprocess(self._accumulated_tokens,
                                     sequence_lengths, gen_response)
        else:
            return self._postprocess(gen_response.output_ids, None,
                                     gen_response)

    def _postprocess(self, tokens: np.ndarray,
                     sequence_lengths: Optional[np.ndarray],
                     gen_response: GenerationResponse) -> Response:
        raise NotImplementedError()

    def preprocess(self, request: Request) -> PreprocResponse:
        raise NotImplementedError()

    def reset_decoder(self):
        self._accumulated_tokens = None
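The accumulate-and-postprocess path of Decoder.postprocess concatenates each streamed chunk of output_ids along the token axis and reports the accumulated length as the sequence length. A minimal, self-contained numpy sketch of that bookkeeping (token values are arbitrary):

import numpy as np

# Two streamed chunks shaped [batch=1, beam=1, new_tokens], as in the code above.
chunk_1 = np.array([[[101, 102]]])
chunk_2 = np.array([[[103]]])

accumulated = None
for new_tokens in (chunk_1, chunk_2):
    accumulated = new_tokens if accumulated is None else np.concatenate(
        (accumulated, new_tokens), axis=2)

print(accumulated)           # [[[101 102 103]]]
print(accumulated.shape[2])  # 3 -> passed on as SEQUENCE_LENGTH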
tensorrt_llm_bls/1/lib/triton_decoder.py
ADDED
@@ -0,0 +1,440 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from collections.abc import Callable
from typing import Dict, Optional

import numpy as np
import triton_python_backend_utils as pb_utils
from lib.decode import *
from typing_extensions import override


class TritonDecoder(Decoder):

    def __init__(self,
                 streaming=False,
                 accumulate=False,
                 preproc_model_name="preprocessing",
                 postproc_model_name="postprocessing",
                 llm_model_name="tensorrt_llm",
                 draft_llm_model_name: Optional[str] = None):
        super().__init__(streaming=streaming, accumulate=accumulate)
        self.preproc_model_name = preproc_model_name
        self.postproc_model_name = postproc_model_name
        self.llm_model_name = llm_model_name
        self.draft_llm_model_name = draft_llm_model_name

        self._preproc_outputs = [
            "INPUT_ID",
            "DECODER_INPUT_ID",
            "REQUEST_INPUT_LEN",
            "REQUEST_DECODER_INPUT_LEN",
            "BAD_WORDS_IDS",
            "STOP_WORDS_IDS",
            "EMBEDDING_BIAS",
            "OUT_PAD_ID",
            "OUT_END_ID",
        ]

        self._llm_outputs = [
            "output_ids",
            "sequence_length",
            "cum_log_probs",
            "output_log_probs",
            "context_logits",
            "generation_logits",
        ]

        self._postproc_outputs = [
            "OUTPUT",
        ]

        self.input_names = [
            "text_input",
            "decoder_text_input",
            "max_tokens",
            "bad_words",
            "stop_words",
            "end_id",
            "pad_id",
            "top_k",
            "top_p",
            "temperature",
            "length_penalty",
            "repetition_penalty",
            "min_length",
            "presence_penalty",
            "frequency_penalty",
            "random_seed",
            "return_log_probs",
            "return_context_logits",
            "return_generation_logits",
            "beam_width",
            "stream",
            "prompt_embedding_table",
            "prompt_vocab_size",
            "embedding_bias_words",
            "embedding_bias_weights",
            "num_draft_tokens",
            "use_draft_logits",
        ]

        self.__undo_reshape_whitelist = {
            "max_tokens",
            "end_id",
            "pad_id",
            "top_k",
            "top_p",
            "temperature",
            "length_penalty",
            "repetition_penalty",
            "min_length",
            "presence_penalty",
            "frequency_penalty",
            "random_seed",
            "return_log_probs",
            "return_context_logits",
            "return_generation_logits",
            "beam_width",
            "stream",
            "prompt_vocab_size",
            "num_draft_tokens",
            "use_draft_logits",
        }

    def _exec_triton_request(self, request):
        responses = request.exec(decoupled=True)
        for r in responses:
            if r.has_error():
                raise pb_utils.TritonModelException(r.error().message())
            yield r

    def _exec_triton_request_single(self, request):
        responses = request.exec(decoupled=False)
        if responses.has_error():
            raise pb_utils.TritonModelException(responses.error().message())
        return responses

    def create_triton_response(self, response: Response):
        name_map = {
            "text_output": "text_output",
            "cum_log_probs": "cum_log_probs",
            "output_log_probs": "output_log_probs",
            "context_logits": "context_logits",
            "generation_logits": "generation_logits"
        }
        tensors = self.create_triton_tensors(response, name_map)
        return pb_utils.InferenceResponse(output_tensors=tensors)

    def convert_triton_request(self, triton_request) -> Request:
        request = Request()
        for triton_name in self.input_names:
            tensor = pb_utils.get_input_tensor_by_name(triton_request,
                                                       triton_name)
            target_name = triton_name
            if tensor is None:
                continue
            if not hasattr(request, target_name):
                raise AttributeError(
                    f"Request has no attribute '{target_name}'")
            setattr(request, target_name, tensor.as_numpy())
        return request

    def convert_triton_response(self,
                                triton_response,
                                response_factory: Callable,
                                name_map=None):
        response = response_factory()
        for tensor in triton_response.output_tensors():
            if tensor is None:
                continue
            triton_name = tensor.name()
            value = tensor.as_numpy()
            target_name = triton_name
            if name_map and triton_name in name_map:
                target_name = name_map[triton_name]
            if name_map and triton_name not in name_map:
                continue
            if target_name is None:
                # explicitly ignore this triton input
                continue
            if not hasattr(response, target_name):
                raise AttributeError(
                    f"response object has no attribute '{target_name}'")
            setattr(response, target_name, value)
        return response

    def __undo_reshape(self, x, name):
        if name in self.__undo_reshape_whitelist and len(x.shape) == 1:
            # handle reshapes
            return np.expand_dims(x, 0)
        else:
            return x

    def create_triton_tensors(self, obj, name_map: dict):
        tensors = []
        for name, triton_name in name_map.items():
            if triton_name is None:
                continue
            value = getattr(obj, name)
            if value is None:
                continue
            t = pb_utils.Tensor(triton_name, self.__undo_reshape(value, name))
            tensors.append(t)
        return tensors

    @override
    def preprocess(self, request: Request) -> PreprocResponse:
        input_tensors = self._get_preproc_tensors(request)
        triton_req = pb_utils.InferenceRequest(
            model_name=self.preproc_model_name,
            inputs=input_tensors,
            requested_output_names=self._preproc_outputs)
        triton_output = self._exec_triton_request_single(triton_req)
        return self._get_preproc_response(triton_output)

    def _get_preproc_tensors(self, request: Request):
        name_map = {
            "text_input": "QUERY",
            "decoder_text_input": "DECODER_QUERY",
            "max_tokens": "REQUEST_OUTPUT_LEN",
            "bad_words": "BAD_WORDS_DICT",
            "stop_words": "STOP_WORDS_DICT",
            "embedding_bias_words": "EMBEDDING_BIAS_WORDS",
            "embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS",
            "pad_id": "PAD_ID",
            "end_id": "END_ID",
        }
        return self.create_triton_tensors(request, name_map)

    def _get_preproc_response(self, triton_output):
        name_map = {
            "INPUT_ID": "input_ids",
            "DECODER_INPUT_ID": "decoder_input_ids",
            "REQUEST_INPUT_LEN": "input_lengths",
            "REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths",
            "BAD_WORDS_IDS": "bad_words_list",
            "STOP_WORDS_IDS": "stop_words_list",
            "EMBEDDING_BIAS": "embedding_bias",
            "OUT_PAD_ID": "pad_id",
            "OUT_END_ID": "end_id",
        }
        return self.convert_triton_response(triton_output, PreprocResponse,
                                            name_map)

    @override
    def _draft_generate_non_streaming(
            self, preproc: PreprocResponse, request: Request,
            num_draft_tokens: int) -> GenerationResponse:
        input_tensors = self._get_llm_tensors(preproc, request,
                                              num_draft_tokens, None, True)
        triton_req = pb_utils.InferenceRequest(
            model_name=self.draft_llm_model_name,
            inputs=input_tensors,
            requested_output_names=self._llm_outputs)
        triton_response = self._exec_triton_request_single(triton_req)
        llm_response = self._get_llm_response(triton_response)
        return llm_response

    @override
    def _generate(
        self,
        preproc: PreprocResponse,
        request: Request,
        draft_request: Optional[DraftRequest] = None
    ) -> Generator[GenerationResponse, None, None]:
        input_tensors = self._get_llm_tensors(preproc, request, None,
                                              draft_request)
        triton_req = pb_utils.InferenceRequest(
            model_name=self.llm_model_name,
            inputs=input_tensors,
            requested_output_names=self._llm_outputs)
        for r in self._exec_triton_request(triton_req):
            yield self._get_llm_response(r)

    @override
    def _generate_non_streaming(
        self,
        preproc: PreprocResponse,
        request: Request,
        draft_request: Optional[DraftRequest] = None
    ) -> GenerationResponse:
        input_tensors = self._get_llm_tensors(preproc, request, None,
                                              draft_request)
        triton_req = pb_utils.InferenceRequest(
            model_name=self.llm_model_name,
            inputs=input_tensors,
            requested_output_names=self._llm_outputs)
        r = self._exec_triton_request_single(triton_req)
        return self._get_llm_response(r)

    def _get_llm_tensors(self,
                         preproc: PreprocResponse,
                         request: Request,
                         num_output_tokens: Optional[int] = None,
                         draft_request: Optional[DraftRequest] = None,
                         is_draft_model_request: bool = False):
        tensors = []
        tensors.extend(self._get_tensors_from_preproc(preproc))
        tensors.extend(
            self._get_llm_tensors_from_request(request, num_output_tokens,
                                               draft_request,
                                               is_draft_model_request))
        return tensors

    def _get_tensors_from_preproc(self, preproc: PreprocResponse):
        name_map = {
            "input_ids": "input_ids",
            "decoder_input_ids": "decoder_input_ids",
            "input_lengths": "input_lengths",
            "bad_words_list": "bad_words_list",
            "stop_words_list": "stop_words_list",
            "embedding_bias": "embedding_bias",
            "pad_id": "pad_id",
            "end_id": "end_id",
        }
        return self.create_triton_tensors(preproc, name_map)

    def _get_llm_tensors_from_request(
            self,
            request: Request,
            num_output_tokens: Optional[int] = None,
            draft_request: Optional[DraftRequest] = None,
            is_draft_model_request: bool = False):
        name_map: Dict[str, Optional[str]] = {
            "beam_width": "beam_width",
            "top_k": "runtime_top_k",
            "top_p": "runtime_top_p",
            "length_penalty": "len_penalty",
            "repetition_penalty": "repetition_penalty",
            "min_length": "min_length",
            "presence_penalty": "presence_penalty",
            "frequency_penalty": "frequency_penalty",
            "random_seed": "random_seed",
            "return_log_probs": "return_log_probs",
            "stream": "streaming",
            "prompt_embedding_table": "prompt_embedding_table",
            "prompt_vocab_size": "prompt_vocab_size",
        }
        tensors = self.create_triton_tensors(request, name_map)

        out_len = request.max_tokens[0][0] if request.max_tokens else None
        if num_output_tokens is not None:
            out_len = num_output_tokens
        elif draft_request:
            if draft_request.draft_input_ids is not None:
                out_len = len(draft_request.draft_input_ids[0]) + 1
            else:
                out_len = 1

        if out_len is None:
            raise Exception("Could not determine request_output_len")
        else:
            tensors.append(
                pb_utils.Tensor("request_output_len",
                                np.array([[out_len]], dtype=np.int32)))

        if draft_request:
            if draft_request.draft_input_ids is not None:
                tensors.append(
                    pb_utils.Tensor("draft_input_ids",
                                    draft_request.draft_input_ids))
            if draft_request.draft_logits is not None and request.use_draft_logits is not None and request.use_draft_logits[
                    0]:
                tensors.append(
                    pb_utils.Tensor("draft_logits",
                                    draft_request.draft_logits))

        return_context_logits = False
        return_generation_logits = False
        if draft_request is None:
            if is_draft_model_request:
                return_generation_logits = request.use_draft_logits[
                    0] if request.use_draft_logits is not None else False
            else:
                return_context_logits = request.return_context_logits[
                    0] if request.return_context_logits is not None else False
                return_generation_logits = request.return_generation_logits[
                    0] if request.return_generation_logits is not None else False

        tensors.append(
            pb_utils.Tensor("return_context_logits",
                            np.array([[return_context_logits]])))
        tensors.append(
            pb_utils.Tensor("return_generation_logits",
                            np.array([[return_generation_logits]])))
        return tensors

    def _get_llm_response(self, triton_output):
        name_map = {
            "output_ids": "output_ids",
            "sequence_length": "sequence_length",
            "cum_log_probs": "cum_log_probs",
            "output_log_probs": "output_log_probs",
            "context_logits": "context_logits",
            "generation_logits": "generation_logits",
        }
        return self.convert_triton_response(triton_output, GenerationResponse,
                                            name_map)

    def _postprocess(self, tokens: np.ndarray,
                     sequence_lengths: Optional[np.ndarray],
                     gen_response: GenerationResponse) -> Response:
        input_tensors = self._get_postproc_tensors(tokens, sequence_lengths,
                                                   gen_response)
        triton_req = pb_utils.InferenceRequest(
            model_name=self.postproc_model_name,
            inputs=input_tensors,
            requested_output_names=self._postproc_outputs)
        r = self._exec_triton_request_single(triton_req)
        response = self._get_response(r, gen_response)
        return response

    def _get_postproc_tensors(self, tokens: np.ndarray,
                              sequence_lengths: Optional[np.ndarray],
                              gen_response: GenerationResponse):
        tensors = [
            pb_utils.Tensor("TOKENS_BATCH", tokens),
            pb_utils.Tensor(
                "SEQUENCE_LENGTH", sequence_lengths
                if sequence_lengths else gen_response.sequence_length)
        ]
        return tensors

    def _get_response(self, triton_output, gen_res: GenerationResponse):
        tensors = triton_output.output_tensors()
        t_map = {}
        for named_t in tensors:
            name = named_t.name()
            t = named_t.as_numpy()
            t_map[name] = t
        response = Response(text_output=t_map["OUTPUT"],
                            cum_log_probs=gen_res.cum_log_probs,
                            output_log_probs=gen_res.output_log_probs,
                            context_logits=gen_res.context_logits,
                            generation_logits=gen_res.generation_logits)
        return response
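The conversion helpers above are all driven by field-to-tensor name maps: unset (None) fields are simply skipped, so optional Triton inputs are omitted rather than sent empty. A small standalone sketch of that pattern, using a plain dict in place of pb_utils.Tensor (the field names below reuse decode.py's Request; everything else is illustrative):

from dataclasses import dataclass
from typing import Optional
import numpy as np

@dataclass
class Request:
    text_input: Optional[np.ndarray] = None
    max_tokens: Optional[np.ndarray] = None
    top_k: Optional[np.ndarray] = None

# Field -> Triton tensor name; a None target means "explicitly ignore".
name_map = {"text_input": "QUERY", "max_tokens": "REQUEST_OUTPUT_LEN", "top_k": None}

req = Request(text_input=np.array([["hello"]], dtype=object),
              max_tokens=np.array([[64]], dtype=np.int32))

tensors = {}
for field, triton_name in name_map.items():
    if triton_name is None:
        continue                      # ignored field
    value = getattr(req, field)
    if value is None:
        continue                      # optional input not supplied
    tensors[triton_name] = value

print(sorted(tensors))                # ['QUERY', 'REQUEST_OUTPUT_LEN']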
tensorrt_llm_bls/1/model.py
ADDED
@@ -0,0 +1,131 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json
import traceback

import triton_python_backend_utils as pb_utils
from lib.triton_decoder import TritonDecoder


class TritonPythonModel:

    def initialize(self, args):

        # Parse model configs
        model_config = json.loads(args['model_config'])

        params = model_config['parameters']

        accumulate_tokens_str = ''
        if 'accumulate_tokens' in params:
            accumulate_tokens_str = params['accumulate_tokens']['string_value']

        self.accumulate_tokens = accumulate_tokens_str.lower() in [
            'true', 'yes', '1', 't'
        ]

        self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
            model_config)

        self.logger = pb_utils.Logger

        self.llm_model_name = "tensorrt_llm"
        if "tensorrt_llm_model_name" in params:
            self.llm_model_name = params["tensorrt_llm_model_name"][
                "string_value"]
        self.draft_llm_model_name = None
        if "tensorrt_llm_draft_model_name" in params:
            self.draft_llm_model_name = params[
                "tensorrt_llm_draft_model_name"]["string_value"]

        self.decoder = TritonDecoder(
            streaming=self.decoupled,
            accumulate=self.accumulate_tokens,
            preproc_model_name="preprocessing",
            postproc_model_name="postprocessing",
            llm_model_name=self.llm_model_name,
            draft_llm_model_name=self.draft_llm_model_name)

    def execute(self, requests):

        responses = []

        for request in requests:
            if self.decoupled:
                response_sender = request.get_response_sender()
            try:

                req = self.decoder.convert_triton_request(request)
                req.validate()
                speculative_decode = (req.num_draft_tokens is not None
                                      and req.num_draft_tokens[0][0] > 0)
                if speculative_decode and (self.draft_llm_model_name is None
                                           or self.draft_llm_model_name == ""):
                    raise Exception(
                        "cannot perform speculative decoding without draft model"
                    )
                res_gen = self.decoder.decode(
                    req, speculative_decoding=speculative_decode)

                for res in res_gen:
                    triton_response = self.decoder.create_triton_response(res)
                    if self.decoupled:
                        response_sender.send(triton_response)
                    else:
                        responses.append(triton_response)

                if self.decoupled:
                    response_sender.send(
                        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

            except Exception:
                self.logger.log_error(traceback.format_exc())
                # If encountering an error, send a response with err msg
                error_response = pb_utils.InferenceResponse(
                    output_tensors=[],
                    error=pb_utils.TritonError(traceback.format_exc()))

                if self.decoupled:
                    response_sender.send(error_response)
                    response_sender.send(
                        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
                else:
                    responses.append(error_response)

            self.decoder.reset_decoder()
        if self.decoupled:
            return None
        else:
            assert len(responses) == len(requests)
            return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')
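Because the BLS model below declares a decoupled transaction policy, a streaming gRPC client is the natural way to call it. The following is a rough sketch only, assuming the tritonclient[grpc] package, a Triton server on localhost:8001, and default template values; it is not part of this repository:

from functools import partial
import queue

import numpy as np
import tritonclient.grpc as grpcclient

results = queue.Queue()

def callback(result_queue, result, error):
    # Each streamed response (or error) lands in the queue.
    result_queue.put(error if error is not None else result)

client = grpcclient.InferenceServerClient(url="localhost:8001")

text = np.array([["What is machine learning?"]], dtype=object)
max_tokens = np.array([[64]], dtype=np.int32)

inputs = [
    grpcclient.InferInput("text_input", text.shape, "BYTES"),
    grpcclient.InferInput("max_tokens", max_tokens.shape, "INT32"),
]
inputs[0].set_data_from_numpy(text)
inputs[1].set_data_from_numpy(max_tokens)

client.start_stream(callback=partial(callback, results))
client.async_stream_infer(
    "tensorrt_llm_bls", inputs,
    outputs=[grpcclient.InferRequestedOutput("text_output")])
client.stop_stream()

while not results.empty():
    item = results.get()
    if isinstance(item, Exception):
        raise item
    print(item.as_numpy("text_output"))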
tensorrt_llm_bls/config.pbtxt
ADDED
@@ -0,0 +1,253 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

name: "tensorrt_llm_bls"
backend: "python"
max_batch_size: 16

model_transaction_policy {
  decoupled: true
}

input [
  {
    name: "text_input"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "decoder_text_input"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "max_tokens"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "bad_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "stop_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "end_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "pad_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_k"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "length_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "min_length"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "presence_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "frequency_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    optional: true
  },
  {
    name: "return_log_probs"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "return_context_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "return_generation_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "stream"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "prompt_embedding_table"
    data_type: TYPE_FP16
    dims: [ -1, -1 ]
    optional: true
  },
  {
    name: "prompt_vocab_size"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "embedding_bias_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "embedding_bias_weights"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "num_draft_tokens",
    data_type: TYPE_INT32,
    dims: [ 1 ]
    optional: true
  },
  {
    name: "use_draft_logits",
    data_type: TYPE_BOOL,
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  }
]
output [
  {
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "cum_log_probs"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "output_log_probs"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "context_logits"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "generation_logits"
    data_type: TYPE_FP32
    dims: [ -1, -1, -1 ]
  }
]

parameters: {
  key: "accumulate_tokens"
  value: {
    string_value: "${accumulate_tokens}"
  }
}
parameters: {
  key: "tensorrt_llm_model_name"
  value: {
    string_value: "${tensorrt_llm_model_name}"
  }
}
parameters: {
  key: "tensorrt_llm_draft_model_name"
  value: {
    string_value: "${tensorrt_llm_draft_model_name}"
  }
}

instance_group [
  {
    count: 1
    kind : KIND_CPU
  }
]
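The ${...} placeholders in this config (and in the other config.pbtxt files above) must be replaced with concrete values before Triton can load the repository; the upstream tensorrtllm_backend project ships a fill_template.py helper for this, and a plain stdlib substitution works as well. A minimal sketch, with illustrative values only:

from pathlib import Path
from string import Template

# Illustrative parameter values; adjust to your deployment.
values = {
    "accumulate_tokens": "true",
    "tensorrt_llm_model_name": "tensorrt_llm",
    "tensorrt_llm_draft_model_name": "",
}

path = Path("tensorrt_llm_bls/config.pbtxt")
# safe_substitute leaves any placeholder it does not know about untouched.
path.write_text(Template(path.read_text()).safe_substitute(values))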