LHC88 commited on
Commit
3f2dddb
·
1 Parent(s): 5b38d1b

tokenization_function

Browse files
Files changed (1) hide show
  1. tokenization_functionary.py +537 -0
tokenization_functionary.py ADDED
@@ -0,0 +1,537 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024, MeetKai Inc. All rights reserved.
2
+
3
+ from copy import deepcopy
4
+ import json
5
+ from typing import Any, Dict, List, Literal, Optional, Union
6
+
7
+ import jsonref
8
+ from pydantic import BaseModel, Field, model_validator
9
+ from typing_extensions import Self
10
+
11
+ from transformers.tokenization_utils_base import BatchEncoding
12
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
13
+ from transformers.utils import TensorType, logging
14
+
15
+
16
# Module-level logger, following the transformers convention.
logger = logging.get_logger(__name__)

# Default system prompts for the Functionary chat format.
# NOTE(review): `apply_chat_template` below builds its own local system
# strings, so these module constants appear unused within this file —
# presumably consumed by code outside this view; confirm.
SYSTEM_PROMPT = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
CODE_INTERPRETER_SYSTEM_PROMPT = """When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 60.0 seconds. The drive at '/mnt/data' can be used to save and persist user files."""
19
+
20
+
21
class Function(BaseModel):
    """Schema of a single callable function exposed to the model."""

    # Function identifier used as the TypeScript type name in the schema.
    name: str
    # Human-readable summary; plain default is equivalent to Field(default="").
    description: Optional[str] = ""
    # JSON-schema dict describing the arguments, if any.
    parameters: Optional[dict] = None
25
+
26
+
27
class Tool(BaseModel):
    """A tool entry: either a function definition or the code interpreter."""

    type: Literal["function", "code_interpreter"]
    function: Optional[Function] = None

    @model_validator(mode="after")
    def check_type_function_matches(self) -> Self:
        """Ensure `function` is present iff `type == "function"`.

        Asserts (not raises) so pydantic wraps failures the same way as the
        original implementation.
        """
        if self.type == "function":
            assert self.function is not None, (
                '"function" must contain function description when `"type": "function"`'
            )
        else:
            assert self.function is None, (
                '"function" must not be provided when `"type": "code_interpreter"`'
            )
        return self
42
+
43
+
44
def convert_data_type(param_type: str) -> str:
    """Map a JSON-schema primitive type name to its TypeScript equivalent.

    Args:
        param_type (str): JSON-schema type name.

    Returns:
        str: the TypeScript type name ("number" for numeric types,
        otherwise unchanged).
    """
    return "number" if param_type in ("integer", "float") else param_type
54
+
55
+
56
def get_param_type(param: Dict) -> str:
    """Resolve the TypeScript type string for a parameter schema.

    Args:
        param (Dict): parameter dict from a JSON-schema "properties" map.

    Returns:
        str: TypeScript type, e.g. "number", "string | number", or "any"
        when no type information is present.
    """
    param_type = "any"
    if "type" in param:
        raw_param_type = param["type"]
        if isinstance(raw_param_type, list):
            # Bug fix: convert each member individually (e.g. "integer" ->
            # "number"), matching the oneOf branch below. Previously the raw
            # names were joined first, and the final convert_data_type call
            # cannot translate a combined string like "integer | string".
            param_type = " | ".join(convert_data_type(t) for t in raw_param_type)
        else:
            param_type = raw_param_type
    elif "oneOf" in param:
        # In many cases the schema contains "oneOf" instead of "type".
        one_of_types = []
        for item in param["oneOf"]:
            if "type" in item:
                one_of_types.append(convert_data_type(item["type"]))
        # De-duplicate; NOTE: set order is unspecified, preserved from original.
        one_of_types = list(set(one_of_types))
        param_type = " | ".join(one_of_types)
    return convert_data_type(param_type)
80
+
81
+
82
def get_format_param(param: Dict) -> Optional[str]:
    """Extract the "format" hint from a parameter schema.

    Checks the top level first; if absent, gathers formats from the
    "oneOf" alternatives and joins them with " or ".

    Args:
        param (Dict): parameter schema.

    Returns:
        Optional[str]: the format string, or None if none is declared.
    """
    if "format" in param:
        return param["format"]
    if "oneOf" in param:
        found = [item["format"] for item in param["oneOf"] if "format" in item]
        if found:
            return " or ".join(found)
    return None
99
+
100
+
101
def get_param_info(param: Dict) -> Optional[str]:
    """Build a single "// ..." comment describing a parameter.

    Collects, in order: description, default value, format, and numeric /
    length bounds.

    Args:
        param (Dict): parameter schema.

    Returns:
        Optional[str]: one comment line, or None when there is nothing to say.
    """
    param_type = param.get("type", "any")
    pieces = []

    if "description" in param:
        desc = param["description"]
        if not desc.endswith("."):
            desc += "."
        pieces.append(desc)

    if "default" in param:
        default_value = param["default"]
        if param_type == "string":
            # Quote string defaults so they read as literals.
            default_value = f'"{default_value}"'
        pieces.append(f"Default={default_value}.")

    format_param = get_format_param(param)
    if format_param is not None:
        pieces.append("Format=" + format_param)

    for key, label in (
        ("maximum", "Maximum"),
        ("minimum", "Minimum"),
        ("maxLength", "Maximum length"),
        ("minLength", "Minimum length"),
    ):
        if key in param:
            pieces.append(f"{label}=" + str(param[key]))

    if not pieces:
        return None
    # Flatten multi-line descriptions into a single comment line.
    return ("// " + " ".join(pieces)).replace("\n", " ")
140
+
141
+
142
def append_new_param_info(
    info_list: List[str],
    param_declaration: str,
    comment_info: Optional[str],
    examples_info: List,
    depth: int,
):
    """Append one parameter declaration to `info_list`, indented by depth.

    When a comment is present it is emitted first, followed by any example
    lines, then the declaration itself; without a comment only the
    declaration is appended (examples are skipped, matching the original).

    Args:
        info_list (List[str]): accumulator of schema lines (mutated in place).
        param_declaration (str): "name: type" declaration text.
        comment_info (Optional[str]): "// ..." comment, or None.
        examples_info (List): "// ..." example lines.
        depth (int): nesting level; one space of indent per level.
    """
    indent = " " * depth  # empty string at depth 0
    if comment_info is not None:
        info_list.append(f"{indent}{comment_info}")
        for example_line in examples_info:
            info_list.append(f"{indent}{example_line}")
    info_list.append(f"{indent}{param_declaration}")
171
+
172
+
173
def get_examples_info(param_name: str, examples: List) -> List:
    """Render the provided examples as "//"-prefixed comment lines.

    Args:
        param_name (str): parameter name used in the header line.
        examples (List): example values (dicts/lists are JSON-encoded).

    Returns:
        List: header line followed by one comment line per example.
    """
    lines = [f"// Example {param_name}:"]
    for example in examples:
        if isinstance(example, (dict, list)):
            rendered = json.dumps(example, ensure_ascii=False)
        else:
            rendered = str(example)
        # Keep each example on a single comment line.
        lines.append("// " + rendered.replace("\n", "\\n"))
    return lines
190
+
191
+
192
def get_enum_option_str(enum_options: List) -> str:
    """Join enum options with " | ", double-quoting string options.

    Args:
        enum_options (List): enum values from the schema.

    Returns:
        str: e.g. '"a" | "b" | 3'.
    """
    rendered = []
    for option in enum_options:
        # `type(...) is str` (not isinstance) preserved from the original.
        rendered.append(f'"{option}"' if type(option) is str else str(option))
    return " | ".join(rendered)
201
+
202
+
203
def get_array_typescript(
    param_name: Optional[str], param_dic: dict, depth: int = 0
) -> str:
    """Recursively render the TypeScript type of an array schema.

    Args:
        param_name (Optional[str]): declaration name, or None for an
            anonymous (nested) item type.
        param_dic (dict): array schema (uses its "items" entry).
        depth (int, optional): nesting level for indentation. Defaults to 0.

    Returns:
        str: TypeScript declaration text for the array.
    """
    indent = " " * depth  # empty at depth 0
    items_info = param_dic.get("items", {})

    # No item schema -> untyped "[]".
    if len(items_info) == 0:
        return "[]" if param_name is None else f"{indent}{param_name}: []"

    array_type = get_param_type(items_info)

    if array_type == "object":
        # Object items: render the nested properties as a "{ ... }[]" block.
        body_lines = get_parameter_typescript(
            items_info.get("properties", {}),
            items_info.get("required", []),
            depth + 1,
        )
        out = []
        if param_name is not None:
            out.append(f"{indent}{param_name}" + ": {")
        else:
            out.append(f"{indent}" + "{")
        out.extend(body_lines)
        out.append(f"{indent}" + "}[]")
        return "\n".join(out)

    if array_type == "array":
        # Array of arrays: recurse anonymously and append "[]".
        inner = get_array_typescript(None, items_info, depth + 1)
        if param_name is None:
            return f"{inner}[]"
        return f"{indent}{param_name}: {inner.strip()}[]"

    if "enum" in items_info:
        enum_type = get_enum_option_str(items_info["enum"])
        if param_name is None:
            return f"({enum_type})[]"
        return f"{indent}{param_name}: ({enum_type})[]"

    if param_name is None:
        return f"{array_type}[]"
    # NOTE(review): trailing comma here differs from the enum branch above;
    # preserved as in the original.
    return f"{indent}{param_name}: {array_type}[],"
258
+
259
+
260
def get_parameter_typescript(properties, required_params, depth=0) -> List[str]:
    """Recursively render parameter schemas as TypeScript declaration lines.

    Each parameter contributes its comment (description/default/format),
    example lines, and a "name[?]: type," declaration; objects and arrays
    recurse into their children.

    Args:
        properties: "properties" map of a JSON schema.
        required_params: list of required parameter names (optional params
            get a "?" suffix).
        depth (int, optional): nesting level for indentation. Defaults to 0.

    Returns:
        List[str]: schema lines for all parameters.
    """
    tp_lines: List[str] = []
    indent = " " * depth  # constant per call; hoisted out of the loop

    for param_name, param in properties.items():
        # Some schemas misplace a "required" list inside properties;
        # skip anything that is not a dict.
        if not isinstance(param, dict):
            continue

        comment_info = get_param_info(param)
        examples_info = (
            get_examples_info(param_name, param["examples"])
            if "examples" in param
            else []
        )

        declaration = f"{param_name}"
        if isinstance(required_params, list) and param_name not in required_params:
            declaration += "?"  # optional parameter

        param_type = get_param_type(param)

        if param_type == "object":
            # Nested object: recurse into its properties.
            child_lines = get_parameter_typescript(
                param.get("properties", {}), param.get("required", []), depth + 1
            )
            if comment_info is not None:
                tp_lines.append(f"{indent}{comment_info}")
            for example_line in examples_info:
                tp_lines.append(f"{indent}{example_line}")
            tp_lines.append(f"{indent}{declaration}" + ": {")
            tp_lines.extend(child_lines)
            tp_lines.append(f"{indent}" + "},")
        elif param_type == "array":
            item_info = param.get("items", {})
            if "type" not in item_info:
                # Unknown item type -> untyped "[]".
                append_new_param_info(
                    tp_lines, declaration + ": [],", comment_info, examples_info, depth
                )
            else:
                array_declaration = get_array_typescript(declaration, param, depth)
                if not array_declaration.endswith(","):
                    array_declaration += ","
                if comment_info is not None:
                    tp_lines.append(f"{indent}{comment_info}")
                for example_line in examples_info:
                    tp_lines.append(f"{indent}{example_line}")
                tp_lines.append(array_declaration)
        else:
            if "enum" in param:
                param_type = get_enum_option_str(param["enum"])
            if param.get("nullable") is True:
                param_type += " | null"
            append_new_param_info(
                tp_lines,
                declaration + f": {param_type},",
                comment_info,
                examples_info,
                depth,
            )

    return tp_lines
339
+
340
+
341
def generate_schema_from_functions(
    functions: List[Function], namespace="functions"
) -> str:
    """Render the function list as a TypeScript-style namespace declaration.

    This is the textual tool schema the language model reads in its system
    prompt.

    Args:
        functions (List[Function]): Function models or plain function dicts.
        namespace: namespace name to wrap the declarations in.

    Returns:
        str: the full "namespace { ... }" schema text.
    """
    parts = [
        "// Supported function definitions that should be called when necessary.\n",
        f"namespace {namespace} {{\n\n",
    ]

    for function in functions:
        # Accept both Function models and plain dicts.
        if not isinstance(function, dict):
            function = function.model_dump()

        function_name = function.get("name", None)
        if function_name is None:
            # Nameless entries cannot be declared; skip them.
            continue

        description = function.get("description", "")
        parts.append(f"// {description}\n")
        parts.append(f"type {function_name}")

        parameters = function.get("parameters", None)
        if parameters is not None and parameters.get("properties") is not None:
            # Inline any JSON-schema $ref entries before rendering.
            parameters = deepcopy(jsonref.JsonRef.replace_refs(parameters))
            tp_lines = get_parameter_typescript(
                parameters.get("properties"),
                parameters.get("required", []),
                0,
            )
            parts.append(" = (_: {\n" + "\n".join(tp_lines) + "\n}) => any;\n\n")
        else:
            # No parameters at all.
            parts.append(" = () => any;\n\n")

    parts.append(f"}} // namespace {namespace}")
    return "".join(parts)
382
+
383
+
384
class FunctionaryTokenizer(PreTrainedTokenizerFast):
    """Tokenizer that injects Functionary's function-calling system prompt
    (a TypeScript tool schema) before rendering the chat template."""

    def apply_chat_template(
        self,
        conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]], str],
        tools: Optional[List[Dict[str, Any]]],
        chat_template: Optional[str] = None,
        add_generation_prompt: bool = False,
        tokenize: bool = True,
        padding: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_dict: bool = False,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> Union[str, List[int], List[str], List[List[int]], BatchEncoding]:
        """Render (and optionally tokenize) a conversation with tool schema.

        NOTE(review): unlike the base class, `tools` is a required positional
        argument here (no default) — confirm all callers pass it.
        """
        if return_dict and not tokenize:
            raise ValueError(
                "`return_dict=True` is incompatible with `tokenize=False`, because there is no dict "
                "of tokenizer outputs to return."
            )

        if tokenizer_kwargs is None:
            tokenizer_kwargs = {}

        using_default_template = False

        # First, handle the cases when the model has a dict of multiple templates
        if isinstance(self.chat_template, dict) or (
            self.chat_template is None and isinstance(self.default_chat_template, dict)
        ):
            if self.chat_template is not None:
                template_dict = self.chat_template
                using_default_dict = False
            else:
                template_dict = self.default_chat_template
                using_default_dict = True
            if chat_template is not None and chat_template in template_dict:
                # The user can pass the name of a template to the chat template argument instead of an entire template
                chat_template = template_dict[chat_template]
                if using_default_dict:
                    using_default_template = True
            elif chat_template is None and "default" in template_dict:
                chat_template = template_dict["default"]
                if using_default_dict:
                    using_default_template = True
            elif chat_template is None:
                raise ValueError(
                    "This model has multiple chat templates with no default specified! Please either pass a chat "
                    "template or the name of the template you wish to use to the `chat_template` argument. Available "
                    f"template names are {sorted(template_dict.keys())}."
                )
        elif chat_template is None:
            # These are the cases when the model has a single template
            # priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template
            if self.chat_template is not None:
                chat_template = self.chat_template
            else:
                chat_template = self.default_chat_template
                using_default_template = True

        if using_default_template:
            logger.warning_once(
                "No chat template is set for this tokenizer, falling back to a default class-level template. This is "
                "very error-prone, because models are often trained with templates different from the class default! "
                "Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
                "point any code depending on them will stop working. We recommend setting a valid chat template before "
                "then to ensure that this model continues working without issues."
            )

        # Local prompt strings; NOTE(review): these shadow the similar
        # module-level constants rather than reusing them — confirm intended.
        PYTHON_RUN_SYS_MSG = "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 60.0 seconds. The drive at '/mnt/data' can be used to save and persist user files."
        SYSTEM_CONTENT = """You are capable of executing available function(s) if required.
Only execute function(s) when absolutely necessary.
Ask for the required input to:recipient==all
Use JSON for function arguments.
Respond in this format:
>>>${recipient}
${content}
Available functions:
"""

        # Prepare tools/functions into schema
        functions_pydantic_to_render = []
        has_code_interpreter = False
        if tools is not None:
            for item in tools:
                if (
                    "function" in item and item["function"] is not None
                ):  # new data format: tools: [{"type": xx, "function": xxx}]
                    functions_pydantic_to_render.append(item["function"])
                elif "type" in item and item["type"] == "code_interpreter":
                    has_code_interpreter = True
                else:
                    functions_pydantic_to_render.append(item)  # old format

        # NOTE(review): this mutates the caller's `conversation` list in
        # place, and for batched input (a list of conversations) it prepends
        # the system message to the batch list rather than to each chat —
        # confirm intended.
        conversation.insert(
            0,
            {
                "role": "system",
                "content": SYSTEM_CONTENT
                + generate_schema_from_functions(functions_pydantic_to_render),
            },
        )
        if has_code_interpreter:
            conversation.insert(1, {"role": "system", "content": PYTHON_RUN_SYS_MSG})

        # Compilation function uses a cache to avoid recompiling the same template
        compiled_template = self._compile_jinja_template(chat_template)

        # Detect batched input: list of message-lists or of Conversation objects.
        if isinstance(conversation, (list, tuple)) and (
            isinstance(conversation[0], (list, tuple))
            or hasattr(conversation[0], "messages")
        ):
            conversations = conversation
            is_batched = True
        else:
            conversations = [conversation]
            is_batched = False

        rendered = []
        template_kwargs = {
            **self.special_tokens_map,
            **kwargs,
        }  # kwargs overwrite special tokens if both are present
        for chat in conversations:
            if hasattr(chat, "messages"):
                # Indicates it's a Conversation object
                chat = chat.messages
            rendered_chat = compiled_template.render(
                messages=chat,
                add_generation_prompt=add_generation_prompt,
                **template_kwargs,
            )
            rendered.append(rendered_chat)

        if not is_batched:
            rendered = rendered[0]

        if tokenize:
            out = self(
                rendered,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                add_special_tokens=False,
                return_tensors=return_tensors,
                **tokenizer_kwargs,
            )
            if return_dict:
                return out
            else:
                return out["input_ids"]
        else:
            return rendered