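# handler.py: custom handler for a Hugging Face Inference Endpoints deployment.
# The service imports this file, builds `EndpointHandler(path=<repository path>)` once at
# startup, and then calls the instance with the parsed JSON request body for each request.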
import time
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

class EndpointHandler:
    """
    Custom handler for `Qwen/Qwen2.5-Math-7B-Instruct`.
    """
    def __init__(self, path=""):
        """
        Initialize model and tokenizer.
        :param path: Path to model and tokenizer
        """
        # Let transformers pick the checkpoint dtype and place the weights automatically
        # (on GPU when one is available).
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForCausalLM.from_pretrained(path, torch_dtype="auto", device_map="auto")

    def __call__(self, data: dict):
        """
        Run generation on the given input data.

        :param data: Input parameters for the model.
            Expected form:
            `{"inputs": "input_string", "max_new_tokens": 1024, "parameters": {"parameter_1": 0, "parameter_2": 0}}`
            where "max_new_tokens" and "parameters" are optional and "parameters"
            is forwarded to `model.generate`.

        :return: dict (answer, num_new_tokens, speed)
        """

        question = data.get("inputs", None)
        max_new_tokens = data.get("max_new_tokens", 1024)
        parameters = data.get("parameters", {})

        if not question:
            raise ValueError("Input prompt is missing.")

        # System prompt asks for step-by-step reasoning with a \boxed{} final answer;
        # the user turn additionally asks the model to state its confidence.
        messages = [
            {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}."},
            {"role": "user", "content": question + " Then, give your confidence level in percentage regarding your answer."}
        ]

        # Render the chat turns into a single prompt string with the assistant
        # generation prompt appended.
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Move the tokenized prompt to the model's device rather than hard-coding "cuda".
        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)

        # Re-seed the RNG so repeated calls with the same prompt can yield different samples.
        torch.manual_seed(random.randint(0, 2 ** 32 - 1))

        # Time the generation step so tokens/sec can be reported.
        time_start = time.time()
        # Sampling defaults; caller-supplied "parameters" override them, which also avoids
        # passing the same keyword argument to `generate` twice.
        gen_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": True,
            "temperature": 1.0,
            "top_p": 0.9,
        }
        gen_kwargs.update(parameters)
        generated_ids = self.model.generate(**model_inputs, **gen_kwargs)
        time_end = time.time()

        # Count only the newly generated tokens (generate echoes the prompt tokens back).
        num_new_tokens = len(generated_ids[0]) - len(model_inputs.input_ids[0])

        # Strip the prompt tokens from each sequence before decoding.
        generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in
                         zip(model_inputs.input_ids, generated_ids)]

        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # Decoding speed in new tokens per second.
        speed = num_new_tokens / (time_end - time_start)

        return {
            "answer": response,
            "num_new_tokens": num_new_tokens,
            "speed": speed
        }
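

# Minimal local smoke test, a sketch rather than part of the Inference Endpoints contract:
# it assumes the model weights fit on the local hardware and that MODEL_PATH (a hypothetical
# value; any local directory or hub repo id would do) resolves to the checkpoint. The payload
# mirrors the form documented in `__call__`.
if __name__ == "__main__":
    MODEL_PATH = "Qwen/Qwen2.5-Math-7B-Instruct"  # assumption: loading directly from the hub
    handler = EndpointHandler(path=MODEL_PATH)
    result = handler({
        "inputs": "What is the sum of the first 10 positive integers?",
        "max_new_tokens": 256,
        "parameters": {"top_p": 0.95},
    })
    print(result["answer"])
    print(f"{result['num_new_tokens']} new tokens at {result['speed']:.1f} tokens/s")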