cyqm committed on
Commit 247c739 · 1 Parent(s): f1f11bd

Upload handler

Files changed (1):
  handler.py +66 -0
handler.py ADDED
@@ -0,0 +1,66 @@
+ import time
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ class EndpointHandler:
+     """
+     Custom handler for `Qwen/Qwen2.5-Math-7B-Instruct`.
+     """
+     def __init__(self, path=""):
+         """
+         Initialize the model and tokenizer.
+         :param path: Path to the model and tokenizer
+         """
+         self.tokenizer = AutoTokenizer.from_pretrained(path)
+         self.model = AutoModelForCausalLM.from_pretrained(path, torch_dtype="auto", device_map="auto")
+
+     def __call__(self, data: dict):
+         """
+         Run the model on the input data.
+
+         :param data: Input parameters for the model.
+             Should be in the following form:
+             `{"inputs": "input_string", "parameters": {"max_new_tokens": 1024}}`
+
+         :return: dict (answer, num_new_tokens, speed)
+         """
+
+         question = data.get("inputs", None)
+         parameters = data.get("parameters", {})
+         max_new_tokens = parameters.get("max_new_tokens", data.get("max_new_tokens", 1024))
+
+         if not question:
+             raise ValueError("Input prompt is missing.")
+
+         messages = [
+             {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}. "
+                                           "Then, give your confidence level regarding your answer."},
+             {"role": "user", "content": question}
+         ]
+
+         text = self.tokenizer.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+
+         model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
+
+         time_start = time.time()
+         generated_ids = self.model.generate(**model_inputs, max_new_tokens=max_new_tokens)
+         time_end = time.time()
+
+         num_new_tokens = len(generated_ids[0]) - len(model_inputs.input_ids[0])
+
+         generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in
+                          zip(model_inputs.input_ids, generated_ids)]
+
+         response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+         speed = num_new_tokens / (time_end - time_start)
+
+         return {
+             "answer": response,
+             "num_new_tokens": num_new_tokens,
+             "speed": speed
+         }
+
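
For reference, a minimal sketch of how the handler above could be exercised locally, assuming the `Qwen/Qwen2.5-Math-7B-Instruct` weights and tokenizer have been downloaded to a local directory; the path, prompt, and printed fields below follow the payload format described in the docstring and are illustrative only:

    # Local smoke test for EndpointHandler (sketch; path and prompt are hypothetical).
    from handler import EndpointHandler

    # Assumes the model repository has already been downloaded to this directory.
    handler = EndpointHandler(path="./Qwen2.5-Math-7B-Instruct")

    payload = {
        "inputs": "Find the sum of the first 50 positive even integers.",
        "parameters": {"max_new_tokens": 512},
    }

    result = handler(payload)
    print(result["answer"])           # boxed answer plus confidence statement
    print(result["num_new_tokens"])   # number of generated tokens
    print(result["speed"])            # tokens per second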