Commit 439dfb9 (cyqm): Update handler: delete debug info
import random
import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

class EndpointHandler:
    """
    Custom handler for `Qwen/Qwen2.5-Math-7B-Instruct`.
    """

    def __init__(self, path=""):
        """
        Initialize the model and tokenizer.

        :param path: Path (or Hub ID) of the model and tokenizer.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForCausalLM.from_pretrained(
            path, torch_dtype=torch.bfloat16, device_map="auto"
        )
    def __call__(self, data: dict):
        """
        Run the model on the input data.

        :param data: Input parameters for the model, in the following form:
            `{"inputs": "input_string", "max_new_tokens": 1024, "parameters": {"parameter_1": 0, "parameter_2": 0}}`
            where `max_new_tokens` and `parameters` are optional and `parameters`
            is forwarded to `model.generate()`.
        :return: dict with keys `answer`, `num_new_tokens`, and `speed`.
        """
        question = data.get("inputs", None)
        max_new_tokens = data.get("max_new_tokens", 1024)
        parameters = data.get("parameters", {})

        if not question:
            raise ValueError("Input prompt is missing.")
        messages = [
            {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}. "
                                          "Then, give your confidence level in percentage regarding your answer."},
            {"role": "user", "content": question}
        ]

        # Build the chat-formatted prompt and move it to the GPU.
        tokenized_prompt = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to("cuda")
        # Re-seed on every call so repeated requests with the same input can sample different outputs.
        torch.manual_seed(random.randint(0, 2 ** 32 - 1))

        time_start = time.time()
        out = self.model.generate(
            tokenized_prompt,
            max_new_tokens=max_new_tokens,
            temperature=1.0,
            do_sample=True,
            top_p=0.9,
            **parameters
        )
        time_end = time.time()

        # Keep only the newly generated tokens (everything past the prompt).
        response = self.tokenizer.decode(out[0][len(tokenized_prompt[0]):])
        num_new_tokens = len(out[0]) - len(tokenized_prompt[0])
        speed = num_new_tokens / (time_end - time_start)
        return {
            "answer": response,
            "num_new_tokens": num_new_tokens,
            "speed": speed
        }
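

# Minimal local smoke test (a sketch, not part of the endpoint contract). The prompt,
# the generation parameter, and loading the model by its Hub ID below are assumptions
# for illustration; in production the handler is instantiated by the Inference
# Endpoints runtime with the repository path.
if __name__ == "__main__":
    handler = EndpointHandler(path="Qwen/Qwen2.5-Math-7B-Instruct")
    payload = {
        "inputs": "What is 12 * 13?",
        "max_new_tokens": 256,
        "parameters": {"repetition_penalty": 1.05}
    }
    result = handler(payload)
    print(result["answer"])
    print(f"{result['num_new_tokens']} tokens at {result['speed']:.1f} tokens/s")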