import random
import time

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


class EndpointHandler:
    """
    Custom handler for `Qwen/Qwen2.5-Math-7B-Instruct`.
    """

    def __init__(self, path=""):
        """
        Initialize the model and tokenizer.

        :param path: Path to the model and tokenizer.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )

    def __call__(self, data: dict):
        """
        Run the model on the input data.

        :param data: Input parameters for the model, in the form
            `{"inputs": "input_string", "max_new_tokens": 1024,
            "parameters": {"parameter_1": 0, "parameter_2": 0}}`.
            Entries in `parameters` are forwarded to `model.generate`
            and override the sampling defaults below.

        :return: dict with keys `answer`, `num_new_tokens`, and `speed`
            (generated tokens per second).
        """
        question = data.get("inputs", None)
        max_new_tokens = data.get("max_new_tokens", 1024)
        parameters = data.get("parameters", {})

        if not question:
            raise ValueError("Input prompt is missing.")

        messages = [
            {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}. "
                                          "Then, give your confidence level in percentage regarding your answer."},
            {"role": "user", "content": question},
        ]

        # Send inputs to the model's device rather than hard-coding "cuda",
        # since `device_map="auto"` decides the placement.
        tokenized_prompt = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(self.model.device)

        # Re-seed on every call so repeated requests sample different completions.
        torch.manual_seed(random.randint(0, 2 ** 32 - 1))

        # Merge user-supplied `parameters` over the defaults instead of passing
        # both to `generate`: a duplicated keyword (e.g. `top_p` in both) would
        # otherwise raise a TypeError.
        gen_kwargs = {
            "max_new_tokens": max_new_tokens,
            "temperature": 1.0,
            "do_sample": True,
            "top_p": 0.9,
        }
        gen_kwargs.update(parameters)

        time_start = time.time()
        out = self.model.generate(tokenized_prompt, **gen_kwargs)
        time_end = time.time()

        # Decode only the newly generated tokens, skipping the echoed prompt
        # and special tokens such as the end-of-turn marker.
        response = self.tokenizer.decode(
            out[0][len(tokenized_prompt[0]):],
            skip_special_tokens=True,
        )

        num_new_tokens = len(out[0]) - len(tokenized_prompt[0])
        speed = num_new_tokens / (time_end - time_start)

        return {
            "answer": response,
            "num_new_tokens": num_new_tokens,
            "speed": speed,
        }
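

# Minimal local smoke test: a sketch of how the handler is invoked, not part
# of the endpoint contract. Assumes the Qwen/Qwen2.5-Math-7B-Instruct weights
# can be fetched from the Hub (or point MODEL_PATH at a local copy) and that
# a GPU with bfloat16 support is available.
if __name__ == "__main__":
    MODEL_PATH = "Qwen/Qwen2.5-Math-7B-Instruct"
    handler = EndpointHandler(MODEL_PATH)
    result = handler({
        "inputs": "What is 7 * 8?",
        "max_new_tokens": 256,
        "parameters": {"top_p": 0.95},  # overrides the default top_p of 0.9
    })
    print(result["answer"])
    print(f"{result['num_new_tokens']} tokens at {result['speed']:.1f} tok/s")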