File size: 2,457 Bytes
247c739 396d643 96fe271 247c739 816f303 094ab01 247c739 396d643 247c739 15cdada 396d643 15cdada 247c739 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
import time
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
class EndpointHandler:
    """
    Custom handler for `Qwen/Qwen2.5-Math-7B-Instruct`.

    Loads a causal language model together with its tokenizer and answers a
    single question per call via ``handler(data)``.
    """

    # System prompt sent with every request.
    SYSTEM_PROMPT = "Please reason step by step, and put your final answer within \\boxed{}."
    # Text appended to every user question before it is sent to the model.
    CONFIDENCE_SUFFIX = " Then, give your confidence level in percentage regarding your answer."
    # Generation budget used when the request does not specify one.
    DEFAULT_MAX_NEW_TOKENS = 1024

    def __init__(self, path: str = ""):
        """
        Initialize model and tokenizer.

        :param path: Path to model and tokenizer
        """
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForCausalLM.from_pretrained(path, torch_dtype="auto", device_map="auto")

    def __call__(self, data: dict) -> dict:
        """
        Execute model based on input data.

        :param data: Input parameters for the model.
            Should be in the following form:
            `{"inputs": "input_string", "parameters": {"parameter_1": 0, "parameter_2": 0}}`

            A top-level ``"max_new_tokens"`` entry is also honoured. Entries of
            ``"parameters"`` are forwarded to ``model.generate`` and take
            precedence over the built-in sampling defaults
            (``max_new_tokens``, ``temperature=1.0``, ``do_sample=True``,
            ``top_p=0.9``).
        :return: dict with keys ``answer`` (decoded completion),
            ``num_new_tokens`` (number of generated tokens) and ``speed``
            (generated tokens per second of wall time spent in ``generate``;
            ``0.0`` if no time elapsed).
        :raises ValueError: if ``"inputs"`` is missing or empty.
        """
        question = data.get("inputs")
        if not question:
            raise ValueError("Input prompt is missing.")

        # Build ONE keyword dict so a caller-supplied parameter overrides the
        # default instead of colliding with it as a duplicate keyword argument.
        generation_kwargs = {
            "max_new_tokens": data.get("max_new_tokens", self.DEFAULT_MAX_NEW_TOKENS),
            "temperature": 1.0,
            "do_sample": True,
            "top_p": 0.9,
        }
        # `or {}` tolerates an explicit `"parameters": null` in the request.
        generation_kwargs.update(data.get("parameters") or {})

        messages = [
            {"role": "system", "content": self.SYSTEM_PROMPT},
            {"role": "user", "content": question + self.CONFIDENCE_SUFFIX},
        ]
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        # Follow the model's own device rather than assuming "cuda"; the model
        # is loaded with device_map="auto", so its placement is not fixed here.
        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
        prompt_length = model_inputs["input_ids"].shape[1]

        # Reseed torch's global RNG from Python's `random` before sampling
        # (presumably so repeated identical requests yield different samples).
        torch.manual_seed(random.randint(0, 2 ** 32 - 1))

        # perf_counter is monotonic, so the measured interval cannot go
        # negative if the system clock is adjusted mid-generation.
        time_start = time.perf_counter()
        generated_ids = self.model.generate(**model_inputs, **generation_kwargs)
        elapsed = time.perf_counter() - time_start

        # The returned sequences start with the prompt; keep only what was
        # generated after it.
        new_token_ids = generated_ids[:, prompt_length:]
        num_new_tokens = int(new_token_ids.shape[1])
        response = self.tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]

        speed = num_new_tokens / elapsed if elapsed > 0 else 0.0
        return {
            "answer": response,
            "num_new_tokens": num_new_tokens,
            "speed": speed,
        }
|