from typing import Dict, List, Any

import torch
from unsloth import FastLanguageModel


class EndpointHandler():
    def __init__(self, path=""):
        # Preload the model and tokenizer once at startup so every request
        # reuses the same weights.
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = "aidando73/llama-3.3-70b-instruct-code-agent-fine-tune-v1",
            max_seq_length = 2048,
            dtype = torch.float16,  # fp16 compute dtype for the 4-bit quantized weights
            load_in_4bit = True,
        )
        # Switch Unsloth into its optimized inference mode.
        FastLanguageModel.for_inference(model)
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        data args:
            inputs (:obj:`str`): the prompt to generate a completion for
        Return:
            A :obj:`list` of :obj:`dict` that will be serialized and returned
        """
        input_ids = self.tokenizer.encode(data["inputs"], return_tensors = "pt").to("cuda")
        output = self.model.generate(
            input_ids,
            max_new_tokens = 128,
            pad_token_id = self.tokenizer.eos_token_id,
        )
        return [{"output": self.tokenizer.decode(output[0], skip_special_tokens = True)}]
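

# Illustrative local smoke test, not part of the Inference Endpoints contract:
# a minimal sketch assuming this file is run directly on a CUDA-capable GPU
# with enough memory for the 4-bit 70B weights. The prompt below is a
# hypothetical example input.
if __name__ == "__main__":
    handler = EndpointHandler()
    result = handler({"inputs": "Write a Python function that reverses a string."})
    print(result[0]["output"])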