MrD05
/

pyg6b

Text Generation

text generation

Inference Endpoints

Model card Files Files and versions Community

pyg6b / handler.py

MrD05's picture

Update handler.py

0d4fb66 almost 2 years ago

3.16 kB

	from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline,StoppingCriteria
	from accelerate import init_empty_weights
	from transformers_stream_generator import init_stream_support
	# from langchain.llms import HuggingFacePipeline
	# from langchain import PromptTemplate, LLMChain
	import torch
	import time
	# Runs once at import time, before any handler is built.
	# NOTE(review): presumably this is what lets model.generate() accept
	# do_stream=True in EndpointHandler.__call__ — confirm against the
	# transformers_stream_generator documentation.
	init_stream_support()

	# Prompt skeleton sent to the model on every request. It contains, in order:
	# the persona description, a sample dialogue enclosed by the literal
	# <START>/<END> markers, Alice's opening message, and finally the user's turn.
	# {user_name} and {user_input} are filled in with str.format() inside
	# EndpointHandler.__call__; the text is runtime data, so edit it with care.
	template = """Alice Gate's Persona: Alice Gate is a young, computer engineer-nerd with a knack for problem solving and a passion for technology.
	<START>
	{user_name}: So how did you get into computer engineering?
	Alice Gate: I've always loved tinkering with technology since I was a kid.
	{user_name}: That's really impressive!
	Alice Gate: She chuckles bashfully Thanks!
	{user_name}: So what do you do when you're not working on computers?
	Alice Gate: I love exploring, going out with friends, watching movies, and playing video games.
	{user_name}: What's your favorite type of computer hardware to work with?
	Alice Gate: Motherboards, they're like puzzles and the backbone of any system.
	{user_name}: That sounds great!
	Alice Gate: Yeah, it's really fun. I'm lucky to be able to do this as a job.
	<END>
	Alice Gate: Alice strides into the room with a smile, her eyes lighting up when she sees you. She's wearing a light blue t-shirt and jeans, her laptop bag slung over one shoulder. She takes a seat next to you, her enthusiasm palpable in the air Hey! I'm so excited to finally meet you. I've heard so many great things about you and I'm eager to pick your brain about computers. I'm sure you have a wealth of knowledge that I can learn from. She grins, eyes twinkling with excitement Let's get started!
	{user_name}: {user_input}
	"""

	class EndpointHandler():
	    """Request handler that replies to a user message as the "Alice Gate" persona.

	    The tokenizer and model are loaded once in ``__init__``; each call formats
	    the module-level ``template`` with the caller's name and message, streams
	    a sampled continuation from the model and returns one cleaned-up reply.
	    """

	    def __init__(self, path=""):
	        """Load the tokenizer and the causal language model from ``path``.

	        Args:
	            path: Location handed unchanged to both ``from_pretrained`` calls.
	        """
	        # NOTE(review): torch_dtype is normally a model-loading option; confirm
	        # it is really meant to be passed to the tokenizer here.
	        self.tokenizer = AutoTokenizer.from_pretrained(path,torch_dtype=torch.float16)
	        # The model is loaded with 8-bit weights and automatic device placement.
	        self.model = AutoModelForCausalLM.from_pretrained(path, device_map="auto", load_in_8bit=True)

	    def __call__(self, data):
	        """Generate one reply for the request payload ``data``.

	        Args:
	            data: Mapping that either has an ``"inputs"`` entry or is itself
	                the inputs mapping. The inputs must provide ``"user_name"``
	                and ``"user_input"``. The ``"inputs"`` key, when present, is
	                popped from ``data``.

	        Returns:
	            The generated text with every ``"Alice Gate:"`` occurrence removed
	            and surrounding whitespace stripped. Generation stops at the first
	            chunk that decodes to exactly ``"\\n"``, or when the stream ends.
	            If anything raises, a dict ``{"error": <message>}`` is returned
	            instead of propagating the exception.
	        """
	        inputs = data.pop("inputs", data)
	        try:
	            prompt = template.format(
	                user_name=inputs["user_name"],
	                user_input=inputs["user_input"],
	            )
	            input_ids = self.tokenizer(
	                prompt,
	                return_tensors="pt",
	            ).input_ids.to('cuda')
	            stream_generator = self.model.generate(
	                input_ids,
	                max_new_tokens=100,
	                do_sample=True,
	                do_stream=True,
	                temperature=0.5,
	                top_p=0.9,
	                top_k=0,
	                repetition_penalty=1.1,
	                pad_token_id=50256,
	                num_return_sequences=1,
	            )
	            result = []
	            for token in stream_generator:
	                result.append(self.tokenizer.decode(token))
	                # A lone newline marks the end of the reply line.
	                if result[-1] == "\n":
	                    break
	            # The stream may also run dry without ever yielding a newline
	            # (e.g. once max_new_tokens is reached). Return what was produced
	            # in that case too, rather than falling through to an implicit None.
	            return "".join(result).replace("Alice Gate:", "").strip()
	        except Exception as e:
	            # Endpoint boundary: report the failure to the caller as data.
	            return {
	                "error": str(e)
	            }