import sys
import json

import fire
import torch
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

from utils.prompter import Prompter
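
# Run generation inputs on the GPU when one is available, otherwise the CPU.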
device = "cuda" if torch.cuda.is_available() else "cpu"


class Infer:
    def __init__(
        self,
        load_8bit: bool = False,
        base_model: str = "",
        lora_weights: str = "",
        prompt_template: str = "",
    ):
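        # Build the prompt formatter and tokenizer, then load the base LLaMA
        # model; device_map="auto" lets accelerate place the weights.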
        prompter = Prompter(prompt_template)
        tokenizer = LlamaTokenizer.from_pretrained(base_model)
        model = LlamaForCausalLM.from_pretrained(
            base_model,
            load_in_8bit=load_8bit,
            torch_dtype=torch.float16,
            device_map="auto",
        )
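
        # Try to attach the LoRA adapter on top of the base model; if the
        # weights cannot be loaded, continue with the plain base model.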
        try:
            print(f"Using lora {lora_weights}")
            model = PeftModel.from_pretrained(
                model,
                lora_weights,
                torch_dtype=torch.float16,
            )
        except Exception:
            print("*" * 50, "\n Attention! No Lora Weights \n", "*" * 50)
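
        # Pin the token ids to the standard LLaMA vocabulary
        # (pad/unk = 0, bos = 1, eos = 2); some released checkpoints ship
        # with broken values in their config.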
        model.config.pad_token_id = tokenizer.pad_token_id = 0
        model.config.bos_token_id = 1
        model.config.eos_token_id = 2

        if not load_8bit:
            model.half()
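
        # Switch to eval mode; on PyTorch 2.x (non-Windows) compile the
        # model for faster inference.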
        model.eval()

        if torch.__version__ >= "2" and sys.platform != "win32":
            model = torch.compile(model)

        self.base_model = base_model
        self.lora_weights = lora_weights
        self.model = model
        self.prompter = prompter
        self.tokenizer = tokenizer

    def generate_output(
        self,
        instruction,
        input=None,
        temperature=0.1,
        top_p=0.75,
        top_k=40,
        num_beams=1,
        max_new_tokens=256,
        **kwargs,
    ):
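        # Build the prompt, tokenize it, and decode a completion with the
        # given sampling settings. `input` mirrors the prompter's field name
        # and intentionally shadows the builtin.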
        prompt = self.prompter.generate_prompt(instruction, input)
        inputs = self.tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"].to(device)
        generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            **kwargs,
        )
        with torch.no_grad():
            generation_output = self.model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=True,
                max_new_tokens=max_new_tokens,
            )
        s = generation_output.sequences[0]
        output = self.tokenizer.decode(s)
        return self.prompter.get_response(output)
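
    # Expects a JSON Lines file where every record carries "instruction" and
    # "output"; prints the model's answer next to the ground truth.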
    def infer_from_file(self, infer_data_path):
        with open(infer_data_path) as f:
            for line in f:
                data = json.loads(line)
                instruction = data["instruction"]
                output = data["output"]
                print('=' * 100)
                print(f"Base Model: {self.base_model} Lora Weights: {self.lora_weights}")
                print("Instruction:\n", instruction)
                model_output = self.generate_output(instruction)
                print("Model Output:\n", model_output)
                print("Ground Truth:\n", output)
                print('=' * 100)


def main(
    load_8bit: bool = False,
    base_model: str = "",
    lora_weights: str = "",
    prompt_template: str = "",
    infer_data_path: str = "",
):
    infer = Infer(
        load_8bit=load_8bit,
        base_model=base_model,
        lora_weights=lora_weights,
        prompt_template=prompt_template,
    )
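
    # Prefer batch inference from infer_data_path; if the file cannot be
    # read, fall back to an interactive prompt loop.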
    try:
        infer.infer_from_file(infer_data_path)
    except Exception as e:
        print(e, "Reading infer_data_path failed! Entering interactive mode:")
        while True:
            print('=' * 100)
            instruction = input("Please enter your question: ")
            print("LaWGPT:")
            print(infer.generate_output(instruction))
            print('=' * 100)


if __name__ == "__main__":
    fire.Fire(main)