import os
import sys

import fire
import gradio as gr
import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

from utils.callbacks import Iteratorize, Stream
from utils.prompter import Prompter
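
# Pick the compute device: prefer CUDA, fall back to Apple MPS when available,
# otherwise run on CPU.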
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

try:
    if torch.backends.mps.is_available():
        device = "mps"
except Exception:  # torch.backends.mps may be missing on older torch builds
    pass
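
# Load the base LLaMA model, attach the LaWGPT LoRA adapter when available, and
# serve a Gradio web demo for answering law-related questions.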
def main(
    load_8bit: bool = False,
    base_model: str = "huggyllama/llama-7b",
    lora_weights: str = "entity303/lawgpt-lora-7b-v2",
    prompt_template: str = "",
    server_name: str = "0.0.0.0",
    share_gradio: bool = True,
):
    base_model = base_model or os.environ.get("BASE_MODEL", "")
    assert (
        base_model
    ), "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"

    prompter = Prompter(prompt_template)
    tokenizer = LlamaTokenizer.from_pretrained(base_model)
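
    # Load the base weights for the detected device: fp16 (optionally 8-bit) on
    # CUDA, fp16 on MPS, or a low-memory load on CPU. In every branch the LoRA
    # adapter is then applied on top when it can be fetched.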
if device == "cuda":
|
|
model = LlamaForCausalLM.from_pretrained(
|
|
base_model,
|
|
load_in_8bit=load_8bit,
|
|
torch_dtype=torch.float16,
|
|
device_map="auto",
|
|
)
|
|
try:
|
|
model = PeftModel.from_pretrained(
|
|
model,
|
|
lora_weights,
|
|
torch_dtype=torch.float16,
|
|
)
|
|
except:
|
|
print("*"*50, "\n Attention! No Lora Weights \n", "*"*50)
|
|
elif device == "mps":
|
|
model = LlamaForCausalLM.from_pretrained(
|
|
base_model,
|
|
device_map={"": device},
|
|
torch_dtype=torch.float16,
|
|
)
|
|
try:
|
|
model = PeftModel.from_pretrained(
|
|
model,
|
|
lora_weights,
|
|
device_map={"": device},
|
|
torch_dtype=torch.float16,
|
|
)
|
|
except:
|
|
print("*"*50, "\n Attention! No Lora Weights \n", "*"*50)
|
|
else:
|
|
model = LlamaForCausalLM.from_pretrained(
|
|
base_model, device_map={"": device}, low_cpu_mem_usage=True
|
|
)
|
|
try:
|
|
model = PeftModel.from_pretrained(
|
|
model,
|
|
lora_weights,
|
|
device_map={"": device},
|
|
)
|
|
except:
|
|
print("*"*50, "\n Attention! No Lora Weights \n", "*"*50)
|
|
|
|
|
|
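
    # LLaMA tokenizers ship without a pad token; reuse id 0 for padding and pin
    # the standard BOS (1) / EOS (2) ids so generation terminates as expected.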
    model.config.pad_token_id = tokenizer.pad_token_id = 0
    model.config.bos_token_id = 1
    model.config.eos_token_id = 2

    # Cast to fp16 unless the weights were already loaded in 8-bit.
    if not load_8bit:
        model.half()

    model.eval()
    if torch.__version__ >= "2" and sys.platform != "win32":
        # PyTorch 2.x can compile the model for faster inference (not on Windows).
        model = torch.compile(model)
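
    # Build the prompt, run generation, and yield the decoded response. With
    # stream_output enabled, partial outputs are yielded as tokens arrive.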
    def evaluate(
        instruction,
        temperature=0.1,
        top_p=0.75,
        top_k=40,
        num_beams=4,
        max_new_tokens=128,
        stream_output=False,
        **kwargs,
    ):
        # The UI only collects an instruction, so no auxiliary input is passed
        # to the prompt template.
        prompt = prompter.generate_prompt(instruction, None)
        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"].to(device)
        generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            **kwargs,
        )

        generate_params = {
            "input_ids": input_ids,
            "generation_config": generation_config,
            "return_dict_in_generate": True,
            "output_scores": True,
            "max_new_tokens": max_new_tokens,
        }
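
        # Streaming path: model.generate() blocks until it finishes, so it runs
        # inside Iteratorize with a Stream stopping criterion that hands each
        # newly generated sequence to the iterator's callback.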
        if stream_output:
            def generate_with_callback(callback=None, **kwargs):
                kwargs.setdefault(
                    "stopping_criteria", transformers.StoppingCriteriaList()
                )
                kwargs["stopping_criteria"].append(
                    Stream(callback_func=callback)
                )
                with torch.no_grad():
                    model.generate(**kwargs)

            def generate_with_streaming(**kwargs):
                return Iteratorize(
                    generate_with_callback, kwargs, callback=None
                )

            with generate_with_streaming(**generate_params) as generator:
                for output in generator:
                    decoded_output = tokenizer.decode(output)

                    if output[-1] in [tokenizer.eos_token_id]:
                        break

                    yield prompter.get_response(decoded_output)
                    print(decoded_output)
            return  # early return when streaming
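
        # Non-streaming path: generate the full sequence in one call and yield
        # the parsed response once.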
        with torch.no_grad():
            generation_output = model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=True,
                max_new_tokens=max_new_tokens,
            )
        s = generation_output.sequences[0]
        output = tokenizer.decode(s)
        print(output)
        yield prompter.get_response(output)
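
    # Gradio UI: an instruction box plus sampling controls mapped, in order, to
    # evaluate()'s parameters; the response is shown in a single text box.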
    gr.Interface(
        fn=evaluate,
        inputs=[
            gr.components.Textbox(
                lines=2,
                label="Instruction",
                placeholder="此处输入法律相关问题",  # "Enter a law-related question here"
            ),
            gr.components.Slider(
                minimum=0, maximum=1, value=0.1, label="Temperature"
            ),
            gr.components.Slider(
                minimum=0, maximum=1, value=0.75, label="Top p"
            ),
            gr.components.Slider(
                minimum=0, maximum=100, step=1, value=40, label="Top k"
            ),
            gr.components.Slider(
                minimum=1, maximum=4, step=1, value=1, label="Beams"
            ),
            gr.components.Slider(
                minimum=1, maximum=2000, step=1, value=256, label="Max tokens"
            ),
            gr.components.Checkbox(label="Stream output", value=True),
        ],
        outputs=[
            gr.components.Textbox(
                lines=8,
                label="Output",
            )
        ],
        title="🦙🌲 LaWGPT",
        description="",
    ).queue().launch(server_name=server_name, share=share_gradio)
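
# Example launch (the script name below is assumed; adjust to this file's path):
#   python webui.py --base_model='huggyllama/llama-7b' \
#       --lora_weights='entity303/lawgpt-lora-7b-v2' --load_8bit=True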
if __name__ == "__main__":
    fire.Fire(main)