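"""Benchmark generation speed (words/s) for the streamer-sales-lelemiao-7b model.

Compares three setups on the same ModelScope checkpoints: HuggingFace
transformers in fp16, LMDeploy (Turbomind) with the original HF weights, and
LMDeploy (Turbomind) with the 4-bit AWQ weights. Results are printed as a table.
"""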
import datetime
from pathlib import Path

import torch
from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline
from modelscope import snapshot_download
from prettytable import PrettyTable
from transformers import AutoModelForCausalLM, AutoTokenizer


def get_lmdeploy_benchmark(model_name, model_format="hf", tag="LMDeploy (Turbomind)"):
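    """Benchmark generation speed with LMDeploy's Turbomind backend.

    Downloads `model_name` from ModelScope, warms the pipeline up, times 10
    generations, and returns a table row: [model name, tag, speed in words/s].
    """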
print(f"Processing {mode_name}")
model_path = snapshot_download(mode_name, revision="master")
backend_config = TurbomindEngineConfig(model_format=model_format, session_len=32768)
gen_config = GenerationConfig(
top_p=0.8,
top_k=40,
temperature=0.7,
# max_new_tokens=4096
)
pipe = pipeline(model_path, backend_config=backend_config)
# warmup
inp = "你好!"
for i in range(5):
print(f"Warm up...[{i+1}/5]")
pipe([inp])
# test speed
times = 10
total_words = 0
start_time = datetime.datetime.now()
for i in range(times):
response = pipe(["请介绍一下你自己。"], gen_config=gen_config)
total_words += len(response[0].text)
end_time = datetime.datetime.now()
delta_time = end_time - start_time
delta_time = delta_time.seconds + delta_time.microseconds / 1000000.0
speed = total_words / delta_time
print(f"{Path(model_path).name:<10}, {speed:.3f}")
return [Path(model_path).name, tag, round(speed, 4)]
def get_hf_benchmark(model_name, tag="transformers"):
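    """Benchmark generation speed with HuggingFace transformers (fp16 on GPU).

    Uses the model's `chat()` interface; note that `history` carries over from
    the warmup and grows across the 10 timed rounds, so later rounds process
    longer contexts than the LMDeploy test does.
    """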
print(f"Processing {model_name}")
model_path = snapshot_download(model_name, revision="master")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Set `torch_dtype=torch.float16` to load model in float16, otherwise it will be loaded as float32 and cause OOM Error.
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True).cuda()
model = model.eval()
# warmup
inp = "你好!"
for i in range(5):
print(f"Warm up...[{i + 1}/5]")
response, history = model.chat(tokenizer, inp, history=[])
# test speed
inp = "请介绍一下你自己。"
times = 10
total_words = 0
start_time = datetime.datetime.now()
for i in range(times):
response, history = model.chat(tokenizer, inp, history=history)
total_words += len(response)
end_time = datetime.datetime.now()
delta_time = end_time - start_time
delta_time = delta_time.seconds + delta_time.microseconds / 1000000.0
speed = total_words / delta_time
print(f"{Path(model_path).name:<10}, {speed:.3f}")
return [Path(model_path).name, tag, round(speed, 4)]
if __name__ == "__main__":
    table = PrettyTable()
    table.field_names = ["Model", "Toolkit", "Speed (words/s)"]
    table.add_row(get_hf_benchmark("HinGwenWoong/streamer-sales-lelemiao-7b"))
    table.add_row(get_lmdeploy_benchmark("HinGwenWoong/streamer-sales-lelemiao-7b", model_format="hf"))
    table.add_row(get_lmdeploy_benchmark("HinGwenWoong/streamer-sales-lelemiao-7b-4bit", model_format="awq"))
    print(table)
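
# To run (assumes a CUDA GPU with enough memory for a 7B model in fp16, plus the
# torch, lmdeploy, modelscope, transformers and prettytable packages installed):
#     python benchmark_speed.py  # script name is illustrative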