"""Benchmark the generation speed of a HuggingFace Transformers baseline against
LMDeploy's Turbomind backend (HF and AWQ 4-bit model formats) on the same model."""

import datetime
from pathlib import Path

import torch
from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline
from modelscope import snapshot_download
from prettytable import PrettyTable
from transformers import AutoModelForCausalLM, AutoTokenizer


def get_lmdeploy_benchmark(model_name, model_format="hf", tag="LMDeploy (Turbomind)"):
    print(f"Processing {model_name}")
    model_path = snapshot_download(model_name, revision="master")

    backend_config = TurbomindEngineConfig(model_format=model_format, session_len=32768)
    gen_config = GenerationConfig(
        top_p=0.8,
        top_k=40,
        temperature=0.7,
        # max_new_tokens=4096
    )
    pipe = pipeline(model_path, backend_config=backend_config)

    # Warm up so kernel compilation and cache allocation do not skew the timing
    inp = "你好!"  # "Hello!"
    for i in range(5):
        print(f"Warm up...[{i + 1}/5]")
        pipe([inp])

    # Test speed: run the same prompt `times` times and count the generated characters
    times = 10
    total_words = 0
    start_time = datetime.datetime.now()
    for i in range(times):
        response = pipe(["请介绍一下你自己。"], gen_config=gen_config)  # "Please introduce yourself."
        total_words += len(response[0].text)  # len() counts characters of the reply
    end_time = datetime.datetime.now()

    delta_time = (end_time - start_time).total_seconds()
    speed = total_words / delta_time
    print(f"{Path(model_path).name:<10}, {speed:.3f}")
    return [Path(model_path).name, tag, round(speed, 4)]


def get_hf_benchmark(model_name, tag="transformers"):
    print(f"Processing {model_name}")
    model_path = snapshot_download(model_name, revision="master")

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    # Load in float16; the default float32 weights would not fit on the GPU and cause an OOM error
    model = AutoModelForCausalLM.from_pretrained(
        model_path, torch_dtype=torch.float16, trust_remote_code=True
    ).cuda()
    model = model.eval()

    # Warm up so CUDA initialization does not skew the timing
    inp = "你好!"  # "Hello!"
    for i in range(5):
        print(f"Warm up...[{i + 1}/5]")
        response, history = model.chat(tokenizer, inp, history=[])

    # Test speed: run the same prompt `times` times and count the generated characters.
    # Start each round from an empty history so the context length stays fixed,
    # matching the fresh-prompt calls in the LMDeploy benchmark above.
    inp = "请介绍一下你自己。"  # "Please introduce yourself."
    times = 10
    total_words = 0
    start_time = datetime.datetime.now()
    for i in range(times):
        response, history = model.chat(tokenizer, inp, history=[])
        total_words += len(response)  # len() counts characters of the reply
    end_time = datetime.datetime.now()

    delta_time = (end_time - start_time).total_seconds()
    speed = total_words / delta_time
    print(f"{Path(model_path).name:<10}, {speed:.3f}")
    return [Path(model_path).name, tag, round(speed, 4)]


if __name__ == "__main__":
    table = PrettyTable()
    table.field_names = ["Model", "Toolkit", "Speed (words/s)"]

    table.add_row(get_hf_benchmark("HinGwenWoong/streamer-sales-lelemiao-7b"))
    table.add_row(get_lmdeploy_benchmark("HinGwenWoong/streamer-sales-lelemiao-7b", model_format="hf"))
    table.add_row(get_lmdeploy_benchmark("HinGwenWoong/streamer-sales-lelemiao-7b-4bit", model_format="awq"))

    print(table)