File size: 3,088 Bytes
64a76db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import datetime
from pathlib import Path

import torch
from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline
from prettytable import PrettyTable
from transformers import AutoModelForCausalLM, AutoTokenizer
from modelscope import snapshot_download


def get_lmdeploy_benchmark(mode_name, model_format="hf", tag="LMDeploy (Turbomind)"):
    print(f"Processing {mode_name}")

    model_path = snapshot_download(mode_name, revision="master")

    backend_config = TurbomindEngineConfig(model_format=model_format, session_len=32768)
    gen_config = GenerationConfig(
        top_p=0.8,
        top_k=40,
        temperature=0.7,
        # max_new_tokens=4096
    )
    pipe = pipeline(model_path, backend_config=backend_config)

    # warmup
    inp = "你好!"
    for i in range(5):
        print(f"Warm up...[{i+1}/5]")
        pipe([inp])

    # test speed
    times = 10
    total_words = 0
    start_time = datetime.datetime.now()
    for i in range(times):
        response = pipe(["请介绍一下你自己。"], gen_config=gen_config)
        total_words += len(response[0].text)
    end_time = datetime.datetime.now()

    delta_time = end_time - start_time
    delta_time = delta_time.seconds + delta_time.microseconds / 1000000.0
    speed = total_words / delta_time

    print(f"{Path(model_path).name:<10}, {speed:.3f}")
    return [Path(model_path).name, tag, round(speed, 4)]


def get_hf_benchmark(model_name, tag="transformer"):

    print(f"Processing {model_name}")

    model_path = snapshot_download(model_name, revision="master")

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Set `torch_dtype=torch.float16` to load model in float16, otherwise it will be loaded as float32 and cause OOM Error.
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True).cuda()
    model = model.eval()

    # warmup
    inp = "你好!"
    for i in range(5):
        print(f"Warm up...[{i + 1}/5]")
        response, history = model.chat(tokenizer, inp, history=[])

    # test speed
    inp = "请介绍一下你自己。"
    times = 10
    total_words = 0
    start_time = datetime.datetime.now()
    for i in range(times):
        response, history = model.chat(tokenizer, inp, history=history)
        total_words += len(response)
    end_time = datetime.datetime.now()

    delta_time = end_time - start_time
    delta_time = delta_time.seconds + delta_time.microseconds / 1000000.0
    speed = total_words / delta_time
    print(f"{Path(model_path).name:<10}, {speed:.3f}")
    return [Path(model_path).name, tag, round(speed, 4)]


if __name__ == "__main__":

    table = PrettyTable()
    table.field_names = ["Model", "Toolkit", "Speed (words/s)"]
    table.add_row(get_hf_benchmark("HinGwenWoong/streamer-sales-lelemiao-7b"))
    table.add_row(get_lmdeploy_benchmark("HinGwenWoong/streamer-sales-lelemiao-7b", model_format="hf"))
    table.add_row(get_lmdeploy_benchmark("HinGwenWoong/streamer-sales-lelemiao-7b-4bit", model_format="awq"))
    print(table)