|
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm

DEVICE = "cuda:1"
NUM_RUNS = 10
MAX_NEW_TOKENS = 1000
TEXT_INPUT = "def sieve_of_eratosthenes():"

repo_id = "gg-hf/gemma-2-2b-it"
model = AutoModelForCausalLM.from_pretrained(repo_id).to(DEVICE)

# Optional assistant model for assisted generation; left as None so this
# benchmark measures plain autoregressive decoding.
assistant_model = None
tokenizer = AutoTokenizer.from_pretrained(repo_id, use_fast=True)
model_inputs = tokenizer(TEXT_INPUT, return_tensors="pt").to(DEVICE)

generate_kwargs = {
    "max_new_tokens": MAX_NEW_TOKENS,
    "do_sample": True,
    "temperature": 0.2,
    "eos_token_id": -1,  # invalid EOS id: generation never stops early, always produces MAX_NEW_TOKENS
}

# Warm-up runs so that one-time CUDA initialization does not skew the timings below.
print("Warming up...")
for _ in range(2):
    gen_out = model.generate(**model_inputs, **generate_kwargs)
print("Done!")


def measure_generate(model, model_inputs, generate_kwargs):
    """Time NUM_RUNS calls to generate() with CUDA events; report peak memory and throughput."""
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    torch.cuda.reset_peak_memory_stats(DEVICE)
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

    start_event.record()
    for _ in tqdm(range(NUM_RUNS)):
        gen_out = model.generate(**model_inputs, **generate_kwargs)
    end_event.record()

    torch.cuda.synchronize()
    max_memory = torch.cuda.max_memory_allocated(DEVICE)
    print("Max memory (MB): ", max_memory * 1e-6)
    print("Throughput (tokens/sec): ", (NUM_RUNS * MAX_NEW_TOKENS) / (start_event.elapsed_time(end_event) * 1.0e-3))


measure_generate(model, model_inputs, generate_kwargs)
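
# Optional follow-up (sketch): re-run the benchmark with assisted generation
# (speculative decoding) by loading a draft model into `assistant_model` and
# passing it to `generate()` through `generate_kwargs`. The checkpoint name below
# is a hypothetical placeholder; any smaller model sharing the main model's
# tokenizer would work.
#
# assistant_model = AutoModelForCausalLM.from_pretrained("<small-draft-checkpoint>").to(DEVICE)
# generate_kwargs["assistant_model"] = assistant_model
# measure_generate(model, model_inputs, generate_kwargs)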