# Mixtral_ether / benchmark.py
# jeduardogruiz's picture
# Upload 2 files
# 637a7d6 verified
import base64
import functools
import gzip
import json
import os
import random
import time
from typing import Any, cast
import blobfile.boto3
import numpy as np
import tiktoken
def benchmark_batch(documents: list[str]) -> None:
    """Print batch-encoding throughput of tiktoken and Hugging Face GPT-2.

    Both tokenizers encode the same ``documents`` and each result is reported
    as UTF-8 input bytes per second, so the two printed figures are directly
    comparable.

    Args:
        documents: Texts to encode. Their combined UTF-8 byte length is the
            numerator of both throughput figures.

    Raises:
        KeyError: If the ``RAYON_NUM_THREADS`` environment variable is unset.
        ValueError: If ``RAYON_NUM_THREADS`` is not an integer string.
    """
    num_threads = int(os.environ["RAYON_NUM_THREADS"])
    num_bytes = sum(map(len, map(str.encode, documents)))
    print(f"num_threads: {num_threads}, num_bytes: {num_bytes}")

    enc = tiktoken.get_encoding("gpt2")
    # Untimed warm-up call so first-use setup is kept out of the measurement.
    enc.encode("warmup")

    start = time.perf_counter_ns()
    enc.encode_ordinary_batch(documents, num_threads=num_threads)
    end = time.perf_counter_ns()
    # perf_counter_ns yields nanoseconds; the 1e9 factor converts to per-second.
    print(f"tiktoken \t{num_bytes / (end - start) * 1e9} bytes / s")

    # Imported lazily so transformers is only loaded when the benchmark runs.
    import transformers

    # cast(Any, ...) sidesteps static type checking of the transformers module.
    hf_enc = cast(Any, transformers).GPT2TokenizerFast.from_pretrained("gpt2")
    # Huge limit to silence the tokenizer (presumably its sequence-length
    # warning on long inputs -- TODO confirm).
    hf_enc.model_max_length = 1e30
    # NOTE(review): pad token presumably required for batched calls -- confirm.
    hf_enc.pad_token = hf_enc.eos_token
    hf_enc.encode("warmup")

    start = time.perf_counter_ns()
    # Encode the same batch as tiktoken above; timing a different input would
    # make the two throughput numbers incomparable.
    hf_enc(documents)
    end = time.perf_counter_ns()
    print(f"huggingface \t{num_bytes / (end - start) * 1e9} bytes / s")