import base64
import functools
import gzip
import json
import os
import random
import time
from typing import Any, cast

import blobfile
import numpy as np
import tiktoken


def benchmark_batch(documents: list[str]) -> None:
    """Benchmark tiktoken against the Hugging Face GPT-2 tokenizer on one batch."""
    num_threads = int(os.environ["RAYON_NUM_THREADS"])
    num_bytes = sum(map(len, map(str.encode, documents)))
    print(f"num_threads: {num_threads}, num_bytes: {num_bytes}")

    enc = tiktoken.get_encoding("gpt2")
    # warm up so one-time setup cost stays out of the timed region
    enc.encode("warmup")

    start = time.perf_counter_ns()
    enc.encode_ordinary_batch(documents, num_threads=num_threads)
    end = time.perf_counter_ns()
    print(f"tiktoken \t{num_bytes / (end - start) * 1e9} bytes / s")

    import transformers

    hf_enc = cast(Any, transformers).GPT2TokenizerFast.from_pretrained("gpt2")
    hf_enc.model_max_length = 1e30  # silence the sequence-length warning
    hf_enc.pad_token = hf_enc.eos_token
    hf_enc.encode("warmup")

    start = time.perf_counter_ns()
    # tokenize the same batch so the two throughput numbers are comparable
    hf_enc(documents)
    end = time.perf_counter_ns()
    print(f"huggingface \t{num_bytes / (end - start) * 1e9} bytes / s")