jeduardogruiz committed on
Commit
637a7d6
1 Parent(s): e4c51fe

Upload 2 files

Browse files
Files changed (2) hide show
  1. benchmark.py +40 -0
  2. redact.py +67 -0
benchmark.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import functools
3
+ import gzip
4
+ import json
5
+ import os
6
+ import random
7
+ import time
8
+ from typing import Any, cast
9
+
10
+ import blobfile.boto3
11
+ import numpy as np
12
+ import tiktoken
13
+
14
+
15
def benchmark_batch(documents: list[str]) -> None:
    """Print batch-encoding throughput of tiktoken and HuggingFace on the same input.

    Both tokenizers encode the identical ``documents`` list, and both
    throughput figures are computed from the same UTF-8 byte count, so the two
    printed numbers are directly comparable.

    Args:
        documents: Texts to encode as one batch.

    Raises:
        KeyError: If the ``RAYON_NUM_THREADS`` environment variable is unset.
        ValueError: If ``RAYON_NUM_THREADS`` is not an integer.
    """
    num_threads = int(os.environ["RAYON_NUM_THREADS"])
    # Throughput is reported per UTF-8 byte of input, not per character.
    num_bytes = sum(map(len, map(str.encode, documents)))
    print(f"num_threads: {num_threads}, num_bytes: {num_bytes}")

    enc = tiktoken.get_encoding("gpt2")
    # One throwaway call so one-time setup cost is kept out of the timed region.
    enc.encode("warmup")

    start = time.perf_counter_ns()
    enc.encode_ordinary_batch(documents, num_threads=num_threads)
    end = time.perf_counter_ns()
    print(f"tiktoken \t{num_bytes / (end - start) * 1e9} bytes / s")

    # Imported lazily, as in the original, so it is only loaded when needed here.
    import transformers

    hf_enc = cast(Any, transformers).GPT2TokenizerFast.from_pretrained("gpt2")
    # Effectively unlimited; presumably avoids length warnings on long documents.
    hf_enc.model_max_length = 1e30
    hf_enc.pad_token = hf_enc.eos_token
    hf_enc.encode("warmup")

    start = time.perf_counter_ns()
    # Encode the same batch that tiktoken was timed on.
    hf_enc(documents)
    end = time.perf_counter_ns()
    print(f"huggingface \t{num_bytes / (end - start) * 1e9} bytes / s")
39
+
40
+
redact.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import re
3
+ import subprocess
4
+ from pathlib import Path
5
+
6
+
7
def redact_file(path: Path, dry_run: bool) -> None:
    """Delete or strip redaction-marked content from a single file.

    Behaviour, in order:
      * Missing paths, directories and empty files are ignored silently.
      * Files that cannot be decoded as text are reported as skipped instead
        of aborting the whole run.
      * If the first line contains ``redact``, the whole file is deleted.
      * Otherwise, text enclosed between ``redact-beg`` / ``redact-end``
        marker lines (``#`` or ``<!---`` flavoured) is removed together with
        the marker lines themselves.

    Args:
        path: File to inspect.
        dry_run: When true, only report what would happen; nothing on disk is
            modified.
    """
    if not path.exists() or path.is_dir():
        return

    try:
        text = path.read_text()
    except UnicodeDecodeError:
        # Not decodable as text, so it cannot carry text markers; leave it alone.
        print(f"Skipped {path}")
        return
    if not text:
        return

    first_line = text.splitlines()[0]
    if "redact" in first_line:
        if not dry_run:
            path.unlink()
        print(f"Deleted {path}")
        return

    # Each marker may be preceded by indentation spaces and must end its line.
    pattern = "|".join(
        r" *" + re.escape(x)
        for x in [
            "# ===== redact-beg =====\n",
            "# ===== redact-end =====\n",
            "<!--- redact-beg -->\n",
            "<!--- redact-end -->\n",
        ]
    )

    if re.search(pattern, text):
        # Splitting on the markers yields alternating segments; the even-indexed
        # ones are kept. This assumes beg/end markers strictly alternate, so the
        # odd-indexed segments are the enclosed (redacted) regions.
        redacted_text = "".join(re.split(pattern, text)[::2])
        if not dry_run:
            path.write_text(redacted_text)
        print(f"Redacted {path}")
        return

    print(f"Skipped {path}")
40
+
41
+
42
def redact(dry_run: bool) -> None:
    """Run ``redact_file`` over every file of the enclosing ``tiktoken`` checkout.

    The repository root is taken to be the grandparent directory of this
    script. Tracked files are listed with ``git ls-files``; if git fails or is
    not installed, every path under the root is visited instead.

    Args:
        dry_run: Forwarded to ``redact_file``; when true nothing is modified.

    Raises:
        AssertionError: If the inferred root is not a directory named
            ``tiktoken`` containing a ``pyproject.toml``.
    """
    tiktoken_root = Path(__file__).parent.parent
    assert tiktoken_root.name == "tiktoken"
    assert (tiktoken_root / "pyproject.toml").exists()

    try:
        output = subprocess.check_output(["git", "ls-files"], cwd=tiktoken_root, text=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Not a git checkout, or git is unavailable: walk the tree instead.
        paths = list(tiktoken_root.glob("**/*"))
    else:
        # git prints paths relative to its cwd (the repo root); anchor them
        # there so they resolve correctly from any working directory.
        paths = [tiktoken_root / p for p in output.splitlines()]

    for path in paths:
        redact_file(path, dry_run=dry_run)
55
+
56
+
57
def main() -> None:
    """Command-line entry point: parse ``--dry-run`` and run the redaction.

    Dry-run is the default. It is switched off only by a ``--dry-run`` value
    whose first character is ``f`` or ``F`` (for example ``--dry-run=false``).
    """

    def _as_dry_run_flag(raw: str) -> bool:
        # An empty value, or any value not starting with "f"/"F", keeps dry-run on.
        return not (raw and raw[0].lower() == "f")

    cli = argparse.ArgumentParser()
    cli.add_argument("--dry-run", type=_as_dry_run_flag, default=True)
    options = cli.parse_args()

    is_dry_run = options.dry_run
    redact(is_dry_run)
    if is_dry_run:
        print("Dry run, use --dry-run=false to actually redact files")


if __name__ == "__main__":
    main()