sakasegawa commited on
Commit
f0ee1e7
1 Parent(s): daf46a3
Files changed (3) hide show
  1. README.md +1 -1
  2. app.py +56 -0
  3. requirements.txt +2 -0
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Gpt 4o Tokenizer Vs Gpt 4 Tokenizer
3
  emoji: 📉
4
  colorFrom: red
5
  colorTo: indigo
 
1
  ---
2
+ title: GPT 4o Tokenizer VS GPT 4 Tokenizer
3
  emoji: 📉
4
  colorFrom: red
5
  colorTo: indigo
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import tiktoken
3
+ import html
4
+
5
def get_color(token_index, palette=None):
    """Return a hex color for a token, cycling through a fixed palette.

    Args:
        token_index: Zero-based position of the token in the token stream.
        palette: Optional sequence of hex color strings to cycle through.
            Defaults to the built-in six-color palette, so existing callers
            are unaffected.

    Returns:
        A hex color string such as "#E53935".
    """
    if palette is None:
        # Default palette: six visually distinct colors (red, blue, green,
        # yellow, orange, purple) so adjacent tokens are distinguishable.
        palette = ("#E53935", "#1E88E5", "#43A047", "#FDD835", "#FB8C00", "#8E24AA")
    return palette[token_index % len(palette)]
8
+
9
def _colorize_tokens(enc, tokens):
    """Render each token as a colored HTML <span>, cycling colors by position."""
    return [
        f"<span style='color:{get_color(i)}'>{html.escape(enc.decode([token]))}</span>"
        for i, token in enumerate(tokens)
    ]

def highlight_differences(text):
    """Tokenize *text* with both tokenizers and return an HTML comparison.

    Encodes the input with the GPT-4 (cl100k_base) and GPT-4o (o200k_base)
    tokenizers, colors each token, and builds an HTML report with a summary
    (character count, token counts, percentage reduction) followed by the
    two colorized token streams.

    Args:
        text: The input text to tokenize.

    Returns:
        An HTML string suitable for a Gradio "html" output.
    """
    # GPT-4 (cl100k_base)
    enc_old = tiktoken.encoding_for_model("gpt-4")
    tokens_old = enc_old.encode(text)

    # GPT-4o (o200k_base) — the original comment wrongly said "GPT-4" here
    enc_new = tiktoken.encoding_for_model("gpt-4o")
    tokens_new = enc_new.encode(text)

    result_old = _colorize_tokens(enc_old, tokens_old)
    result_new = _colorize_tokens(enc_new, tokens_new)

    # Guard against empty input: the original divided by len(tokens_old),
    # which raises ZeroDivisionError when the textbox is empty.
    if tokens_old:
        reduction = round((1 - len(tokens_new) / len(tokens_old)) * 100, 2)
    else:
        reduction = 0.0

    description = f"""<h2>サマリー</h2>
文字数: {len(text)}<br />
GPT-4 (cl100k_base) tokens: {len(tokens_old)}<br />
GPT-4o (o200k_base) tokens: {len(tokens_new)}, ({reduction} % ダウン)<br />
<br />
<h2>比較</h2>
GPT-4 ({len(tokens_old)} tokens):<br /><br />
{''.join(result_old)}<br />
<br />
GPT-4o ({len(tokens_new)} tokens):<br /><br />
{''.join(result_new)}
"""

    return description
46
+
47
# Gradio wiring: one multi-line textbox in, raw HTML out.
# The handler returns a complete HTML fragment, so the "html" output
# component renders it directly.
_text_input = gr.Textbox(lines=10, placeholder="テキストを入力してください...")

demo = gr.Interface(
    fn=highlight_differences,
    inputs=_text_input,
    outputs="html",
    title="GPT-4 tokenizer vs GPT-4o tokenizer",
    description="cl100k_base と o200k_base の違いを可視化します。tiktoken を使えばローカルでも試せます。 https://platform.openai.com/tokenizer が近日対応予定です。",
)

# Launch the app only when run as a script (Spaces also executes this path).
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ tiktoken==0.7.0
2
+ gradio