File size: 1,789 Bytes
f0ee1e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import gradio as gr
import tiktoken
import html

def get_color(token_index):
    """Return the hex color used to display the token at *token_index*.

    Colors are taken from a fixed six-entry palette and repeat cyclically,
    so neighbouring tokens always get different colors.
    """
    palette = (
        "#E53935",
        "#1E88E5",
        "#43A047",
        "#FDD835",
        "#FB8C00",
        "#8E24AA",
    )
    slot = token_index % len(palette)
    return palette[slot]

def _render_tokens(encoding, tokens):
    """Return HTML in which every token of *tokens* is a colored ``<span>``.

    Each token is decoded on its own with *encoding*, HTML-escaped, and
    colored by its position via ``get_color``.
    """
    # NOTE(review): decoding one token at a time means a character whose
    # bytes are split across several tokens cannot be reconstructed per
    # span; confirm how tiktoken's ``decode`` renders such partial tokens.
    return "".join(
        f"<span style='color:{get_color(i)}'>"
        f"{html.escape(encoding.decode([token]))}</span>"
        for i, token in enumerate(tokens)
    )


def highlight_differences(text):
    """Build an HTML report comparing GPT-4 and GPT-4o tokenization of *text*.

    The report contains the character count, the token count under each
    model's encoding, the percentage by which the GPT-4o count is lower than
    the GPT-4 count, and the text rendered token by token with alternating
    colors for both encodings.

    Args:
        text: The input string. ``None`` is treated as an empty string.

    Returns:
        An HTML fragment as a string.
    """
    if text is None:
        text = ""

    # GPT-4
    enc_old = tiktoken.encoding_for_model("gpt-4")
    # encode_ordinary treats special-token text such as "<|endoftext|>" as
    # plain text; plain encode() raises ValueError on it by default.
    tokens_old = enc_old.encode_ordinary(text)

    # GPT-4o
    enc_new = tiktoken.encoding_for_model("gpt-4o")
    tokens_new = enc_new.encode_ordinary(text)

    # Empty input yields zero tokens; avoid dividing by zero and report no
    # reduction in that case.
    if tokens_old:
        reduction = round((1 - len(tokens_new) / len(tokens_old)) * 100, 2)
    else:
        reduction = 0.0

    html_old = _render_tokens(enc_old, tokens_old)
    html_new = _render_tokens(enc_new, tokens_new)

    description = f"""<h2>サマリー</h2>
文字数: {len(text)}<br />
GPT-4 (cl100k_base) tokens: {len(tokens_old)}<br />
GPT-4o (o200k_base) tokens: {len(tokens_new)}, ({reduction} % ダウン)<br />
<br />
<h2>比較</h2>
GPT-4 ({len(tokens_old)} tokens):<br /><br />
{html_old}<br />
<br />
GPT-4o ({len(tokens_new)} tokens):<br /><br />
{html_new}
"""

    return description

# Gradio UI: a 10-line text box feeds highlight_differences, and the HTML
# string it returns is rendered as the output.
demo = gr.Interface(
    fn=highlight_differences,
    inputs=gr.Textbox(lines=10, placeholder="テキストを入力してください..."),
    outputs="html",
    title="GPT-4 tokenizer vs GPT-4o tokenizer",
    description="cl100k_base と o200k_base の違いを可視化します。tiktoken を使えばローカルでも試せます。 https://platform.openai.com/tokenizer が近日対応予定です。",
)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()