# File size: 1,789 Bytes | f0ee1e7
import gradio as gr
import tiktoken
import html
def get_color(token_index):
    """Return the hex color string used for the token at ``token_index``.

    The color is picked from a fixed six-entry palette; indices beyond the
    palette length wrap around, so the colors repeat cyclically.
    """
    palette = (
        "#E53935",
        "#1E88E5",
        "#43A047",
        "#FDD835",
        "#FB8C00",
        "#8E24AA",
    )
    slot = token_index % len(palette)
    return palette[slot]
def _render_tokens(encoding, tokens):
    """Return ``tokens`` as one HTML string with a colored ``<span>`` per token."""
    # Each token is decoded on its own so that span boundaries coincide with
    # token boundaries. The decoded text is HTML-escaped before being embedded.
    # NOTE(review): a token holding only part of a multi-byte character
    # presumably decodes to a replacement character when decoded alone --
    # confirm against tiktoken's decode documentation.
    return "".join(
        f"<span style='color:{get_color(i)}'>{html.escape(encoding.decode([token]))}</span>"
        for i, token in enumerate(tokens)
    )


def highlight_differences(text):
    """Build an HTML report comparing GPT-4 and GPT-4o tokenization of ``text``.

    The report contains a summary (character count, token count for each
    tokenizer, and the percentage reduction of the GPT-4o count relative to
    the GPT-4 count) followed by the text rendered twice, once per tokenizer,
    with every token wrapped in a colored ``<span>``.

    Args:
        text: The text to tokenize. ``None`` is treated as the empty string.

    Returns:
        The HTML report as a string.
    """
    if text is None:
        text = ""

    # GPT-4
    enc_old = tiktoken.encoding_for_model("gpt-4")
    tokens_old = enc_old.encode(text)

    # GPT-4o
    enc_new = tiktoken.encoding_for_model("gpt-4o")
    tokens_new = enc_new.encode(text)

    # Empty input yields zero tokens; report a 0.0 % reduction instead of
    # dividing by zero.
    if tokens_old:
        reduction = round((1 - len(tokens_new) / len(tokens_old)) * 100, 2)
    else:
        reduction = 0.0

    rendered_old = _render_tokens(enc_old, tokens_old)
    rendered_new = _render_tokens(enc_new, tokens_new)

    description = f"""<h2>サマリー</h2>
文字数: {len(text)}<br />
GPT-4 (cl100k_base) tokens: {len(tokens_old)}<br />
GPT-4o (o200k_base) tokens: {len(tokens_new)}, ({reduction} % ダウン)<br />
<br />
<h2>比較</h2>
GPT-4 ({len(tokens_old)} tokens):<br /><br />
{rendered_old}<br />
<br />
GPT-4o ({len(tokens_new)} tokens):<br /><br />
{rendered_new}
"""
    return description
# Gradio UI: a multi-line textbox feeds highlight_differences, and the HTML
# string it returns is displayed through the "html" output component.
demo = gr.Interface(
    fn=highlight_differences,
    inputs=gr.Textbox(lines=10, placeholder="テキストを入力してください..."),
    outputs="html",
    title="GPT-4 tokenizer vs GPT-4o tokenizer",
    description="cl100k_base と o200k_base の違いを可視化します。tiktoken を使えばローカルでも試せます。 https://platform.openai.com/tokenizer が近日対応予定です。",
)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()