|
import gradio as gr |
|
import tiktoken |
|
import html |
|
|
|
def get_color(token_index):
    """Return the hex color used to highlight the token at ``token_index``.

    The color comes from a fixed six-entry palette that is walked
    cyclically, so positions six apart share the same color.
    """
    palette = (
        "#E53935",
        "#1E88E5",
        "#43A047",
        "#FDD835",
        "#FB8C00",
        "#8E24AA",
    )
    # Wrap the position around the palette length to pick a slot.
    slot = token_index % len(palette)
    return palette[slot]
|
|
|
def _render_tokens(encoding, tokens):
    """Return ``tokens`` as concatenated HTML spans, one per token, colored by position."""
    # NOTE(review): each token is decoded on its own, so a character whose
    # bytes are split across several tokens cannot be reassembled here and
    # presumably renders as replacement characters — confirm against
    # tiktoken's ``decode`` error handling.
    return "".join(
        f"<span style='color:{get_color(index)}'>"
        f"{html.escape(encoding.decode([token]))}</span>"
        for index, token in enumerate(tokens)
    )


def _reduction_percent(old_count, new_count):
    """Return how many percent smaller ``new_count`` is than ``old_count`` (2 decimals)."""
    if old_count == 0:
        # No baseline to compare against (e.g. empty input); report no change
        # instead of dividing by zero.
        return 0.0
    return round((1 - new_count / old_count) * 100, 2)


def highlight_differences(text):
    """Build an HTML report comparing GPT-4 and GPT-4o tokenization of ``text``.

    The text is encoded with the tokenizers that tiktoken associates with the
    "gpt-4" and "gpt-4o" models. The report contains a summary (character
    count, token counts, and the percentage reduction of the GPT-4o count
    relative to the GPT-4 count) followed by the text rendered token by
    token, each token wrapped in a colored ``<span>``.

    Args:
        text: The text to tokenize. ``None`` is treated as an empty string.

    Returns:
        An HTML fragment as a string.
    """
    if text is None:
        text = ""

    enc_old = tiktoken.encoding_for_model("gpt-4")
    enc_new = tiktoken.encoding_for_model("gpt-4o")

    # The input is free-form user text: encode special-token literals such as
    # "<|endoftext|>" as ordinary text rather than letting encode() raise.
    tokens_old = enc_old.encode(text, disallowed_special=())
    tokens_new = enc_new.encode(text, disallowed_special=())

    old_count = len(tokens_old)
    new_count = len(tokens_new)
    reduction = _reduction_percent(old_count, new_count)

    rendered_old = _render_tokens(enc_old, tokens_old)
    rendered_new = _render_tokens(enc_new, tokens_new)

    description = f"""<h2>サマリー</h2>
文字数: {len(text)}<br />
GPT-4 (cl100k_base) tokens: {old_count}<br />
GPT-4o (o200k_base) tokens: {new_count}, ({reduction} % ダウン)<br />
<br />
<h2>比較</h2>
GPT-4 ({old_count} tokens):<br /><br />
{rendered_old}<br />
<br />
GPT-4o ({new_count} tokens):<br /><br />
{rendered_new}
"""

    return description
|
|
|
# Gradio UI: a multi-line text box whose content is passed to
# highlight_differences, with the returned markup shown in an HTML output.
demo = gr.Interface(
    title="GPT-4 tokenizer vs GPT-4o tokenizer",
    description="cl100k_base と o200k_base の違いを可視化します。tiktoken を使えばローカルでも試せます。 https://platform.openai.com/tokenizer が近日対応予定です。",
    fn=highlight_differences,
    inputs=gr.Textbox(
        placeholder="テキストを入力してください...",
        lines=10,
    ),
    outputs="html",
)


# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()