import html

import gradio as gr
import tiktoken

# Palette cycled through so that neighbouring tokens get different colors.
_TOKEN_COLORS = ("#E53935", "#1E88E5", "#43A047", "#FDD835", "#FB8C00", "#8E24AA")


def get_color(token_index):
    """Return the highlight color for the token at position ``token_index``.

    Colors repeat once the palette is exhausted.
    """
    return _TOKEN_COLORS[token_index % len(_TOKEN_COLORS)]


def _render_tokens(encoding, tokens):
    """Return ``tokens`` as HTML with one colored ``<span>`` per token."""
    # NOTE: tokens are decoded one at a time, so a token that holds only part
    # of a multi-byte character is shown as U+FFFD (tiktoken decodes with
    # errors="replace" by default).
    return "".join(
        f'<span style="color: {get_color(index)};">'
        f"{html.escape(encoding.decode([token]))}</span>"
        for index, token in enumerate(tokens)
    )


def highlight_differences(text):
    """Tokenize ``text`` with the GPT-4 and GPT-4o encodings and compare them.

    Args:
        text: The user-supplied text. ``None`` is treated as an empty string.

    Returns:
        An HTML fragment containing a summary (character count, token counts
        and the percentage reduction from GPT-4 to GPT-4o) followed by the
        text rendered token by token for each encoding.
    """
    text = text or ""

    # GPT-4 (cl100k_base)
    enc_old = tiktoken.encoding_for_model("gpt-4")
    # GPT-4o (o200k_base)
    enc_new = tiktoken.encoding_for_model("gpt-4o")

    # disallowed_special=() makes strings such as "<|endoftext|>" in the user
    # input encode as ordinary text instead of raising ValueError.
    tokens_old = enc_old.encode(text, disallowed_special=())
    tokens_new = enc_new.encode(text, disallowed_special=())

    # Empty input yields zero tokens; avoid dividing by zero in that case.
    if tokens_old:
        reduction = round((1 - len(tokens_new) / len(tokens_old)) * 100, 2)
    else:
        reduction = 0.0

    rendered_old = _render_tokens(enc_old, tokens_old)
    rendered_new = _render_tokens(enc_new, tokens_new)

    # The result is shown by an HTML component, so structure and line breaks
    # must be expressed as markup; pre-wrap keeps the input's own whitespace.
    return f"""
<h2>サマリー</h2>
<p>
文字数: {len(text)}<br>
GPT-4 (cl100k_base) tokens: {len(tokens_old)}<br>
GPT-4o (o200k_base) tokens: {len(tokens_new)}, ({reduction} % ダウン)
</p>
<h2>比較</h2>
<h3>GPT-4 ({len(tokens_old)} tokens):</h3>
<div style="white-space: pre-wrap;">{rendered_old}</div>
<h3>GPT-4o ({len(tokens_new)} tokens):</h3>
<div style="white-space: pre-wrap;">{rendered_new}</div>
"""


demo = gr.Interface(
    fn=highlight_differences,
    inputs=gr.Textbox(lines=10, placeholder="テキストを入力してください..."),
    outputs="html",
    title="GPT-4 tokenizer vs GPT-4o tokenizer",
    description="cl100k_base と o200k_base の違いを可視化します。tiktoken を使えばローカルでも試せます。 https://platform.openai.com/tokenizer が近日対応予定です。",
)

if __name__ == "__main__":
    demo.launch()