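"""Gradio Space that compares two Hugging Face tokenizers side by side:
colored token views, token/word counts, and per-token UTF-8 byte dumps."""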
import gradio as gr
import random
from transformers import AutoTokenizer
from huggingface_hub import login, logout
from markupsafe import escape
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from fractions import Fraction

def random_light_color():
    """Returns a random light HSL background color that stays readable with black text."""
    return f"hsl({random.randint(0, 360)}, 100%, 80%)"

def utf8_tokens(tokens):
"""Generates UTF-8 token representations with valid Unicode for each token."""
utf8_representation = []
for token in tokens:
try:
utf8_bytes = token.encode('utf-8')
utf8_hex = " ".join([f"<0x{byte:02X}>" for byte in utf8_bytes])
unicode_token = utf8_bytes.decode('utf-8')
utf8_representation.append(
f'<span style="background-color:{random_light_color()}; color: black;">{escape(unicode_token)} {utf8_hex}</span>'
)
except UnicodeDecodeError:
utf8_representation.append(
f'<span style="background-color:{random_light_color()}; color: brown;">{escape(token)} {utf8_hex}</span>'
)
return " ".join(utf8_representation)
def tokenize_text(tokenizer_name_1, tokenizer_name_2, text, hf_token=None):
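    """Tokenizes `text` with both Hub tokenizers and returns, per tokenizer:
    colored token HTML, a count summary string, and a UTF-8 byte view."""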
    def tokenize_with_model(tokenizer_name):
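        """Loads one tokenizer from the Hub and tokenizes the shared input text."""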
        try:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=hf_token or None)
            tokens = tokenizer.tokenize(text)
            word_count = len(text.split())
            token_count = len(tokens)
            if token_count > 0:
                ratio = Fraction(word_count, token_count)
                ratio_simplified = f"{ratio.numerator}/{ratio.denominator}"
            else:
                ratio_simplified = "N/A"
            colored_tokens = [
                f'<span style="background-color:{random_light_color()}; color: black;">{escape(token)}</span>'
                for token in tokens
            ]
            tokenized_text = " ".join(colored_tokens)
            utf8_representation = utf8_tokens(tokens)
            return tokenized_text, token_count, word_count, ratio_simplified, utf8_representation
        except Exception as e:
            return f"Error loading tokenizer {tokenizer_name}: {str(e)}", 0, 0, "N/A", ""
    if hf_token:
        login(hf_token)
    tokenizer_1_output = tokenize_with_model(tokenizer_name_1)
    tokenizer_2_output = tokenize_with_model(tokenizer_name_2)
    try:
        logout()
    except Exception:
        pass
    return (
        f"<p><strong>Tokenizer 1:</strong><br>{tokenizer_1_output[0]}</p>",
        f"Tokenizer 1 - Total tokens: {tokenizer_1_output[1]}, Total words: {tokenizer_1_output[2]}, Word/Token ratio: {tokenizer_1_output[3]}",
        f"<p>{tokenizer_1_output[4]}</p>",
        f"<p><strong>Tokenizer 2:</strong><br>{tokenizer_2_output[0]}</p>",
        f"Tokenizer 2 - Total tokens: {tokenizer_2_output[1]}, Total words: {tokenizer_2_output[2]}, Word/Token ratio: {tokenizer_2_output[3]}",
        f"<p>{tokenizer_2_output[4]}</p>"
    )

def fill_example_text(example_text):
    """Fills the textbox with the selected example."""
    return example_text

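# Example inputs in English (en), Tamil (ta), and Sinhala (si); the Tamil and
# Sinhala lines translate "the quick brown fox jumps over the lazy dog".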
examples = {
    "Example 1 (en)": "Hugging Face's tokenizers are really cool!",
    "Example 2 (en)": "Gradio makes building UIs so easy and intuitive.",
    "Example 3 (en)": "Machine learning models often require extensive training data.",
    "Example 4 (ta)": "விரைவு பழுப்பு நரி சோம்பேறி நாய் மீது குதிக்கிறது",
    "Example 5 (si)": "ඉක්මන් දුඹුරු නරියා කම්මැලි බල්ලා උඩින් පනියි"
}

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            tokenizer_search_1 = HuggingfaceHubSearch(
                label="Search Huggingface Hub for Tokenizer 1",
                placeholder="Search for Tokenizer 1",
                search_type="model"
            )
        with gr.Column():
            tokenizer_search_2 = HuggingfaceHubSearch(
                label="Search Huggingface Hub for Tokenizer 2",
                placeholder="Search for Tokenizer 2",
                search_type="model"
            )
    example_dropdown = gr.Dropdown(label="Select Example", choices=list(examples.keys()), value="Example 1 (en)")
    input_text = gr.Textbox(label="Input Text", lines=5)
    with gr.Accordion("Hugging Face Token (Optional)", open=False):
        hf_token = gr.Textbox(label="Hugging Face Token", type="password", placeholder="Enter HF token if needed for private tokenizers")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Tokenizer 1 Outputs")
            tokenized_output_1 = gr.HTML(label="Tokenizer 1 - Tokenized Text")
            token_count_label_1 = gr.Label(label="Tokenizer 1 - Token Count and Word Count")
            with gr.Accordion("Tokenizer 1 - UTF-8 Decoded Text", open=False):
                utf8_output_1 = gr.HTML(label="Tokenizer 1 - UTF-8 Decoded Text")
        with gr.Column():
            gr.Markdown("### Tokenizer 2 Outputs")
            tokenized_output_2 = gr.HTML(label="Tokenizer 2 - Tokenized Text")
            token_count_label_2 = gr.Label(label="Tokenizer 2 - Token Count and Word Count")
            with gr.Accordion("Tokenizer 2 - UTF-8 Decoded Text", open=False):
                utf8_output_2 = gr.HTML(label="Tokenizer 2 - UTF-8 Decoded Text")
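    # Wire the events: the dropdown fills the textbox, and any change to the
    # text or either tokenizer selection re-runs the comparison.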
    example_dropdown.change(fn=lambda x: fill_example_text(examples[x]), inputs=example_dropdown, outputs=input_text)
    input_text.change(
        tokenize_text,
        inputs=[tokenizer_search_1, tokenizer_search_2, input_text, hf_token],
        outputs=[tokenized_output_1, token_count_label_1, utf8_output_1, tokenized_output_2, token_count_label_2, utf8_output_2]
    )
    tokenizer_search_1.change(
        tokenize_text,
        inputs=[tokenizer_search_1, tokenizer_search_2, input_text, hf_token],
        outputs=[tokenized_output_1, token_count_label_1, utf8_output_1, tokenized_output_2, token_count_label_2, utf8_output_2]
    )
    tokenizer_search_2.change(
        tokenize_text,
        inputs=[tokenizer_search_1, tokenizer_search_2, input_text, hf_token],
        outputs=[tokenized_output_1, token_count_label_1, utf8_output_1, tokenized_output_2, token_count_label_2, utf8_output_2]
    )
demo.launch()