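"""Gradio app that compares two Hugging Face tokenizers side by side.

For a given input text and two tokenizers chosen from the Hugging Face Hub, the app
shows each tokenizer's color-coded tokens, token and word counts, the word/token
ratio, and a UTF-8 byte breakdown of every token.
"""
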
import gradio as gr
import random
from transformers import AutoTokenizer
from huggingface_hub import login, logout
from markupsafe import escape
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from fractions import Fraction

def random_light_color():
    """Returns a random light HSL color, used as a token background behind dark text."""
    return f"hsl({random.randint(0, 360)}, 100%, 80%)"

def utf8_tokens(tokens):
    """Generates an HTML view pairing each token with its UTF-8 byte sequence."""
    utf8_representation = []
    for token in tokens:
        # Compute the byte representation up front so it is available in both branches below.
        utf8_bytes = token.encode('utf-8')
        utf8_hex = " ".join([f"<0x{byte:02X}>" for byte in utf8_bytes])
        try:
            unicode_token = utf8_bytes.decode('utf-8')
            utf8_representation.append(
                f'<span style="background-color:{random_light_color()}; color: black;">{escape(unicode_token)} {utf8_hex}</span>'
            )
        except UnicodeDecodeError:
            # Fall back to the raw token text when the bytes do not decode as UTF-8.
            utf8_representation.append(
                f'<span style="background-color:{random_light_color()}; color: brown;">{escape(token)} {utf8_hex}</span>'
            )
    return " ".join(utf8_representation)

def tokenize_text(tokenizer_name_1, tokenizer_name_2, text, hf_token=None):
    """Tokenizes the input text with both tokenizers and returns HTML and count outputs."""
    def tokenize_with_model(tokenizer_name):
        try:
            # `token` replaces the deprecated `use_auth_token` argument; None when no token is given.
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=hf_token or None)
            tokens = tokenizer.tokenize(text)
            word_count = len(text.split())
            token_count = len(tokens)
            if token_count > 0:
                ratio = Fraction(word_count, token_count)
                ratio_simplified = f"{ratio.numerator}/{ratio.denominator}"
            else:
                ratio_simplified = "N/A"
            colored_tokens = [
                f'<span style="background-color:{random_light_color()}; color: black;">{escape(token)}</span>' for token in tokens
            ]
            tokenized_text = " ".join(colored_tokens)
            utf8_representation = utf8_tokens(tokens)
            return tokenized_text, token_count, word_count, ratio_simplified, utf8_representation
        except Exception as e:
            return f"Error loading tokenizer {tokenizer_name}: {str(e)}", 0, 0, "N/A", ""

    if hf_token:
        login(hf_token)

    tokenizer_1_output = tokenize_with_model(tokenizer_name_1)
    tokenizer_2_output = tokenize_with_model(tokenizer_name_2)

    try:
        logout()
    except Exception:
        # logout() raises when no token was logged in; this is safe to ignore.
        pass

    return (
        f"<p><strong>Tokenizer 1:</strong><br>{tokenizer_1_output[0]}</p>", 
        f"Tokenizer 1 - Total tokens: {tokenizer_1_output[1]}, Total words: {tokenizer_1_output[2]}, Word/Token ratio: {tokenizer_1_output[3]}",
        f"<p>{tokenizer_1_output[4]}</p>",
        f"<p><strong>Tokenizer 2:</strong><br>{tokenizer_2_output[0]}</p>",
        f"Tokenizer 2 - Total tokens: {tokenizer_2_output[1]}, Total words: {tokenizer_2_output[2]}, Word/Token ratio: {tokenizer_2_output[3]}",
        f"<p>{tokenizer_2_output[4]}</p>"
    )

def fill_example_text(example_text):
    """Fills the textbox with the selected example."""
    return example_text

examples = {
    "Example 1 (en)": "Hugging Face's tokenizers are really cool!",
    "Example 2 (en)": "Gradio makes building UIs so easy and intuitive.",
    "Example 3 (en)": "Machine learning models often require extensive training data.",
    "Example 4 (ta)": "விரைவு பழுப்பு நரி சோம்பேறி நாய் மீது குதிக்கிறது",
    "Example 5 (si)": "ඉක්මන් දුඹුරු නරියා කම්මැලි බල්ලා උඩින් පනියි"
}
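
# UI layout: two Hub search boxes for the tokenizers, an example selector, the shared
# input text, an optional HF token field, and side-by-side outputs for each tokenizer.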

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            tokenizer_search_1 = HuggingfaceHubSearch(
                label="Search Huggingface Hub for Tokenizer 1",
                placeholder="Search for Tokenizer 1",
                search_type="model"
            )
        with gr.Column():
            tokenizer_search_2 = HuggingfaceHubSearch(
                label="Search Huggingface Hub for Tokenizer 2",
                placeholder="Search for Tokenizer 2",
                search_type="model"
            )
    
    example_dropdown = gr.Dropdown(label="Select Example", choices=list(examples.keys()), value="Example 1 (en)")
    input_text = gr.Textbox(label="Input Text", lines=5)
    
    with gr.Accordion("Hugging Face Token (Optional)", open=False):
        hf_token = gr.Textbox(label="Hugging Face Token", placeholder="Enter HF token if needed for private tokenizers")
    
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Tokenizer 1 Outputs")
            tokenized_output_1 = gr.HTML(label="Tokenizer 1 - Tokenized Text")
            token_count_label_1 = gr.Label(label="Tokenizer 1 - Token Count and Word Count")
            with gr.Accordion("Tokenizer 1 - UTF-8 Decoded Text", open=False):
                utf8_output_1 = gr.HTML(label="Tokenizer 1 - UTF-8 Decoded Text")
        
        with gr.Column():
            gr.Markdown("### Tokenizer 2 Outputs")
            tokenized_output_2 = gr.HTML(label="Tokenizer 2 - Tokenized Text")
            token_count_label_2 = gr.Label(label="Tokenizer 2 - Token Count and Word Count")
            with gr.Accordion("Tokenizer 2 - UTF-8 Decoded Text", open=False):
                utf8_output_2 = gr.HTML(label="Tokenizer 2 - UTF-8 Decoded Text")
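
    # Event wiring: selecting an example fills the input box; editing the text or
    # changing either tokenizer re-runs tokenization for both models.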
    
    example_dropdown.change(fn=lambda x: fill_example_text(examples[x]), inputs=example_dropdown, outputs=input_text)
    
    input_text.change(tokenize_text, 
                      inputs=[tokenizer_search_1, tokenizer_search_2, input_text, hf_token], 
                      outputs=[tokenized_output_1, token_count_label_1, utf8_output_1, tokenized_output_2, token_count_label_2, utf8_output_2])
    
    tokenizer_search_1.change(tokenize_text, 
                              inputs=[tokenizer_search_1, tokenizer_search_2, input_text, hf_token], 
                              outputs=[tokenized_output_1, token_count_label_1, utf8_output_1, tokenized_output_2, token_count_label_2, utf8_output_2])

    tokenizer_search_2.change(tokenize_text, 
                              inputs=[tokenizer_search_1, tokenizer_search_2, input_text, hf_token], 
                              outputs=[tokenized_output_1, token_count_label_1, utf8_output_1, tokenized_output_2, token_count_label_2, utf8_output_2])

demo.launch()