MohamedRashad committed
Commit 203ba9b · 1 Parent(s): 1af62ac

Add Arabic Tokenizers Leaderboard and Gradio Interface

Files changed (2)
  1. app.py +204 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,204 @@
+ from transformers import AutoTokenizer
+ from tqdm import tqdm
+ import gradio as gr
+ import pandas as pd
+ from datasets import load_dataset
+ import random
+ from pathlib import Path
+
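+ # Tokenizers scored out of the box; more can be submitted through the UI.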
+ initial_list_of_models = [
+     "Xenova/gpt-4o",
+     "NousResearch/Meta-Llama-3-8B",
+     "CohereForAI/c4ai-command-r-v01",
+     "CohereForAI/c4ai-command-r-plus",
+     "core42/jais-13b",
+ ]
+
+ dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
+
+ dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
+ if dataframe_path.exists():
+     df = pd.read_json(dataframe_path, lines=True)
+ else:
+     df = pd.DataFrame(columns=["Models", "Total Number of Tokens", "Vocab Size", "Tokenizer Class"])
+
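+ # Score each tokenizer: the total number of tokens it needs to encode the whole
+ # Arabic corpus (models already present in the leaderboard file are skipped).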
+ for model_name in tqdm(initial_list_of_models):
+     if model_name in df["Models"].values:
+         continue
+     tokenizer = AutoTokenizer.from_pretrained(
+         model_name, use_fast=True, trust_remote_code=True
+     )
+     vocab_size = tokenizer.vocab_size
+     number_of_tokens = sum(len(x) for x in tokenizer(dataset).input_ids)
+     new_row = {
+         "Models": model_name,
+         "Vocab Size": vocab_size,
+         "Total Number of Tokens": number_of_tokens,
+         "Tokenizer Class": tokenizer.__class__.__name__,
+     }
+     df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
+
+ # Sort the leaderboard by the total number of tokens (fewer tokens ranks higher)
+ df = df.sort_values(by="Total Number of Tokens", ascending=True)
+
+ # Save the leaderboard to a JSONL file
+ df.to_json(dataframe_path, lines=True, orient="records")
+
+ # Gradio Functions
+ def refresh():
+     global df
+     df = df.sort_values(by="Total Number of Tokens", ascending=True)
+     return gr.Dataframe(df), gr.BarPlot(df)
+
+ def submit(model_name):
+     global df
+     tokenizer = AutoTokenizer.from_pretrained(
+         model_name, use_fast=True, trust_remote_code=True
+     )
+     vocab_size = tokenizer.vocab_size
+     number_of_tokens = sum(len(x) for x in tokenizer(dataset).input_ids)
+     new_row = {
+         "Models": model_name,
+         "Vocab Size": vocab_size,
+         "Total Number of Tokens": number_of_tokens,
+         "Tokenizer Class": tokenizer.__class__.__name__,
+     }
+     # This handler has no outputs; the new row shows up after the next Refresh.
+     df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
+
+ def generate_distinct_colors(n):
+     """Generate n visually distinct colors in hexadecimal format."""
+     if n > 256**3:
+         raise ValueError("Cannot generate more than 16,777,216 unique colors.")
+
+     # To ensure colors are distinct, calculate an appropriate distance between colors.
+     # The cube root of the total color count divided by the cube root of n gives a
+     # crude initial spacing estimate.
+     spacing = int((256 * 256 * 256) ** (1 / 3) / n ** (1 / 3))
+     max_val = 256 - spacing
+
+     # Set to keep track of used colors
+     used_colors = set()
+
+     # List to store the result colors
+     result = []
+
+     attempts = 0
+     while len(result) < n:
+         # Generate a color with a random start and controlled spacing
+         r = random.randint(0, max_val)
+         g = random.randint(0, max_val)
+         b = random.randint(0, max_val)
+
+         # Scale up by spacing to ensure minimum distance between colors
+         r = min(255, r * spacing)
+         g = min(255, g * spacing)
+         b = min(255, b * spacing)
+
+         # Format the color in hexadecimal
+         color = f"#{r:02X}{g:02X}{b:02X}"
+
+         # Ensure this color hasn't been used
+         if color not in used_colors:
+             used_colors.add(color)
+             result.append(color)
+         else:
+             attempts += 1
+             if attempts > 50:
+                 # Dynamically adjust spacing if stuck
+                 spacing = max(1, spacing - 1)
+                 max_val = 256 - spacing
+                 attempts = 0
+
+     return result
+
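+ # Byte-level BPE tokenizers (GPT-2 style) render a leading space as 'Ġ' and encode
+ # non-ASCII bytes as Latin-1-range characters; this heuristic maps such tokens back
+ # to readable text and falls back to the raw token whenever decoding fails.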
+ def decode_bpe_tokens(tokens):
+     fixed_tokens = []
+     for token in tokens:
+         # Check if the token starts with the special BPE space character 'Ġ'
+         if token.startswith('Ġ'):
+             # Process the rest of the token
+             try:
+                 # Reinterpret the characters as Latin-1 bytes and decode them as UTF-8
+                 fixed_token = ' ' + token[1:].encode('latin-1').decode('utf-8')
+             except (UnicodeDecodeError, UnicodeEncodeError):
+                 fixed_token = token  # Use the original token if decoding fails
+         else:
+             try:
+                 # Same reinterpretation for tokens without the leading-space marker
+                 fixed_token = token.encode('latin-1').decode('utf-8')
+             except (UnicodeDecodeError, UnicodeEncodeError):
+                 fixed_token = token  # Use the original token if decoding fails
+         fixed_tokens.append(fixed_token)
+     return fixed_tokens
+
+ def decode_arabic_tokens(tokens):
+     decoded_tokens = []
+     for token in tokens:
+         decoded_token = token.encode('latin-1', 'backslashreplace').decode('unicode-escape')
+         decoded_tokens.append(decoded_token)
+     return decoded_tokens
+
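+ # Tokenize the input text and give every token its own label so that
+ # gr.HighlightedText can render each token in a distinct color.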
+ def tokenize_text(text, chosen_model):
+     tokenizer = AutoTokenizer.from_pretrained(chosen_model)
+     tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
+     # tokenized_text = decode_arabic_tokens(tokenizer.tokenize(text))
+     random_colors = generate_distinct_colors(len(tokenized_text))
+     print(tokenized_text)
+
+     output = []
+     color_map = {}
+     for idx, token in enumerate(tokenized_text):
+         output.append((token, str(idx)))
+         color_map[str(idx)] = random_colors[idx % len(random_colors)]
+
+     return gr.HighlightedText(output, color_map=color_map)
+
+ leaderboard_description = """The numbers in this leaderboard are based on the total number of tokens in the Arabic
+ dataset [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations).
+ """
+
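+ # UI: a "Leaderboard" tab (table, bar plot, and model submission form) and a
+ # "Try tokenizers" tab for visualising how a chosen model splits Arabic text.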
+ with gr.Blocks() as demo:
+     gr.HTML("<center><h1>Arabic Tokenizers Leaderboard</h1></center>")
+     gr.Markdown("## What is the best tokenizer for Arabic?")
+     gr.Markdown(leaderboard_description)
+     with gr.Tab(label="Leaderboard"):
+         dataframe = gr.Dataframe(df)
+         with gr.Accordion("Barplot", open=False):
+             barplot = gr.BarPlot(
+                 df,
+                 x="Models",
+                 y="Total Number of Tokens",
+                 x_title=" ",
+                 y_title=" ",
+                 width=1000,
+                 height=400,
+                 tooltip=["Vocab Size", "Total Number of Tokens"],
+                 vertical=False,
+                 x_label_angle=30,
+                 caption="Total Number of Tokens",
+             )
+         model_name = gr.Textbox(
+             label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
+         )
+         with gr.Row():
+             refresh_btn = gr.Button(value="Refresh")
+             submit_new_model_btn = gr.Button(value="Submit", variant="primary")
+     with gr.Tab(label="Try tokenizers"):
+         text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
+         dropdown = gr.Dropdown(
+             label="Select a model",
+             choices=df["Models"].tolist(),
+             value=df["Models"].tolist()[0],
+         )
+         submit_text_btn = gr.Button(value="Submit", variant="primary")
+         tokenized_textbox = gr.HighlightedText(label="Tokenized text")
+
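+     # Event wiring: submitting a model appends a leaderboard row (press Refresh to
+     # see it), Refresh re-renders the table and plot, and the second tab's Submit
+     # runs the tokenizer visualisation.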
+     submit_new_model_btn.click(submit, inputs=model_name)
+     refresh_btn.click(refresh, outputs=[dataframe, barplot])
+     submit_text_btn.click(tokenize_text, inputs=[text, dropdown], outputs=[tokenized_textbox])
+
+
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers
+ tqdm
+ gradio
+ pandas
+ datasets