MohamedRashad committed
Commit 203ba9b · 1 Parent(s): 1af62ac
Add Arabic Tokenizers Leaderboard and Gradio Interface
- app.py +204 -0
- requirements.txt +5 -0
app.py
ADDED
@@ -0,0 +1,204 @@
from transformers import AutoTokenizer
from tqdm import tqdm
import gradio as gr
import pandas as pd
from datasets import load_dataset
import random
from pathlib import Path

initial_list_of_models = [
    "Xenova/gpt-4o",
    "NousResearch/Meta-Llama-3-8B",
    "CohereForAI/c4ai-command-r-v01",
    "CohereForAI/c4ai-command-r-plus",
    "core42/jais-13b",
]

dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]

dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
if dataframe_path.exists():
    # pandas has no read_jsonl; JSON-lines files are read with read_json(lines=True)
    df = pd.read_json(dataframe_path, lines=True)
else:
    df = pd.DataFrame(columns=["Models", "Total Number of Tokens", "Vocab Size", "Tokenizer Class"])

for model_name in tqdm(initial_list_of_models):
    if model_name in df["Models"].values:
        continue
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    # Total token count over the whole Arabic dataset for this tokenizer
    number_of_tokens = sum(len(x) for x in tokenizer(dataset).input_ids)
    # DataFrame.append was removed in pandas 2.x; use pd.concat instead
    df = pd.concat(
        [
            df,
            pd.DataFrame(
                [
                    {
                        "Models": model_name,
                        "Vocab Size": vocab_size,
                        "Total Number of Tokens": number_of_tokens,
                        "Tokenizer Class": tokenizer.__class__.__name__,
                    }
                ]
            ),
        ],
        ignore_index=True,
    )

# Sort the dataframe by the number of tokens (fewer tokens ranks higher)
df = df.sort_values(by="Total Number of Tokens", ascending=True)

# Save the dataframe to a JSON-lines file
df.to_json(dataframe_path, lines=True, orient="records")

# Gradio Functions
def refresh():
    global df
    df = df.sort_values(by="Total Number of Tokens", ascending=True)
    return gr.Dataframe(df), gr.BarPlot(df)

def submit(model_name):
    global df
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    number_of_tokens = sum(len(x) for x in tokenizer(dataset).input_ids)
    df = pd.concat(
        [
            df,
            pd.DataFrame(
                [
                    {
                        "Models": model_name,
                        "Vocab Size": vocab_size,
                        "Total Number of Tokens": number_of_tokens,
                        "Tokenizer Class": tokenizer.__class__.__name__,
                    }
                ]
            ),
        ],
        ignore_index=True,
    )

def generate_distinct_colors(n):
    """Generate n visually distinct colors in hexadecimal format."""
    if n > 256**3:
        raise ValueError("Cannot generate more than 16,777,216 unique colors.")
    if n == 0:
        return []

    # To ensure colors are distinct, calculate an appropriate distance between colors.
    # The cube root of 256**3 (total colors) divided by the cube root of n
    # gives a crude initial spacing estimate.
    spacing = max(1, int((256 * 256 * 256) ** (1 / 3) / n ** (1 / 3)))

    # Set to keep track of used colors
    used_colors = set()

    # List to store the result colors
    result = []

    attempts = 0
    while len(result) < n:
        # Sample a random grid index per channel, then scale by the spacing
        # so that any two sampled colors differ by at least `spacing` per channel
        steps = max(1, 256 // spacing)
        r = min(255, random.randint(0, steps - 1) * spacing)
        g = min(255, random.randint(0, steps - 1) * spacing)
        b = min(255, random.randint(0, steps - 1) * spacing)

        # Format the color in hexadecimal
        color = f"#{r:02X}{g:02X}{b:02X}"

        # Ensure this color hasn't been used
        if color not in used_colors:
            used_colors.add(color)
            result.append(color)
        else:
            attempts += 1
            if attempts > 50:
                # Dynamically shrink the spacing if stuck
                spacing = max(1, spacing - 1)
                attempts = 0

    return result

def decode_bpe_tokens(tokens):
    fixed_tokens = []
    for token in tokens:
        # Check if the token starts with the special BPE space character 'Ġ'
        if token.startswith('Ġ'):
            try:
                # Byte-level BPE renders raw UTF-8 bytes as Latin-1 characters;
                # re-encode as Latin-1 and decode as UTF-8 to recover the text
                fixed_token = ' ' + token[1:].encode('latin-1').decode('utf-8')
            except (UnicodeEncodeError, UnicodeDecodeError):
                fixed_token = token  # Use the original token if decoding fails
        else:
            try:
                fixed_token = token.encode('latin-1').decode('utf-8')
            except (UnicodeEncodeError, UnicodeDecodeError):
                fixed_token = token  # Use the original token if decoding fails
        fixed_tokens.append(fixed_token)
    return fixed_tokens

def decode_arabic_tokens(tokens):
    decoded_tokens = []
    for token in tokens:
        decoded_token = token.encode('latin-1', 'backslashreplace').decode('unicode-escape')
        decoded_tokens.append(decoded_token)
    return decoded_tokens

def tokenize_text(text, chosen_model):
    tokenizer = AutoTokenizer.from_pretrained(chosen_model)
    tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
    # tokenized_text = decode_arabic_tokens(tokenizer.tokenize(text))
    random_colors = generate_distinct_colors(len(tokenized_text))
    print(tokenized_text)

    output = []
    color_map = {}
    for idx, token in enumerate(tokenized_text):
        output.append((token, str(idx)))
        # Key the color map by the same label used above (str(idx), not idx + 1)
        color_map[str(idx)] = random_colors[idx % len(random_colors)]

    # color_map is a keyword-only argument of gr.HighlightedText
    return gr.HighlightedText(output, color_map=color_map)

leaderboard_description = """The numbers in this leaderboard are the total number of tokens each tokenizer produces for the Arabic
dataset [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations); fewer tokens means a more efficient tokenizer for Arabic.
"""

with gr.Blocks() as demo:
    gr.HTML("<center><h1>Arabic Tokenizers Leaderboard</h1></center>")
    gr.Markdown("## What is the best tokenizer for Arabic?")
    gr.Markdown(leaderboard_description)
    with gr.Tab(label="Leaderboard"):
        dataframe = gr.Dataframe(df)
        with gr.Accordion("Barplot", open=False):
            barplot = gr.BarPlot(
                df,
                x="Models",
                y="Total Number of Tokens",
                x_title=" ",
                y_title=" ",
                width=1000,
                height=400,
                tooltip=["Vocab Size", "Total Number of Tokens"],
                vertical=False,
                x_label_angle=30,
                caption="Total Number of Tokens",
            )
        model_name = gr.Textbox(
            label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
        )
        with gr.Row():
            refresh_btn = gr.Button(value="Refresh")
            submit_new_model_btn = gr.Button(value="Submit", variant="primary")
    with gr.Tab(label="Try tokenizers"):
        text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
        dropdown = gr.Dropdown(
            label="Select a model",
            choices=df["Models"].tolist(),
            value=df["Models"].tolist()[0],
        )
        submit_text_btn = gr.Button(value="Submit", variant="primary")
        tokenized_textbox = gr.HighlightedText(label="Tokenized text")

    submit_new_model_btn.click(submit, inputs=model_name)
    refresh_btn.click(refresh, outputs=[dataframe, barplot])
    submit_text_btn.click(tokenize_text, inputs=[text, dropdown], outputs=[tokenized_textbox])


demo.launch()
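
For context on the decode_bpe_tokens step above: byte-level BPE tokenizers (GPT-2-style, as used by several models on this leaderboard) render the raw UTF-8 bytes of Arabic text as Latin-1-range characters, so tokens print as mojibake such as "Ø§". The sketch below is not part of app.py; it uses an illustrative token string and assumes such a tokenizer, showing the Latin-1 round-trip the function relies on:

# Sketch (illustrative, assuming a GPT-2-style byte-level BPE vocabulary):
# "Ø§" is how the Arabic letter alif ("ا", UTF-8 bytes 0xD8 0xA7)
# surfaces at the token level.
token = "Ø§"
raw_bytes = token.encode("latin-1")  # b"\xd8\xa7": the original UTF-8 bytes
print(raw_bytes.decode("utf-8"))     # prints the readable letter "ا"
# Tokens containing characters above U+00FF raise UnicodeEncodeError on the
# encode step, which is why decode_bpe_tokens falls back to the raw token.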
requirements.txt
ADDED
@@ -0,0 +1,5 @@
transformers
tqdm
gradio
pandas
datasets
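
With these two files in place, the app can be run locally in the usual Gradio way (an assumption about the workflow, not stated in the commit itself):

pip install -r requirements.txt
python app.py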