MohamedRashad committed
Commit 203ba9b · 1 Parent(s): 1af62ac

Add Arabic Tokenizers Leaderboard and Gradio Interface

Files changed (2)
  1. app.py +204 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,204 @@
+ from transformers import AutoTokenizer
+ from tqdm import tqdm
+ import gradio as gr
+ import pandas as pd
+ from datasets import load_dataset
+ import random
+ from pathlib import Path
+
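+ # Tokenizers scored out of the box; more can be submitted through the UI.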
+ initial_list_of_models = [
+     "Xenova/gpt-4o",
+     "NousResearch/Meta-Llama-3-8B",
+     "CohereForAI/c4ai-command-r-v01",
+     "CohereForAI/c4ai-command-r-plus",
+     "core42/jais-13b",
+ ]
+
+ dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
+
+ dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
+ if dataframe_path.exists():
+     df = pd.read_json(dataframe_path, lines=True)
+ else:
+     df = pd.DataFrame(columns=["Models", "Total Number of Tokens", "Vocab Size", "Tokenizer Class"])
+
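+ # Score each tokenizer: the total number of tokens it needs to encode the whole
+ # Arabic corpus (models already present in the leaderboard file are skipped).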
+ for model_name in tqdm(initial_list_of_models):
+     if model_name in df["Models"].values:
+         continue
+     tokenizer = AutoTokenizer.from_pretrained(
+         model_name, use_fast=True, trust_remote_code=True
+     )
+     vocab_size = tokenizer.vocab_size
+     number_of_tokens = sum(len(x) for x in tokenizer(dataset).input_ids)
+     new_row = {
+         "Models": model_name,
+         "Vocab Size": vocab_size,
+         "Total Number of Tokens": number_of_tokens,
+         "Tokenizer Class": tokenizer.__class__.__name__,
+     }
+     df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
+
+ # Sort the leaderboard by the total number of tokens (fewer tokens ranks higher)
+ df = df.sort_values(by="Total Number of Tokens", ascending=True)
+
+ # Save the leaderboard to a JSONL file
+ df.to_json(dataframe_path, lines=True, orient="records")
+
+ # Gradio Functions
+ def refresh():
+     global df
+     df = df.sort_values(by="Total Number of Tokens", ascending=True)
+     return gr.Dataframe(df), gr.BarPlot(df)
+
+ def submit(model_name):
+     global df
+     tokenizer = AutoTokenizer.from_pretrained(
+         model_name, use_fast=True, trust_remote_code=True
+     )
+     vocab_size = tokenizer.vocab_size
+     number_of_tokens = sum(len(x) for x in tokenizer(dataset).input_ids)
+     new_row = {
+         "Models": model_name,
+         "Vocab Size": vocab_size,
+         "Total Number of Tokens": number_of_tokens,
+         "Tokenizer Class": tokenizer.__class__.__name__,
+     }
+     # This handler has no outputs; the new row shows up after the next Refresh.
+     df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
+
+ def generate_distinct_colors(n):
+     """Generate n visually distinct colors in hexadecimal format."""
+     if n > 256**3:
+         raise ValueError("Cannot generate more than 16,777,216 unique colors.")
+
+     # To ensure colors are distinct, calculate an appropriate distance between colors.
+     # The cube root of the total color count divided by the cube root of n gives a
+     # crude initial spacing estimate.
+     spacing = int((256 * 256 * 256) ** (1 / 3) / n ** (1 / 3))
+     max_val = 256 - spacing
+
+     # Set to keep track of used colors
+     used_colors = set()
+
+     # List to store the result colors
+     result = []
+
+     attempts = 0
+     while len(result) < n:
+         # Generate a color with a random start and controlled spacing
+         r = random.randint(0, max_val)
+         g = random.randint(0, max_val)
+         b = random.randint(0, max_val)
+
+         # Scale up by spacing to ensure minimum distance between colors
+         r = min(255, r * spacing)
+         g = min(255, g * spacing)
+         b = min(255, b * spacing)
+
+         # Format the color in hexadecimal
+         color = f"#{r:02X}{g:02X}{b:02X}"
+
+         # Ensure this color hasn't been used
+         if color not in used_colors:
+             used_colors.add(color)
+             result.append(color)
+         else:
+             attempts += 1
+             if attempts > 50:
+                 # Dynamically adjust spacing if stuck
+                 spacing = max(1, spacing - 1)
+                 max_val = 256 - spacing
+                 attempts = 0
+
+     return result
+
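+ # Byte-level BPE tokenizers (GPT-2 style) render a leading space as 'Ġ' and encode
+ # non-ASCII bytes as Latin-1-range characters; this heuristic maps such tokens back
+ # to readable text and falls back to the raw token whenever decoding fails.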
+ def decode_bpe_tokens(tokens):
+     fixed_tokens = []
+     for token in tokens:
+         # Check if the token starts with the special BPE space character 'Ġ'
+         if token.startswith('Ġ'):
+             # Process the rest of the token
+             try:
+                 # Reinterpret the characters as Latin-1 bytes and decode them as UTF-8
+                 fixed_token = ' ' + token[1:].encode('latin-1').decode('utf-8')
+             except (UnicodeDecodeError, UnicodeEncodeError):
+                 fixed_token = token  # Use the original token if decoding fails
+         else:
+             try:
+                 # Same reinterpretation for tokens without the leading-space marker
+                 fixed_token = token.encode('latin-1').decode('utf-8')
+             except (UnicodeDecodeError, UnicodeEncodeError):
+                 fixed_token = token  # Use the original token if decoding fails
+         fixed_tokens.append(fixed_token)
+     return fixed_tokens
+
+ def decode_arabic_tokens(tokens):
+     decoded_tokens = []
+     for token in tokens:
+         decoded_token = token.encode('latin-1', 'backslashreplace').decode('unicode-escape')
+         decoded_tokens.append(decoded_token)
+     return decoded_tokens
+
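+ # Tokenize the input text and give every token its own label so that
+ # gr.HighlightedText can render each token in a distinct color.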
+ def tokenize_text(text, chosen_model):
+     tokenizer = AutoTokenizer.from_pretrained(chosen_model)
+     tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
+     # tokenized_text = decode_arabic_tokens(tokenizer.tokenize(text))
+     random_colors = generate_distinct_colors(len(tokenized_text))
+     print(tokenized_text)
+
+     output = []
+     color_map = {}
+     for idx, token in enumerate(tokenized_text):
+         output.append((token, str(idx)))
+         color_map[str(idx)] = random_colors[idx % len(random_colors)]
+
+     return gr.HighlightedText(output, color_map=color_map)
+
+ leaderboard_description = """The numbers in this leaderboard are based on the total number of tokens in the Arabic
+ dataset [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations).
+ """
+
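+ # UI: a "Leaderboard" tab (table, bar plot, and model submission form) and a
+ # "Try tokenizers" tab for visualising how a chosen model splits Arabic text.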
+ with gr.Blocks() as demo:
+     gr.HTML("<center><h1>Arabic Tokenizers Leaderboard</h1></center>")
+     gr.Markdown("## What is the best tokenizer for Arabic?")
+     gr.Markdown(leaderboard_description)
+     with gr.Tab(label="Leaderboard"):
+         dataframe = gr.Dataframe(df)
+         with gr.Accordion("Barplot", open=False):
+             barplot = gr.BarPlot(
+                 df,
+                 x="Models",
+                 y="Total Number of Tokens",
+                 x_title=" ",
+                 y_title=" ",
+                 width=1000,
+                 height=400,
+                 tooltip=["Vocab Size", "Total Number of Tokens"],
+                 vertical=False,
+                 x_label_angle=30,
+                 caption="Total Number of Tokens",
+             )
+         model_name = gr.Textbox(
+             label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
+         )
+         with gr.Row():
+             refresh_btn = gr.Button(value="Refresh")
+             submit_new_model_btn = gr.Button(value="Submit", variant="primary")
+     with gr.Tab(label="Try tokenizers"):
+         text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
+         dropdown = gr.Dropdown(
+             label="Select a model",
+             choices=df["Models"].tolist(),
+             value=df["Models"].tolist()[0],
+         )
+         submit_text_btn = gr.Button(value="Submit", variant="primary")
+         tokenized_textbox = gr.HighlightedText(label="Tokenized text")
+
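+     # Event wiring: submitting a model appends a leaderboard row (press Refresh to
+     # see it), Refresh re-renders the table and plot, and the second tab's Submit
+     # runs the tokenizer visualisation.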
+     submit_new_model_btn.click(submit, inputs=model_name)
+     refresh_btn.click(refresh, outputs=[dataframe, barplot])
+     submit_text_btn.click(tokenize_text, inputs=[text, dropdown], outputs=[tokenized_textbox])
+
+
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers
+ tqdm
+ gradio
+ pandas
+ datasets