# Hugging Face Spaces page header (extraction artifact, preserved as comment):
# Spaces: Sleeping / Sleeping
import colorsys
import html
import logging
import random

import gradio as gr

from tokenizer import SimpleBPETokenizer
# Load the trained tokenizer from its serialized JSON vocabulary.
# NOTE(review): assumes 'hindi_tokenizer.json' is resolvable from the current
# working directory at launch time — confirm deployment layout.
tokenizer = SimpleBPETokenizer.load('hindi_tokenizer.json')
# Generate distinct colors for tokens
def generate_colors(n, seed=None):
    """Generate ``n`` visually distinct pastel hex colors.

    Hues are spread evenly around the color wheel; saturation (0.3-0.5) and
    value (0.9-1.0) are jittered so backgrounds stay light enough for dark
    text to remain readable.

    Args:
        n: Number of colors to generate (0 yields an empty list).
        seed: Optional seed for a reproducible palette. When ``None``
            (the default, matching the original behavior) the jitter is
            non-deterministic.

    Returns:
        A list of ``n`` strings in ``#rrggbb`` form.
    """
    rng = random.Random(seed)
    colors = []
    for i in range(n):
        hue = i / n
        saturation = 0.3 + 0.2 * rng.random()  # pastel range: 0.3-0.5
        value = 0.9 + 0.1 * rng.random()       # bright range: 0.9-1.0
        r, g, b = colorsys.hsv_to_rgb(hue, saturation, value)
        colors.append(f"#{int(r * 255):02x}{int(g * 255):02x}{int(b * 255):02x}")
    return colors
# Pre-generate colors for tokens | |
token_colors = generate_colors(tokenizer.vocab_size) | |
def tokenize_text(text: str) -> str:
    """Tokenize input text and return HTML-formatted colored output.

    Each BPE token is wrapped in a colored ``<span>`` (hover shows its id
    and text), and a summary block with token counts and the compression
    ratio is appended.

    Args:
        text: Raw input text; may contain multiple lines.

    Returns:
        An HTML string, or an ``"Error: ..."`` message if tokenization fails.
    """
    try:
        # Split text into lines while preserving empty lines
        lines = text.split('\n')

        html_lines = []
        total_initial_tokens = 0
        total_final_tokens = 0

        for line in lines:
            if not line:  # Handle empty lines as explicit breaks
                html_lines.append('<br>')
                continue

            # Encode line
            token_ids = tokenizer.encode(line)
            total_final_tokens += len(token_ids)

            # Create colored visualization for this line
            line_parts = []
            for token_id in token_ids:  # renamed from `id` (shadowed builtin)
                token_text = tokenizer.itos[token_id]
                color = token_colors[token_id % len(token_colors)]
                # Escape token text so characters like <, >, &, " render
                # literally instead of breaking the generated markup.
                safe_text = html.escape(token_text)
                span = (
                    f'<span style="background-color: {color}; padding: 0 2px; '
                    f'border-radius: 3px; margin: 0 1px;" '
                    f'title="ID: {token_id}, Token: {safe_text}">{safe_text}</span>'
                )
                line_parts.append(span)

            # Add line to output with proper line breaks
            html_lines.append(''.join(line_parts))
            total_initial_tokens += tokenizer.get_initial_tokens_length(line)

        # Join lines with HTML line breaks
        html_output = '<br>'.join(html_lines)

        # Overall compression ratio; guard against empty input (0 tokens)
        compression = total_initial_tokens / total_final_tokens if total_final_tokens > 0 else 0

        # Add summary information
        summary = f"""<div style="margin-top: 10px; padding: 10px; background: #f0f0f0; border-radius: 5px;">
    <b>Summary:</b><br>
    Initial tokens (character-level): {total_initial_tokens}<br>
    Final tokens (after BPE): {total_final_tokens}<br>
    Compression ratio: {compression:.2f}X
    </div>"""

        return html_output + summary
    except Exception as e:
        # logging.exception records the full traceback, unlike error(f"...")
        logging.exception("Error in tokenization")
        return f"Error: {str(e)}"
# Build the Gradio UI: one textbox in, colored HTML out.
EXAMPLE_INPUTS = [
    ["नमस्ते दुनिया"],
    ["मैं एक भारतीय हूं"],
    ["आज का दिन बहुत अच्छा है"],
]

input_box = gr.Textbox(lines=3, label="Enter text to tokenize")
output_html = gr.HTML(label="Tokenized text")

iface = gr.Interface(
    fn=tokenize_text,
    inputs=input_box,
    outputs=output_html,
    title="BPE Tokenizer Visualization",
    description="""
    This demo shows how the BPE tokenizer splits text into tokens.
    Each colored block represents a single token.
    Hover over tokens to see their IDs and full content.
    """,
    examples=EXAMPLE_INPUTS,
    allow_flagging="never"
)

# Launch only when executed as a script (not on import).
if __name__ == "__main__":
    iface.launch()