# nikhiljais's picture
# Update app.py
# 442b041 verified
import colorsys
import html
import logging
import random

import gradio as gr

from tokenizer import SimpleBPETokenizer
# Load the trained tokenizer once, at import time, from 'hindi_tokenizer.json'.
# The resulting module-level instance is what tokenize_text uses for encoding,
# ID-to-text lookup (itos) and the character-level token count.
# NOTE(review): the path is relative — presumably resolved against the
# process working directory; confirm against SimpleBPETokenizer.load.
tokenizer = SimpleBPETokenizer.load('hindi_tokenizer.json')
# Generate distinct colors for tokens
def generate_colors(n):
    """Build ``n`` light, pastel-like hex colors whose hues are evenly spaced.

    The hue of entry ``k`` is ``k / n``; saturation is drawn from 0.3-0.5 and
    brightness from 0.9-1.0 using the module-level ``random`` generator, so
    the exact shades vary between runs unless the generator is seeded.

    Returns a list of ``n`` strings of the form ``"#rrggbb"`` (empty when
    ``n`` is zero or negative).
    """

    def _shade(index):
        # Saturation is drawn before brightness; the order matters for
        # reproducibility when the caller seeds ``random``.
        sat = 0.3 + 0.2 * random.random()  # 0.3-0.5
        val = 0.9 + 0.1 * random.random()  # 0.9-1.0
        red, green, blue = colorsys.hsv_to_rgb(index / n, sat, val)
        # Channels are floats in [0, 1]; truncate to 8-bit integers.
        return f"#{int(red * 255):02x}{int(green * 255):02x}{int(blue * 255):02x}"

    return [_shade(index) for index in range(n)]
# Pre-generate one background color per vocabulary entry at import time.
# tokenize_text indexes this list by token ID (modulo its length), so a given
# token keeps the same color for the lifetime of the process.
token_colors = generate_colors(tokenizer.vocab_size)
def _render_token(token_id: int) -> str:
    """Return the colored ``<span>`` markup for a single token ID."""
    # Token text originates from user input, so it must be escaped before it
    # is placed in element content and in the double-quoted title attribute.
    token_text = html.escape(str(tokenizer.itos[token_id]), quote=True)
    color = token_colors[token_id % len(token_colors)]
    return (
        f'<span style="background-color: {color}; padding: 0 2px; '
        f'border-radius: 3px; margin: 0 1px;" '
        f'title="ID: {token_id}, Token: {token_text}">{token_text}</span>'
    )


def tokenize_text(text: str) -> str:
    """Tokenize input text and return HTML-formatted colored output.

    Each input line is encoded separately with the module-level tokenizer and
    rendered as a run of colored ``<span>`` elements; lines are separated by
    ``<br>``. A summary box with the character-level token count, the BPE
    token count and their ratio is appended.

    Args:
        text: Raw user input; may span several lines. ``None`` is treated as
            an empty string.

    Returns:
        An HTML fragment. If anything fails, the exception is logged with its
        traceback and an HTML-escaped ``"Error: ..."`` message is returned
        instead of raising, so the UI always receives a string.
    """
    try:
        html_lines = []
        total_initial_tokens = 0
        total_final_tokens = 0

        for line in (text or "").split("\n"):
            if not line:
                # The '<br>' join below already supplies the line break, so
                # an empty entry renders as exactly one blank line.
                html_lines.append("")
                continue

            token_ids = tokenizer.encode(line)
            total_final_tokens += len(token_ids)
            total_initial_tokens += tokenizer.get_initial_tokens_length(line)
            html_lines.append("".join(_render_token(tid) for tid in token_ids))

        html_output = "<br>".join(html_lines)

        # Overall compression ratio; reported as 0 when nothing was tokenized
        # to avoid dividing by zero.
        compression = (
            total_initial_tokens / total_final_tokens
            if total_final_tokens > 0
            else 0
        )

        summary = (
            '<div style="margin-top: 10px; padding: 10px; '
            'background: #f0f0f0; border-radius: 5px;">\n'
            "<b>Summary:</b><br>\n"
            f"Initial tokens (character-level): {total_initial_tokens}<br>\n"
            f"Final tokens (after BPE): {total_final_tokens}<br>\n"
            f"Compression ratio: {compression:.2f}X\n"
            "</div>"
        )
        return html_output + summary
    except Exception as exc:  # UI boundary: report the failure, never raise.
        logging.exception("Error in tokenization: %s", exc)
        # The message is shown in an HTML component, so escape it as well.
        return f"Error: {html.escape(str(exc))}"
# Create Gradio interface
# A single multi-line text box feeds tokenize_text; the string it returns is
# shown in an HTML output component.
iface = gr.Interface(
fn=tokenize_text,
inputs=gr.Textbox(lines=3, label="Enter text to tokenize"),
outputs=gr.HTML(label="Tokenized text"),
title="BPE Tokenizer Visualization",
description="""
This demo shows how the BPE tokenizer splits text into tokens.
Each colored block represents a single token.
Hover over tokens to see their IDs and full content.
""",
# Sample inputs offered to the user; each inner list holds the value for the
# single text input.
examples=[
["नमस्ते दुनिया"],
["मैं एक भारतीय हूं"],
["आज का दिन बहुत अच्छा है"],
],
# NOTE(review): presumably turns off Gradio's flagging control — confirm the
# parameter name against the installed Gradio version.
allow_flagging="never"
)
# Launch the app
# Only start the interface when this file is run as a script, so importing the
# module does not call launch().
if __name__ == "__main__":
    iface.launch()