# Hugging Face Spaces page header (extraction artifact, preserved as comment):
# Spaces: Sleeping / Sleeping
import colorsys
import html
import logging
import random

import gradio as gr

from tokenizer import SimpleBPETokenizer
# Load the trained tokenizer from its serialized JSON vocabulary.
# NOTE(review): assumes 'hindi_tokenizer.json' is resolvable from the current
# working directory at launch time — confirm deployment layout.
tokenizer = SimpleBPETokenizer.load('hindi_tokenizer.json')
# Generate distinct colors for tokens
def generate_colors(n, seed=None):
    """Generate ``n`` visually distinct pastel hex colors.

    Hues are spread evenly around the color wheel; saturation (0.3-0.5) and
    value (0.9-1.0) are jittered so backgrounds stay light enough for dark
    text to remain readable.

    Args:
        n: Number of colors to generate (0 yields an empty list).
        seed: Optional seed for a reproducible palette. When ``None``
            (the default, matching the original behavior) the jitter is
            non-deterministic.

    Returns:
        A list of ``n`` strings in ``#rrggbb`` form.
    """
    rng = random.Random(seed)
    colors = []
    for i in range(n):
        hue = i / n
        saturation = 0.3 + 0.2 * rng.random()  # pastel range: 0.3-0.5
        value = 0.9 + 0.1 * rng.random()       # bright range: 0.9-1.0
        r, g, b = colorsys.hsv_to_rgb(hue, saturation, value)
        colors.append(f"#{int(r * 255):02x}{int(g * 255):02x}{int(b * 255):02x}")
    return colors
# Pre-generate colors for tokens | |
token_colors = generate_colors(tokenizer.vocab_size) | |
def tokenize_text(text: str) -> str:
    """Tokenize input text and return HTML-formatted colored output.

    Each BPE token is wrapped in a colored ``<span>`` (hover shows its id
    and text), and a summary block with token counts and the compression
    ratio is appended.

    Args:
        text: Raw input text; may contain multiple lines.

    Returns:
        An HTML string, or an ``"Error: ..."`` message if tokenization fails.
    """
    try:
        # Split text into lines while preserving empty lines
        lines = text.split('\n')

        html_lines = []
        total_initial_tokens = 0
        total_final_tokens = 0

        for line in lines:
            if not line:  # Handle empty lines as explicit breaks
                html_lines.append('<br>')
                continue

            # Encode line
            token_ids = tokenizer.encode(line)
            total_final_tokens += len(token_ids)

            # Create colored visualization for this line
            line_parts = []
            for token_id in token_ids:  # renamed from `id` (shadowed builtin)
                token_text = tokenizer.itos[token_id]
                color = token_colors[token_id % len(token_colors)]
                # Escape token text so characters like <, >, &, " render
                # literally instead of breaking the generated markup.
                safe_text = html.escape(token_text)
                span = (
                    f'<span style="background-color: {color}; padding: 0 2px; '
                    f'border-radius: 3px; margin: 0 1px;" '
                    f'title="ID: {token_id}, Token: {safe_text}">{safe_text}</span>'
                )
                line_parts.append(span)

            # Add line to output with proper line breaks
            html_lines.append(''.join(line_parts))
            total_initial_tokens += tokenizer.get_initial_tokens_length(line)

        # Join lines with HTML line breaks
        html_output = '<br>'.join(html_lines)

        # Overall compression ratio; guard against empty input (0 tokens)
        compression = total_initial_tokens / total_final_tokens if total_final_tokens > 0 else 0

        # Add summary information
        summary = f"""<div style="margin-top: 10px; padding: 10px; background: #f0f0f0; border-radius: 5px;">
    <b>Summary:</b><br>
    Initial tokens (character-level): {total_initial_tokens}<br>
    Final tokens (after BPE): {total_final_tokens}<br>
    Compression ratio: {compression:.2f}X
    </div>"""

        return html_output + summary
    except Exception as e:
        # logging.exception records the full traceback, unlike error(f"...")
        logging.exception("Error in tokenization")
        return f"Error: {str(e)}"
# Build the Gradio UI: one textbox in, colored HTML out.
EXAMPLE_INPUTS = [
    ["नमस्ते दुनिया"],
    ["मैं एक भारतीय हूं"],
    ["आज का दिन बहुत अच्छा है"],
]

input_box = gr.Textbox(lines=3, label="Enter text to tokenize")
output_html = gr.HTML(label="Tokenized text")

iface = gr.Interface(
    fn=tokenize_text,
    inputs=input_box,
    outputs=output_html,
    title="BPE Tokenizer Visualization",
    description="""
    This demo shows how the BPE tokenizer splits text into tokens.
    Each colored block represents a single token.
    Hover over tokens to see their IDs and full content.
    """,
    examples=EXAMPLE_INPUTS,
    allow_flagging="never"
)

# Launch only when executed as a script (not on import).
if __name__ == "__main__":
    iface.launch()