Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr

from tokenizer import SimpleBPETokenizer

import json
# NOTE(review): `json` is not used in this file's visible code — confirm it is
# needed (e.g. by a future feature) or remove at the file level.

# Load the trained tokenizer
# NOTE(review): loads at import time from the current working directory; the
# app fails to start if 'hindi_tokenizer.json' is missing — confirm the
# deployment ships this file next to app.py.
tokenizer = SimpleBPETokenizer.load('hindi_tokenizer.json')
|
7 |
+
|
8 |
+
def tokenize_text(text: str) -> str:
    """Tokenize *text* with the loaded BPE tokenizer and report statistics.

    Args:
        text: Raw input string (the tokenizer was trained on Hindi, but any
            text is accepted).

    Returns:
        A human-readable report containing the token IDs, token count, UTF-8
        byte length, and compression ratio, or an ``"Error: ..."`` message if
        tokenization fails.
    """
    try:
        # Encode text into token IDs.
        token_ids = tokenizer.encode(text)

        # Length of the raw input measured in UTF-8 bytes.
        original_len = len(text.encode('utf-8'))

        # Bytes-per-token compression ratio. Guard against empty input:
        # encoding "" yields no tokens and the naive division would raise
        # ZeroDivisionError (previously surfaced as "Error: division by zero").
        compression = original_len / len(token_ids) if token_ids else 0.0

        # Assemble the report shown in the output textbox.
        output = f"Input text: {text}\n\n"
        output += f"Token IDs: {token_ids}\n\n"
        output += f"Number of tokens: {len(token_ids)}\n"
        output += f"Original bytes: {original_len}\n"
        output += f"Compression ratio: {compression:.2f}X"

        return output
    except Exception as e:
        # UI boundary: surface any failure as text instead of crashing the app.
        return f"Error: {str(e)}"
|
30 |
+
|
31 |
+
# ---- Gradio UI wiring ------------------------------------------------------
# Canned Hindi phrases offered as one-click examples in the demo.
_examples = [
    ["नमस्ते दुनिया"],
    ["मैं एक भारतीय हूं"],
    ["आज का दिन बहुत अच्छा है"],
]

# One multiline text box in, one multiline text box out.
iface = gr.Interface(
    fn=tokenize_text,
    inputs=gr.Textbox(lines=5, label="Enter text to tokenize"),
    outputs=gr.Textbox(lines=8, label="Tokenization Results"),
    title="Hindi BPE Tokenizer Demo",
    description="""
    This demo shows a byte-pair encoding (BPE) tokenizer trained on Hindi text.
    Enter any text and see how it gets tokenized, along with compression statistics.
    """,
    examples=_examples,
)

# Start the local Gradio server when run as a script.
if __name__ == "__main__":
    iface.launch()