Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr

from tokenizer import SimpleBPETokenizer

import json
# NOTE(review): `json` is not used in this file's visible code — confirm it is
# needed (e.g. by a future feature) or remove at the file level.

# Load the trained tokenizer
# NOTE(review): loads at import time from the current working directory; the
# app fails to start if 'hindi_tokenizer.json' is missing — confirm the
# deployment ships this file next to app.py.
tokenizer = SimpleBPETokenizer.load('hindi_tokenizer.json')
|
7 |
+
|
8 |
+
def tokenize_text(text: str) -> str:
    """Tokenize *text* with the loaded BPE tokenizer and report statistics.

    Args:
        text: Raw input string (the tokenizer was trained on Hindi, but any
            text is accepted).

    Returns:
        A human-readable report containing the token IDs, token count, UTF-8
        byte length, and compression ratio, or an ``"Error: ..."`` message if
        tokenization fails.
    """
    try:
        # Encode text into token IDs.
        token_ids = tokenizer.encode(text)

        # Length of the raw input measured in UTF-8 bytes.
        original_len = len(text.encode('utf-8'))

        # Bytes-per-token compression ratio. Guard against empty input:
        # encoding "" yields no tokens and the naive division would raise
        # ZeroDivisionError (previously surfaced as "Error: division by zero").
        compression = original_len / len(token_ids) if token_ids else 0.0

        # Assemble the report shown in the output textbox.
        output = f"Input text: {text}\n\n"
        output += f"Token IDs: {token_ids}\n\n"
        output += f"Number of tokens: {len(token_ids)}\n"
        output += f"Original bytes: {original_len}\n"
        output += f"Compression ratio: {compression:.2f}X"

        return output
    except Exception as e:
        # UI boundary: surface any failure as text instead of crashing the app.
        return f"Error: {str(e)}"
|
30 |
+
|
31 |
+
# ---- Gradio UI wiring ------------------------------------------------------
# Canned Hindi phrases offered as one-click examples in the demo.
_examples = [
    ["नमस्ते दुनिया"],
    ["मैं एक भारतीय हूं"],
    ["आज का दिन बहुत अच्छा है"],
]

# One multiline text box in, one multiline text box out.
iface = gr.Interface(
    fn=tokenize_text,
    inputs=gr.Textbox(lines=5, label="Enter text to tokenize"),
    outputs=gr.Textbox(lines=8, label="Tokenization Results"),
    title="Hindi BPE Tokenizer Demo",
    description="""
    This demo shows a byte-pair encoding (BPE) tokenizer trained on Hindi text.
    Enter any text and see how it gets tokenized, along with compression statistics.
    """,
    examples=_examples,
)

# Start the local Gradio server when run as a script.
if __name__ == "__main__":
    iface.launch()