nikhiljais committed on
Commit
0a3bc47
·
verified ·
1 Parent(s): f4a327e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -0
app.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from tokenizer import SimpleBPETokenizer
3
+ import json
4
+
5
# Restore the trained tokenizer from its saved file.
_TOKENIZER_FILE = 'hindi_tokenizer.json'
tokenizer = SimpleBPETokenizer.load(_TOKENIZER_FILE)
7
+
8
def tokenize_text(text: str) -> str:
    """Tokenize ``text`` and return a human-readable summary.

    The summary lists the input text, the token IDs produced by the
    module-level ``tokenizer``, the number of tokens, the UTF-8 byte
    length of the input, and the bytes-per-token compression ratio.

    Args:
        text: Text to tokenize.

    Returns:
        The formatted report. If anything raises while building it, the
        string ``"Error: <message>"`` is returned instead, so the caller
        receives a message rather than an exception.
    """
    try:
        token_ids = tokenizer.encode(text)
        num_tokens = len(token_ids)

        # Compression is measured against the raw UTF-8 byte length.
        original_len = len(text.encode('utf-8'))

        # With no tokens (e.g. empty input) the ratio is undefined; the
        # unguarded division used to surface as "Error: division by zero".
        if num_tokens:
            ratio = f"{original_len / num_tokens:.2f}X"
        else:
            ratio = "N/A"

        return (
            f"Input text: {text}\n\n"
            f"Token IDs: {token_ids}\n\n"
            f"Number of tokens: {num_tokens}\n"
            f"Original bytes: {original_len}\n"
            f"Compression ratio: {ratio}"
        )
    except Exception as e:
        # UI boundary: report any failure as text instead of raising.
        return f"Error: {str(e)}"
30
+
31
# Text shown under the title of the demo page.
_DESCRIPTION = """
This demo shows a byte-pair encoding (BPE) tokenizer trained on Hindi text.
Enter any text and see how it gets tokenized, along with compression statistics.
"""

# Sample inputs offered by the interface; one single-field row per example.
_EXAMPLES = [
    ["नमस्ते दुनिया"],
    ["मैं एक भारतीय हूं"],
    ["आज का दिन बहुत अच्छा है"],
]

# Input and output are both multi-line text boxes.
_input_box = gr.Textbox(lines=5, label="Enter text to tokenize")
_output_box = gr.Textbox(lines=8, label="Tokenization Results")

# Wire the tokenizing function to the text boxes.
iface = gr.Interface(
    fn=tokenize_text,
    inputs=_input_box,
    outputs=_output_box,
    title="Hindi BPE Tokenizer Demo",
    description=_DESCRIPTION,
    examples=_EXAMPLES,
)

# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()