nikhiljais committed on
Create tokenizer.py

tokenizer.py ADDED (+199 -0)
@@ -0,0 +1,199 @@
import regex as re
from collections import Counter
from typing import List, Dict, Tuple, Set
import json
from tqdm import tqdm
import logging
import numpy as np


class SimpleBPETokenizer:
    def __init__(self, vocab_size: int = 5000):
        self.vocab_size = vocab_size
        self.merges = {}  # (int, int) -> int
        self.vocab = set(range(256))  # Initial vocab is byte values 0-255

    def _text_to_bytes(self, text: str) -> List[int]:
        """Convert text to list of byte values"""
        return list(text.encode('utf-8'))

    def _get_stats(self, ids: List[int]) -> Dict[Tuple[int, int], int]:
        """Count frequency of adjacent pairs"""
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def _merge(self, ids: List[int], pair: Tuple[int, int], idx: int) -> List[int]:
        """Merge all occurrences of pair into new token idx"""
        newids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
                newids.append(idx)
                i += 2
            else:
                newids.append(ids[i])
                i += 1
        return newids
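
    # Worked example (illustrative, not part of the training flow): for ids = [1, 2, 3, 1, 2],
    # _get_stats returns {(1, 2): 2, (2, 3): 1, (3, 1): 1}, and
    # _merge(ids, (1, 2), 256) rewrites the sequence to [256, 3, 256].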

    def fit(self, texts: List[str]):
        """Train tokenizer using byte-level BPE"""
        # Convert all texts to byte sequences
        logging.info("Converting texts to bytes...")
        all_ids = []
        for text in tqdm(texts, desc="Processing texts"):
            all_ids.extend(self._text_to_bytes(text))

        # Calculate number of merges needed
        num_merges = self.vocab_size - 256  # 256 initial byte tokens

        # Perform merges
        next_id = 256
        with tqdm(total=num_merges, desc="BPE merges") as pbar:
            for i in range(num_merges):
                # Get pair frequencies
                stats = self._get_stats(all_ids)
                if not stats:
                    break

                # Find most frequent pair
                pair = max(stats.items(), key=lambda x: x[1])[0]

                # Perform merge
                all_ids = self._merge(all_ids, pair, next_id)
                self.merges[pair] = next_id
                self.vocab.add(next_id)

                # Log progress
                if i % 100 == 0:
                    logging.info(f"merging {pair} into new token {next_id}")
                    compression = len(self._text_to_bytes(''.join(texts))) / len(all_ids)
                    logging.info(f"Current compression ratio: {compression:.2f}X")

                next_id += 1
                pbar.update(1)

        # Calculate final ratio
        original_len = sum(len(text.encode('utf-8')) for text in texts)
        compression = original_len / len(all_ids)
        logging.info(f"Final compression ratio: {compression:.2f}X")

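    # For the default vocab_size=5000, fit() performs 5000 - 256 = 4744 merges;
    # each new id (256, 257, ...) stands for the currently most frequent adjacent pair.
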
    def encode(self, text: str) -> List[int]:
        """Encode text using learned merges"""
        ids = self._text_to_bytes(text)

        # Apply merges in order
        for pair, new_id in self.merges.items():
            ids = self._merge(ids, pair, new_id)

        return ids

    def decode(self, ids: List[int]) -> str:
        """Decode token ids back to text"""
        bytes_list = []
        # Expand merged tokens back into raw byte values before UTF-8 decoding
        stack = list(reversed(ids))
        while stack:
            id = stack.pop()
            if id < 256:
                bytes_list.append(id)
            else:
                for pair, merge_id in self.merges.items():
                    if merge_id == id:
                        # Re-push the pair so it is expanded in the original order
                        stack.append(pair[1])
                        stack.append(pair[0])
                        break

        return bytes(bytes_list).decode('utf-8')

    def calculate_compression_ratio(self, texts: List[str]) -> float:
        """Calculate compression ratio for multiple texts"""
        total_original = 0
        total_merged = 0

        for text in texts:
            original_tokens = self._text_to_bytes(text)
            merged_tokens = self.encode(text)
            total_original += len(original_tokens)
            total_merged += len(merged_tokens)

        return total_original / total_merged if total_merged > 0 else 0.0

    def save(self, path: str):
        """Save tokenizer to a JSON file"""
        data = {
            'vocab_size': self.vocab_size,
            'merges': {f"{k[0]},{k[1]}": v for k, v in self.merges.items()},  # Convert tuples to string
            'vocab': list(self.vocab)
        }
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        logging.info(f"Tokenizer saved to {path}")

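    # Resulting file layout (illustrative sketch; the merge values shown are made up):
    # {
    #   "vocab_size": 5000,
    #   "merges": {"224,164": 256, "256,164": 257, ...},
    #   "vocab": [0, 1, 2, ..., 256, 257, ...]
    # }
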
    @classmethod
    def load(cls, path: str) -> 'SimpleBPETokenizer':
        """Load tokenizer from a JSON file"""
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        tokenizer = cls(vocab_size=data['vocab_size'])
        tokenizer.vocab = set(data['vocab'])
        # Convert string keys back to tuples
        tokenizer.merges = {tuple(map(int, k.split(','))): v
                            for k, v in data['merges'].items()}
        return tokenizer

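# Quick usage sketch (illustrative; kept as comments so the script below is unchanged,
# and the corpus and variable names here are made up):
#
#     demo_corpus = ["नमस्ते दुनिया", "यह एक परीक्षण है"]
#     demo_tok = SimpleBPETokenizer(vocab_size=300)
#     demo_tok.fit(demo_corpus)
#     ids = demo_tok.encode("नमस्ते")
#     assert demo_tok.decode(ids) == "नमस्ते"  # byte-level BPE round-trips losslessly
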
import gzip
import io
import re

# Path to your .gz file
file_path = '/home/nikhil/m2m_train/NMT_DETAILS_AUG_24/ass10/data/hi.txt.gz'

# Read the corpus line by line and strip trailing whitespace
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    text = f.readlines()
text = [l.strip() for l in text]

print(len(text))


import random
texts = random.sample(text, 1000)
print(len(texts))

import re, time
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Set
import json
import regex as re
import logging
from tqdm import tqdm
import unicodedata

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('tokenizer_training.log'),
        logging.StreamHandler()
    ]
)


start_time = time.time()
sam = texts

# Initialize and train tokenizer
tokenizer = SimpleBPETokenizer(vocab_size=5000)


tokenizer.fit(sam)
logging.info(f"Total Training time: {time.time() - start_time:.2f} seconds")


start_time = time.time()

# Calculate compression ratio
final_ratio = tokenizer.calculate_compression_ratio(sam)
print(final_ratio)
tokenizer.save('hindi_tokenizer.json')

logging.info(f"Total Ratio Calculation time: {time.time() - start_time:.2f} seconds")
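
# To reuse the trained tokenizer later (illustrative sketch; 'hindi_tokenizer.json'
# is the file written above, the sample sentence is made up):
#
#     loaded = SimpleBPETokenizer.load('hindi_tokenizer.json')
#     sample = "मुझे हिंदी पसंद है"
#     ids = loaded.encode(sample)
#     print(len(sample.encode('utf-8')), len(ids))  # raw bytes vs. BPE tokens
#     print(loaded.decode(ids) == sample)           # True: round-trip is lossless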