import json

import torch
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer, normalizers, pre_tokenizers
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tqdm import tqdm


class AzerbaijaniTokenizer:
    def __init__(self, vocab_size=50000):
        # Byte-pair encoding model with an explicit unknown token.
        self.tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

        # Note: NFD + StripAccents folds diacritics (ç→c, ş→s, ö→o, ü→u, ğ→g),
        # which shrinks the alphabet but collapses distinct Azerbaijani letters.
        self.tokenizer.normalizer = normalizers.Sequence([
            normalizers.NFD(),
            normalizers.Lowercase(),
            normalizers.StripAccents(),
        ])
        self.tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
            pre_tokenizers.WhitespaceSplit(),
            pre_tokenizers.Punctuation(),
        ])

        self.trainer = BpeTrainer(
            vocab_size=vocab_size,
            special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
            min_frequency=2,
        )

    def train(self, texts):
        """Train the tokenizer on the given texts."""
        print("Training tokenizer...")
        self.tokenizer.train_from_iterator(texts, trainer=self.trainer)

    def save(self, path):
        """Save the tokenizer to a file."""
        self.tokenizer.save(path)

    def load(self, path):
        """Load the tokenizer from a file."""
        self.tokenizer = Tokenizer.from_file(path)

    def get_vocab_size(self):
        return self.tokenizer.get_vocab_size()
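
# Usage sketch: once trained (or reloaded from "az_tokenizer.json"), the wrapped
# `tokenizers.Tokenizer` does the actual encoding/decoding. The sentence below is
# only an illustration.
#
#   tok = AzerbaijaniTokenizer()
#   tok.load("az_tokenizer.json")
#   ids = tok.tokenizer.encode("Azərbaycan dili").ids
#   print(tok.tokenizer.decode(ids))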


class WikiTextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        print("Tokenizing texts...")
        self.examples = []

        for text in tqdm(texts):
            tokens = self.tokenizer.encode(text).ids
            if not tokens:
                continue

            # Slide a window of max_length tokens with 50% overlap. Texts shorter
            # than max_length yield a single example padded with [PAD] (id 0).
            last_start = max(len(tokens) - max_length, 0)
            for i in range(0, last_start + 1, max_length // 2):
                chunk = tokens[i:i + max_length]
                if len(chunk) < max_length:
                    chunk = chunk + [0] * (max_length - len(chunk))
                self.examples.append(chunk)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        # Next-token prediction pairs: inputs are tokens[:-1], targets tokens[1:].
        tokens = self.examples[idx]
        return torch.tensor(tokens[:-1]), torch.tensor(tokens[1:])
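
# Window arithmetic (illustration): with max_length=512 and a stride of 256, a
# 1,000-token article produces windows starting at tokens 0 and 256, while a
# 300-token article produces a single window padded with 212 [PAD] tokens.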


def prepare_data_and_tokenizer():
    print("Loading Wikipedia data...")
    with open('az_wiki_data.json', 'r', encoding='utf-8') as f:
        wiki_data = json.load(f)

    texts = [page['text'] for page in wiki_data.values()]

    # Train a fresh BPE tokenizer on the full corpus and save it for reuse.
    tokenizer = AzerbaijaniTokenizer(vocab_size=50000)
    tokenizer.train(texts)
    tokenizer.save("az_tokenizer.json")
    print(f"Tokenizer vocabulary size: {tokenizer.get_vocab_size()}")

    # Build sliding-window language-modeling examples from the same texts.
    dataset = WikiTextDataset(texts, tokenizer.tokenizer)

    # 90/10 train/validation split.
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size]
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=16,
        shuffle=True,
        num_workers=4,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=16,
        shuffle=False,
        num_workers=4,
    )

    print(f"Total sequences: {len(dataset)}")
    print(f"Training sequences: {len(train_dataset)}")
    print(f"Validation sequences: {len(val_dataset)}")

    return tokenizer, train_loader, val_loader


if __name__ == "__main__":
    tokenizer, train_loader, val_loader = prepare_data_and_tokenizer()
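
    # Optional sanity check (not part of the original flow): pull one batch to
    # confirm tensor shapes; each full batch should be [16, 511] for both the
    # inputs and the targets (sequence length is max_length - 1).
    inputs, targets = next(iter(train_loader))
    print(f"Batch shapes: {inputs.shape}, {targets.shape}")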