{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "os.chdir('../')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from dataclasses import dataclass\n", "from pathlib import Path\n", "\n", "@dataclass(frozen=True)\n", "class BanTokenTrainConfig:\n", " root_dir : Path\n", " input_file_dir : Path\n", " save_file : Path\n", " model_prefix : str\n", " model_type : str\n", " vocab_size : int" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "from src.benglasummarization.constants import *\n", "from src.benglasummarization.utils.common import create_directories, read_yaml\n", "\n", "class ConfigurationManager:\n", " def __init__(\n", " self,\n", " config_filepath = CONFIG_FILE_PATH,\n", " params_filepath = PARAMS_FILE_PATH):\n", "\n", " self.config = read_yaml(config_filepath)\n", " self.params = read_yaml(params_filepath)\n", "\n", " create_directories([self.config.artifacts_root])\n", "\n", " def get_train_token_config(self) -> BanTokenTrainConfig:\n", " config = self.config.train_tokenize\n", " params = self.params.train_tokenize\n", " create_directories([config.root_dir])\n", " \n", " train_token_config = BanTokenTrainConfig(\n", " root_dir= config.root_dir,\n", " input_file_dir= config.input_file_dir,\n", " save_file= config.save_file,\n", " model_prefix= params.model_prefix,\n", " model_type= params.model_type,\n", " vocab_size= params.vocab_size\n", " )\n", " return train_token_config" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "import sentencepiece as spm\n", "from src.benglasummarization.logging import logger\n", "from tqdm.notebook import tqdm\n", "import os\n", "\n", "class TrainTokenize:\n", " def __init__(self, config: BanTokenTrainConfig):\n", " self.config = config\n", " \n", " def train_tokenizer(self):\n", " with open(self.config.input_file_dir, 'r', encoding='utf-8') as f:\n", " total_lines = sum(1 for line in f)\n", "\n", " with tqdm(total=total_lines, desc='Preparing Sentence for Training', unit='lines') as pbar:\n", " with open(self.config.input_file_dir, 'r', encoding='utf-8') as f:\n", " for _ in f:\n", " pbar.update(1)\n", " \n", " # Ensure the save directory exists\n", " os.makedirs(os.path.dirname(self.config.save_file), exist_ok=True)\n", " \n", " # Training Arguments\n", " train_params = {\n", " 'input': str(self.config.input_file_dir),\n", " 'model_prefix': os.path.join(self.config.save_file, self.config.model_prefix),\n", " 'vocab_size': self.config.vocab_size,\n", " 'model_type': self.config.model_type,\n", " 'character_coverage': 1.0,\n", " 'input_sentence_size': 1000000,\n", " 'shuffle_input_sentence': True\n", " }\n", " \n", " spm.SentencePieceTrainer.train(**train_params)\n", " logger.info(f'Tokenizer model saved to {train_params[\"model_prefix\"]}.model')\n", " logger.info(f'Tokenizer vocabulary saved to {train_params[\"model_prefix\"]}.vocab')\n", " \n", " " ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-16 20:25:26,476: INFO: common: yaml file: config\\config.yaml loaded successfully]\n", "[2024-10-16 20:25:26,477: INFO: common: yaml file: params.yaml loaded successfully]\n", "[2024-10-16 20:25:26,478: INFO: common: created directory at: artifacts]\n", "[2024-10-16 20:25:26,480: INFO: common: created directory at: artifacts/train_tokenization]\n" ] }, { 
"data": { "application/vnd.jupyter.widget-view+json": { "model_id": "57e6c332ff144237a7683e64bf137c3c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Preparing Sentence for Training: 0%| | 0/160000 [00:00