Commit 9a56158 by logicsame · Parent: 3ced35f

train Bengali tokenization added
config/config.yaml CHANGED
@@ -11,4 +11,10 @@ ban_tokenization:
   source_dir: artifacts/data_ingestion/BanSum.csv
   save_dir: artifacts/ban_tokenization
 
+train_tokenize:
+  root_dir: artifacts/train_tokenization
+  input_file_dir: artifacts/ban_tokenization/combined_text.txt
+  save_file: artifacts/train_tokenization
+
+
 
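These keys are consumed through dot access (`config.train_tokenize.root_dir`) in `ConfigurationManager` below. A minimal sketch of that access pattern, assuming the project's `read_yaml` helper (not shown in this commit) wraps the parsed YAML in a python-box `ConfigBox`:

```python
# Sketch: attribute-style access to config/config.yaml.
# Assumption: read_yaml returns a ConfigBox; this mirrors, but is not,
# the project's own read_yaml helper.
import yaml
from box import ConfigBox

with open('config/config.yaml', encoding='utf-8') as f:
    config = ConfigBox(yaml.safe_load(f))

print(config.train_tokenize.root_dir)        # artifacts/train_tokenization
print(config.train_tokenize.input_file_dir)  # artifacts/ban_tokenization/combined_text.txt
```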
main.py CHANGED
@@ -1,7 +1,7 @@
 from src.benglasummarization.logging import logger
 from src.benglasummarization.pipeline.stage01_data_ingestion import DataIngestionPipeline
 from src.benglasummarization.pipeline.stage_02_prepare_ben_tok import BenTokenizationPreparePipeLine
-
+from src.benglasummarization.pipeline.stage_03_train_ban_token import TrainTokenizePipeLine
 STAGE_NAME = 'Data Ingestion Stage'
 
 try:
@@ -22,4 +22,16 @@ try:
     logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
 except Exception as e:
     logger.exception(e)
-    raise e
+    raise e
+
+STAGE_NAME = 'Training Bengali Tokenization Stage'
+
+try:
+    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
+    train_ban_token = TrainTokenizePipeLine()
+    train_ban_token.main()
+    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
+except Exception as e:
+    logger.exception(e)
+    raise e
+
params.yaml CHANGED
@@ -1,2 +1,7 @@
-output_file: "combined_text.txt"
+pre_tokenize:
+  output_file: "combined_text.txt"
 
+train_tokenize:
+  model_prefix: 'cbengali_tokenizer'
+  model_type: 'unigram'
+  vocab_size: 91902
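For reference, `model_type: 'unigram'` selects SentencePiece's unigram language-model segmentation and `vocab_size: 91902` fixes the subword vocabulary size. A one-off training call using exactly these parameters would look like the sketch below; the pipeline builds the same argument dict from `params.yaml` in `train_bn_token.py`, and the paths here are the defaults from `config.yaml`:

```python
# Sketch: direct SentencePiece training with the values from params.yaml.
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input='artifacts/ban_tokenization/combined_text.txt',
    model_prefix='artifacts/train_tokenization/cbengali_tokenizer',
    model_type='unigram',
    vocab_size=91902,
    character_coverage=1.0,  # keep every script character (important for Bengali)
)
```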
research/prepare_ben_tokenization.ipynb CHANGED
@@ -0,0 +1,201 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.chdir('../')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'e:\\\\bengla text summarization\\\\train-pegasus-model-on-bengali-text-summarization-using-mlops'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%pwd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dataclasses import dataclass\n",
+    "from pathlib import Path\n",
+    "\n",
+    "@dataclass(frozen=True)\n",
+    "class BanTokenizationConfig:\n",
+    "    root_dir: Path\n",
+    "    source_dir: Path\n",
+    "    save_dir: Path\n",
+    "    output_file: str\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.benglasummarization.constants import *\n",
+    "from src.benglasummarization.utils.common import create_directories, read_yaml\n",
+    "\n",
+    "class ConfigurationManager:\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        config_filepath = CONFIG_FILE_PATH,\n",
+    "        params_filepath = PARAMS_FILE_PATH):\n",
+    "\n",
+    "        self.config = read_yaml(config_filepath)\n",
+    "        self.params = read_yaml(params_filepath)\n",
+    "\n",
+    "        create_directories([self.config.artifacts_root])\n",
+    "\n",
+    "    def get_ben_tokenization_config(self) -> BanTokenizationConfig:\n",
+    "        config = self.config.ban_tokenization\n",
+    "        params = self.params.pre_tokenize\n",
+    "        create_directories([config.root_dir])\n",
+    "\n",
+    "        ben_tokenization_config = BanTokenizationConfig(\n",
+    "            root_dir=config.root_dir,\n",
+    "            source_dir=config.save_dir if False else config.source_dir,\n",
+    "            save_dir=config.save_dir,\n",
+    "            output_file=params.output_file\n",
+    "        )\n",
+    "\n",
+    "        return ben_tokenization_config\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from pathlib import Path\n",
+    "from src.benglasummarization.logging import logger\n",
+    "from tqdm.notebook import tqdm\n",
+    "\n",
+    "class BanTokenization:\n",
+    "    def __init__(self, config: BanTokenizationConfig):\n",
+    "        self.config = config\n",
+    "\n",
+    "    def combine_text_columns(self, text_columns=['main']):\n",
+    "        df = pd.read_csv(self.config.source_dir)\n",
+    "\n",
+    "        # Ensure save_dir is a Path object\n",
+    "        save_dir = Path(self.config.save_dir)\n",
+    "\n",
+    "        # Create the directory if it doesn't exist\n",
+    "        save_dir.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "        # Combine save_dir and output_file to form the output path\n",
+    "        output_txt_file = save_dir / self.config.output_file\n",
+    "\n",
+    "        # Write the combined text data to the output file\n",
+    "        with open(output_txt_file, 'w', encoding='utf-8') as f:\n",
+    "            for index, row in tqdm(df.iterrows(), total=len(df)):\n",
+    "                combined_text = ' '.join(str(row[col]) for col in text_columns)\n",
+    "                f.write(combined_text + '\\n')\n",
+    "\n",
+    "        # Log the success of the operation\n",
+    "        logger.info(f\"All text data has been combined into {output_txt_file}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-16 19:09:09,141: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
+      "[2024-10-16 19:09:09,143: INFO: common: yaml file: params.yaml loaded successfully]\n",
+      "[2024-10-16 19:09:09,145: INFO: common: created directory at: artifacts]\n",
+      "[2024-10-16 19:09:09,146: INFO: common: created directory at: artifacts/ban_tokenization]\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "46422977ab65463695c98b98ece484c2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/160000 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-16 19:10:00,660: INFO: 206824922: All text data has been combined into artifacts\\ban_tokenization\\combined_text.txt]\n"
+     ]
+    }
+   ],
+   "source": [
+    "try:\n",
+    "    config = ConfigurationManager()\n",
+    "    prepare_ben_tok_config = config.get_ben_tokenization_config()\n",
+    "    ben_data_tok = BanTokenization(config=prepare_ben_tok_config)\n",
+    "    ben_data_tok.combine_text_columns()\n",
+    "except Exception as e:\n",
+    "    raise e"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
research/train_ban_token.ipynb ADDED
@@ -0,0 +1,191 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.chdir('../')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dataclasses import dataclass\n",
+    "from pathlib import Path\n",
+    "\n",
+    "@dataclass(frozen=True)\n",
+    "class BanTokenTrainConfig:\n",
+    "    root_dir: Path\n",
+    "    input_file_dir: Path\n",
+    "    save_file: Path\n",
+    "    model_prefix: str\n",
+    "    model_type: str\n",
+    "    vocab_size: int"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.benglasummarization.constants import *\n",
+    "from src.benglasummarization.utils.common import create_directories, read_yaml\n",
+    "\n",
+    "class ConfigurationManager:\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        config_filepath = CONFIG_FILE_PATH,\n",
+    "        params_filepath = PARAMS_FILE_PATH):\n",
+    "\n",
+    "        self.config = read_yaml(config_filepath)\n",
+    "        self.params = read_yaml(params_filepath)\n",
+    "\n",
+    "        create_directories([self.config.artifacts_root])\n",
+    "\n",
+    "    def get_train_token_config(self) -> BanTokenTrainConfig:\n",
+    "        config = self.config.train_tokenize\n",
+    "        params = self.params.train_tokenize\n",
+    "        create_directories([config.root_dir])\n",
+    "\n",
+    "        train_token_config = BanTokenTrainConfig(\n",
+    "            root_dir=config.root_dir,\n",
+    "            input_file_dir=config.input_file_dir,\n",
+    "            save_file=config.save_file,\n",
+    "            model_prefix=params.model_prefix,\n",
+    "            model_type=params.model_type,\n",
+    "            vocab_size=params.vocab_size\n",
+    "        )\n",
+    "        return train_token_config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sentencepiece as spm\n",
+    "from src.benglasummarization.logging import logger\n",
+    "from tqdm.notebook import tqdm\n",
+    "import os\n",
+    "\n",
+    "class TrainTokenize:\n",
+    "    def __init__(self, config: BanTokenTrainConfig):\n",
+    "        self.config = config\n",
+    "\n",
+    "    def train_tokenizer(self):\n",
+    "        with open(self.config.input_file_dir, 'r', encoding='utf-8') as f:\n",
+    "            total_lines = sum(1 for line in f)\n",
+    "\n",
+    "        with tqdm(total=total_lines, desc='Preparing Sentence for Training', unit='lines') as pbar:\n",
+    "            with open(self.config.input_file_dir, 'r', encoding='utf-8') as f:\n",
+    "                for _ in f:\n",
+    "                    pbar.update(1)\n",
+    "\n",
+    "        # Ensure the save directory exists\n",
+    "        os.makedirs(os.path.dirname(self.config.save_file), exist_ok=True)\n",
+    "\n",
+    "        # Training Arguments\n",
+    "        train_params = {\n",
+    "            'input': str(self.config.input_file_dir),\n",
+    "            'model_prefix': os.path.join(self.config.save_file, self.config.model_prefix),\n",
+    "            'vocab_size': self.config.vocab_size,\n",
+    "            'model_type': self.config.model_type,\n",
+    "            'character_coverage': 1.0,\n",
+    "            'input_sentence_size': 1000000,\n",
+    "            'shuffle_input_sentence': True\n",
+    "        }\n",
+    "\n",
+    "        spm.SentencePieceTrainer.train(**train_params)\n",
+    "        logger.info(f'Tokenizer model saved to {train_params[\"model_prefix\"]}.model')\n",
+    "        logger.info(f'Tokenizer vocabulary saved to {train_params[\"model_prefix\"]}.vocab')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-16 20:25:26,476: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
+      "[2024-10-16 20:25:26,477: INFO: common: yaml file: params.yaml loaded successfully]\n",
+      "[2024-10-16 20:25:26,478: INFO: common: created directory at: artifacts]\n",
+      "[2024-10-16 20:25:26,480: INFO: common: created directory at: artifacts/train_tokenization]\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "57e6c332ff144237a7683e64bf137c3c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Preparing Sentence for Training:   0%|          | 0/160000 [00:00<?, ?lines/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-16 20:26:03,153: INFO: 489807411: Tokenizer model saved to artifacts/train_tokenization\\cbengali_tokenizer.model]\n",
+      "[2024-10-16 20:26:03,154: INFO: 489807411: Tokenizer vocabulary saved to artifacts/train_tokenization\\cbengali_tokenizer.vocab]\n"
+     ]
+    }
+   ],
+   "source": [
+    "try:\n",
+    "    config = ConfigurationManager()\n",
+    "    train_token_config = config.get_train_token_config()\n",
+    "    train_config = TrainTokenize(config=train_token_config)\n",
+    "    train_config.train_tokenizer()\n",
+    "except Exception as e:\n",
+    "    logger.error(f\"An error occurred: {str(e)}\")\n",
+    "    raise e"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
src/benglasummarization/components/train_bn_token.py ADDED
@@ -0,0 +1,37 @@
+import sentencepiece as spm
+from src.benglasummarization.logging import logger
+from tqdm import tqdm
+import os
+from src.benglasummarization.entity.config_entity import BanTokenTrainConfig
+
+
+class TrainTokenize:
+    def __init__(self, config: BanTokenTrainConfig):
+        self.config = config
+
+    def train_tokenizer(self):
+        # Count the corpus lines once, with a progress bar, so the log
+        # reflects the corpus size before training starts
+        with open(self.config.input_file_dir, 'r', encoding='utf-8') as f:
+            total_lines = sum(1 for _ in tqdm(f, desc='Preparing sentences for training', unit='lines'))
+        logger.info(f'{total_lines} sentences prepared from {self.config.input_file_dir}')
+
+        # Ensure the save directory exists (model_prefix is joined onto it)
+        os.makedirs(self.config.save_file, exist_ok=True)
+
+        # Training arguments
+        train_params = {
+            'input': str(self.config.input_file_dir),
+            'model_prefix': os.path.join(self.config.save_file, self.config.model_prefix),
+            'vocab_size': self.config.vocab_size,
+            'model_type': self.config.model_type,
+            'character_coverage': 1.0,
+            'input_sentence_size': 1000000,
+            'shuffle_input_sentence': True
+        }
+
+        spm.SentencePieceTrainer.train(**train_params)
+        logger.info(f'Tokenizer model saved to {train_params["model_prefix"]}.model')
+        logger.info(f'Tokenizer vocabulary saved to {train_params["model_prefix"]}.vocab')
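Once this stage has run, the artifacts can be sanity-checked by loading the model back. A minimal sketch, assuming the default artifact paths from `config.yaml`; the sample sentence is purely illustrative:

```python
# Sketch: load the trained tokenizer and round-trip one Bengali sentence.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(
    model_file='artifacts/train_tokenization/cbengali_tokenizer.model')

pieces = sp.encode('বাংলা ভাষা দক্ষিণ এশিয়ার একটি ভাষা।', out_type=str)
print(pieces)             # subword pieces produced by the unigram model
print(sp.decode(pieces))  # decoding restores the original sentence
```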
src/benglasummarization/config/configuration.py CHANGED
@@ -2,6 +2,7 @@ from src.benglasummarization.constants import *
 from src.benglasummarization.utils.common import read_yaml, create_directories
 from benglasummarization.entity.config_entity import DataIngestionConfig
 from src.benglasummarization.entity.config_entity import BanTokenizationConfig
+from src.benglasummarization.entity.config_entity import BanTokenTrainConfig
 class ConfigurationManager:
     def __init__(
         self,
@@ -29,7 +30,7 @@ class ConfigurationManager:
 
     def get_ben_tokenization_config(self) -> BanTokenizationConfig:
         config = self.config.ban_tokenization
-        params = self.params
+        params = self.params.pre_tokenize
         create_directories([config.root_dir])
 
         ben_tokenization_config = BanTokenizationConfig(
@@ -39,4 +40,20 @@ class ConfigurationManager:
             output_file= params.output_file
         )
 
-        return ben_tokenization_config
+        return ben_tokenization_config
+
+
+    def get_train_token_config(self) -> BanTokenTrainConfig:
+        config = self.config.train_tokenize
+        params = self.params.train_tokenize
+        create_directories([config.root_dir])
+
+        train_token_config = BanTokenTrainConfig(
+            root_dir=config.root_dir,
+            input_file_dir=config.input_file_dir,
+            save_file=config.save_file,
+            model_prefix=params.model_prefix,
+            model_type=params.model_type,
+            vocab_size=params.vocab_size
+        )
+        return train_token_config
src/benglasummarization/entity/config_entity.py CHANGED
@@ -15,4 +15,13 @@ class BanTokenizationConfig:
     source_dir : Path
     save_dir : Path
     output_file : str
+
+@dataclass(frozen=True)
+class BanTokenTrainConfig:
+    root_dir: Path
+    input_file_dir: Path
+    save_file: Path
+    model_prefix: str
+    model_type: str
+    vocab_size: int
 
src/benglasummarization/pipeline/stage_03_train_ban_token.py ADDED
@@ -0,0 +1,13 @@
+from src.benglasummarization.config.configuration import ConfigurationManager
+from src.benglasummarization.components.train_bn_token import TrainTokenize
+
+
+class TrainTokenizePipeLine:
+    def __init__(self):
+        pass
+
+    def main(self):
+        config = ConfigurationManager()
+        train_ban_tok = config.get_train_token_config()
+        train_tok = TrainTokenize(config=train_ban_tok)
+        train_tok.train_tokenizer()
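`main.py` drives this stage, but it can also be exercised on its own. A hedged sketch of a standalone entry point; this `__main__` guard is not part of the commit:

```python
# Sketch: run the tokenizer-training stage directly, outside main.py.
from src.benglasummarization.pipeline.stage_03_train_ban_token import TrainTokenizePipeLine

if __name__ == '__main__':
    TrainTokenizePipeLine().main()
```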