logicsame committed on
Commit
3ced35f
·
1 Parent(s): 9050a12

prepare ben token added

Browse files
config/config.yaml CHANGED
@@ -6,3 +6,9 @@ data_ingestion:
6
  local_data_file : artifacts/data_ingestion/BanSum.zip
7
  unzip_dir : artifacts/data_ingestion
8
 
 
 
 
 
 
 
 
6
  local_data_file : artifacts/data_ingestion/BanSum.zip
7
  unzip_dir : artifacts/data_ingestion
8
 
9
+ ban_tokenization:
10
+ root_dir: artifacts/ban_tokenization
11
+ source_dir: artifacts/data_ingestion/BanSum.csv
12
+ save_dir: artifacts/ban_tokenization
13
+
14
+
main.py CHANGED
@@ -1,5 +1,6 @@
1
  from src.benglasummarization.logging import logger
2
  from src.benglasummarization.pipeline.stage01_data_ingestion import DataIngestionPipeline
 
3
 
4
  STAGE_NAME = 'Data Ingestion Stage'
5
 
@@ -8,6 +9,17 @@ try:
8
  data_ingestion = DataIngestionPipeline()
9
  data_ingestion.main()
10
  logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
 
 
 
 
 
 
 
 
 
 
 
11
  except Exception as e:
12
  logger.exception(e)
13
  raise e
 
1
  from src.benglasummarization.logging import logger
2
  from src.benglasummarization.pipeline.stage01_data_ingestion import DataIngestionPipeline
3
+ from src.benglasummarization.pipeline.stage_02_prepare_ben_tok import BenTokenizationPreparePipeLine
4
 
5
  STAGE_NAME = 'Data Ingestion Stage'
6
 
 
9
  data_ingestion = DataIngestionPipeline()
10
  data_ingestion.main()
11
  logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
12
+ except Exception as e:
13
+ logger.exception(e)
14
+ raise e
15
+
16
# Second pipeline stage: build the combined text file for tokenizer preparation.
# The stage name is only used in the log messages below (typo "Tokeniation" fixed).
STAGE_NAME = 'Prepare Ban Tokenization Stage'

try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    Ban_Token = BenTokenizationPreparePipeLine()
    Ban_Token.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
    # Record the full traceback in the project log, then let the failure
    # propagate so the script stops instead of continuing silently.
    logger.exception(e)
    raise e
params.yaml CHANGED
@@ -1 +1,2 @@
1
- key : val
 
 
1
+
2
+ output_file: "combined_text.txt"
research/data_ingestion.ipynb CHANGED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "os.chdir('../')"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "metadata": {},
17
+ "outputs": [
18
+ {
19
+ "data": {
20
+ "text/plain": [
21
+ "'e:\\\\bengla text summarization\\\\train-pegasus-model-on-bengali-text-summarization-using-mlops'"
22
+ ]
23
+ },
24
+ "execution_count": 2,
25
+ "metadata": {},
26
+ "output_type": "execute_result"
27
+ }
28
+ ],
29
+ "source": [
30
+ "%pwd"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 3,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "from dataclasses import dataclass\n",
40
+ "from pathlib import Path\n",
41
+ "\n",
42
+ "@dataclass(frozen=True)\n",
43
+ "class DataIngestionConfig:\n",
44
+ " root_dir : Path\n",
45
+ " source_dir : Path\n",
46
+ " local_data_file : Path\n",
47
+ " unzip_dir : Path\n",
48
+ " \n"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 4,
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": [
57
+ "from src.benglasummarization.constants import *\n",
58
+ "from src.benglasummarization.utils.common import read_yaml, create_directories\n",
59
+ "class ConfigurationManager:\n",
60
+ " def __init__(\n",
61
+ " self, \n",
62
+ " config_filepath = CONFIG_FILE_PATH,\n",
63
+ " params_filepath = PARAMS_FILE_PATH\n",
64
+ " ):\n",
65
+ " self.config = read_yaml(config_filepath)\n",
66
+ " self.params = read_yaml(params_filepath)\n",
67
+ " \n",
68
+ " create_directories([self.config.artifacts_root])\n",
69
+ " \n",
70
+ " def get_data_ingestion_config(self) -> DataIngestionConfig:\n",
71
+ " config = self.config.data_ingestion\n",
72
+ " create_directories([config.root_dir])\n",
73
+ " \n",
74
+ " data_ingestion_config = DataIngestionConfig(\n",
75
+ " root_dir= config.root_dir,\n",
76
+ " source_dir=config.source_dir,\n",
77
+ " local_data_file=config.local_data_file,\n",
78
+ " unzip_dir= config.unzip_dir\n",
79
+ " )\n",
80
+ " \n",
81
+ " return data_ingestion_config"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "execution_count": 10,
87
+ "metadata": {},
88
+ "outputs": [],
89
+ "source": [
90
+ "import os\n",
91
+ "import zipfile\n",
92
+ "from src.benglasummarization.logging import logger\n",
93
+ "from tqdm.notebook import tqdm\n",
94
+ "from dataclasses import replace\n",
95
+ "\n",
96
+ "class DataIngestion:\n",
97
+ " def __init__(self, config : DataIngestionConfig):\n",
98
+ " self.config = config\n",
99
+ " \n",
100
+ " def load_file(self):\n",
101
+ " if os.path.exists(self.config.source_dir):\n",
102
+ " self.config = replace(self.config, local_data_file = self.config.source_dir)\n",
103
+ " logger.info(f'File found at: {self.config.local_data_file}')\n",
104
+ " else:\n",
105
+ " logger.info(f'File not found at: {self.config.source_dir}')\n",
106
+ " raise FileNotFoundError(f'No file found at: {self.config.source_dir}')\n",
107
+ " \n",
108
+ " def extract_zip_file(self):\n",
109
+ " unzip_path = self.config.unzip_dir\n",
110
+ " os.makedirs(unzip_path, exist_ok=True)\n",
111
+ " \n",
112
+ " with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:\n",
113
+ " total_files = len(zip_ref.infolist())\n",
114
+ " for file in tqdm(iterable=zip_ref.infolist(), total=total_files, desc = 'Extracting Files'):\n",
115
+ " zip_ref.extract(member = file, path = unzip_path)\n",
116
+ " \n",
117
+ " logger.info(f\"Extracted {self.config.local_data_file} to {unzip_path}\")\n",
118
+ " \n",
119
+ " \n",
120
+ " \n",
121
+ " \n",
122
+ " \n",
123
+ " "
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": 11,
129
+ "metadata": {},
130
+ "outputs": [
131
+ {
132
+ "name": "stdout",
133
+ "output_type": "stream",
134
+ "text": [
135
+ "[2024-10-16 02:22:48,187: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
136
+ "[2024-10-16 02:22:48,189: INFO: common: yaml file: params.yaml loaded successfully]\n",
137
+ "[2024-10-16 02:22:48,192: INFO: common: created directory at: artifacts]\n",
138
+ "[2024-10-16 02:22:48,196: INFO: common: created directory at: artifacts/data_ingestion]\n",
139
+ "[2024-10-16 02:22:48,198: INFO: 2796563959: File found at: E:\\\\bengla text summarization\\BanSum.zip]\n"
140
+ ]
141
+ },
142
+ {
143
+ "data": {
144
+ "application/vnd.jupyter.widget-view+json": {
145
+ "model_id": "e0cd102bb64748cdb7dfe8d840c14a48",
146
+ "version_major": 2,
147
+ "version_minor": 0
148
+ },
149
+ "text/plain": [
150
+ "Extracting Files: 0%| | 0/1 [00:00<?, ?it/s]"
151
+ ]
152
+ },
153
+ "metadata": {},
154
+ "output_type": "display_data"
155
+ },
156
+ {
157
+ "name": "stdout",
158
+ "output_type": "stream",
159
+ "text": [
160
+ "[2024-10-16 02:23:01,006: INFO: 2796563959: Extracted E:\\\\bengla text summarization\\BanSum.zip to artifacts/data_ingestion]\n"
161
+ ]
162
+ }
163
+ ],
164
+ "source": [
165
+ "try:\n",
166
+ " config = ConfigurationManager()\n",
167
+ " data_ingestion_config = config.get_data_ingestion_config()\n",
168
+ " data_ingestion = DataIngestion(config=data_ingestion_config)\n",
169
+ " data_ingestion.load_file()\n",
170
+ " data_ingestion.extract_zip_file()\n",
171
+ "except Exception as e:\n",
172
+ " raise e"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": null,
178
+ "metadata": {},
179
+ "outputs": [],
180
+ "source": []
181
+ }
182
+ ],
183
+ "metadata": {
184
+ "kernelspec": {
185
+ "display_name": "Python 3",
186
+ "language": "python",
187
+ "name": "python3"
188
+ },
189
+ "language_info": {
190
+ "codemirror_mode": {
191
+ "name": "ipython",
192
+ "version": 3
193
+ },
194
+ "file_extension": ".py",
195
+ "mimetype": "text/x-python",
196
+ "name": "python",
197
+ "nbconvert_exporter": "python",
198
+ "pygments_lexer": "ipython3",
199
+ "version": "3.11.0"
200
+ }
201
+ },
202
+ "nbformat": 4,
203
+ "nbformat_minor": 2
204
+ }
research/prepare_ben_tokenization.ipynb ADDED
File without changes
src/benglasummarization/components/prepare_ben_token.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from pathlib import Path
3
+ from src.benglasummarization.logging import logger
4
+ from tqdm.notebook import tqdm
5
+ from src.benglasummarization.entity.config_entity import BanTokenizationConfig
6
class BanTokenization:
    """Build a plain-text file from selected columns of the source CSV.

    The configuration supplies the CSV location (``source_dir``), the output
    directory (``save_dir``) and the output file name (``output_file``).
    """

    def __init__(self, config: BanTokenizationConfig):
        self.config = config

    def combine_text_columns(self, text_columns=("main",)):
        """Write one line per CSV row, joining ``text_columns`` with a space.

        Args:
            text_columns: Names of the CSV columns to concatenate, in order.
                Defaults to the single column ``"main"``.

        Raises:
            KeyError: If any requested column is absent from the CSV. The
                check runs before the output file is opened, so an existing
                output file is not truncated on this error.
        """
        # Local import: ``tqdm.auto`` picks the notebook widget inside Jupyter
        # and the plain console bar elsewhere, so the stage also works when it
        # is launched from a script.
        from tqdm.auto import tqdm

        columns = list(text_columns)
        df = pd.read_csv(self.config.source_dir)

        missing = [col for col in columns if col not in df.columns]
        if missing:
            raise KeyError(
                f"Columns {missing} not found in {self.config.source_dir}"
            )

        # ``save_dir`` may arrive as a plain string from the YAML config.
        save_dir = Path(self.config.save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)
        output_txt_file = save_dir / self.config.output_file

        with open(output_txt_file, 'w', encoding='utf-8') as f:
            for _, row in tqdm(df[columns].iterrows(), total=len(df)):
                # Missing cells become empty strings rather than the literal
                # text "nan", which would otherwise pollute the corpus.
                combined_text = ' '.join(
                    '' if pd.isna(value) else str(value) for value in row
                )
                f.write(combined_text + '\n')

        logger.info(f"All text data has been combined into {output_txt_file}")
src/benglasummarization/config/configuration.py CHANGED
@@ -1,6 +1,7 @@
1
  from src.benglasummarization.constants import *
2
  from src.benglasummarization.utils.common import read_yaml, create_directories
3
  from benglasummarization.entity.config_entity import DataIngestionConfig
 
4
  class ConfigurationManager:
5
  def __init__(
6
  self,
@@ -23,4 +24,19 @@ class ConfigurationManager:
23
  unzip_dir= config.unzip_dir
24
  )
25
 
26
- return data_ingestion_config
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from src.benglasummarization.constants import *
2
  from src.benglasummarization.utils.common import read_yaml, create_directories
3
  from benglasummarization.entity.config_entity import DataIngestionConfig
4
+ from src.benglasummarization.entity.config_entity import BanTokenizationConfig
5
  class ConfigurationManager:
6
  def __init__(
7
  self,
 
24
  unzip_dir= config.unzip_dir
25
  )
26
 
27
+ return data_ingestion_config
28
+
29
+
30
def get_ben_tokenization_config(self) -> BanTokenizationConfig:
    """Assemble the configuration object for the tokenization-preparation stage.

    Reads the ``ban_tokenization`` section of the loaded config and
    ``output_file`` from the loaded params, and creates the stage's
    ``root_dir`` before returning.

    Returns:
        A ``BanTokenizationConfig`` populated from those two sources.
    """
    section = self.config.ban_tokenization
    settings = self.params

    # Make sure the stage's root directory exists before handing out paths.
    create_directories([section.root_dir])

    return BanTokenizationConfig(
        root_dir=section.root_dir,
        source_dir=section.source_dir,
        save_dir=section.save_dir,
        output_file=settings.output_file,
    )
src/benglasummarization/entity/config_entity.py CHANGED
@@ -7,4 +7,12 @@ class DataIngestionConfig:
7
  source_dir : Path
8
  local_data_file : Path
9
  unzip_dir : Path
 
 
 
 
 
 
 
 
10
 
 
7
  source_dir : Path
8
  local_data_file : Path
9
  unzip_dir : Path
10
+
11
+
12
@dataclass(frozen=True)
class BanTokenizationConfig:
    """Immutable settings for the tokenization-preparation stage."""

    # Root directory of the stage.
    root_dir: Path
    # Path of the input data (config.yaml points it at a CSV file).
    source_dir: Path
    # Directory that receives the generated output.
    save_dir: Path
    # File name of the generated output.
    output_file: str
18
 
src/benglasummarization/pipeline/stage01_data_ingestion.py CHANGED
@@ -1,6 +1,5 @@
1
  from src.benglasummarization.components.data_ingestion import DataIngestion
2
  from src.benglasummarization.config.configuration import ConfigurationManager
3
- from src.benglasummarization.config.configuration import ConfigurationManager
4
 
5
  class DataIngestionPipeline:
6
  def __init__(self):
 
1
  from src.benglasummarization.components.data_ingestion import DataIngestion
2
  from src.benglasummarization.config.configuration import ConfigurationManager
 
3
 
4
  class DataIngestionPipeline:
5
  def __init__(self):
src/benglasummarization/pipeline/stage_02_prepare_ben_tok.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.benglasummarization.components.prepare_ben_token import BanTokenization
2
+ from src.benglasummarization.config.configuration import ConfigurationManager
3
+
4
+
5
class BenTokenizationPreparePipeLine:
    """Pipeline stage that runs the text-combination step end to end."""

    def __init__(self):
        pass

    def main(self):
        """Load the stage configuration and run ``combine_text_columns``."""
        manager = ConfigurationManager()
        stage_config = manager.get_ben_tokenization_config()
        tokenizer_prep = BanTokenization(config=stage_config)
        tokenizer_prep.combine_text_columns()