HAMIM-ML committed on
Commit
a0f1e38
·
1 Parent(s): 7fd4eae

data ingestion added

Browse files
config/config.yaml CHANGED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ artifacts_root: artifacts
2
+
3
+ data_ingestion:
4
+ root_dir: artifacts/data_ingestion
5
+ source_dir: C:\\mlops project\\archive.zip
6
+ local_data_file : artifacts/data_ingestion/archive.zip
7
+ unzip_dir : artifacts/data_ingestion
main.py CHANGED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.imagecolorization.pipeline.stage01_data_ingestion import DataIngestionPipeline
2
+ from src.imagecolorization.logging import logger
3
+
4
# Label used in the start/finish log lines of this stage.
STAGE_NAME = 'Data Ingestion Config'

# Run the data-ingestion stage; any failure is logged with its traceback
# and then propagated so the process exits with an error.
try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    pipeline = DataIngestionPipeline()
    pipeline.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
    logger.exception(e)
    raise
params.yaml CHANGED
@@ -0,0 +1 @@
 
 
1
+ key: val
research/data_ingestion.ipynb ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "os.chdir('../')"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "metadata": {},
17
+ "outputs": [
18
+ {
19
+ "data": {
20
+ "text/plain": [
21
+ "'c:\\\\mlops project\\\\image-colorization-mlops'"
22
+ ]
23
+ },
24
+ "execution_count": 2,
25
+ "metadata": {},
26
+ "output_type": "execute_result"
27
+ }
28
+ ],
29
+ "source": [
30
+ "%pwd"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 3,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "from dataclasses import dataclass\n",
40
+ "from pathlib import Path\n",
41
+ "\n",
42
+ "@dataclass(frozen=True)\n",
43
+ "class DataIngestionConfig:\n",
44
+ " root_dir : Path\n",
45
+ " source_dir : Path\n",
46
+ " local_data_file: Path\n",
47
+ " unzip_dir : Path"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 4,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "from src.imagecolorization.constants import *\n",
57
+ "from src.imagecolorization.utils.common import read_yaml, create_directories\n",
58
+ "\n",
59
+ "class ConfigurationManager:\n",
60
+ " def __init__(\n",
61
+ " self,\n",
62
+ " config_filepath = CONFIG_FILE_PATH,\n",
63
+ " params_filepath = PARAMS_FILE_PATH):\n",
64
+ "\n",
65
+ " self.config = read_yaml(config_filepath)\n",
66
+ " self.params = read_yaml(params_filepath)\n",
67
+ "\n",
68
+ " create_directories([self.config.artifacts_root])\n",
69
+ "\n",
70
+ " \n",
71
+ "\n",
72
+ " def get_data_ingestion_config(self) -> DataIngestionConfig:\n",
73
+ " config = self.config.data_ingestion\n",
74
+ "\n",
75
+ " create_directories([config.root_dir])\n",
76
+ "\n",
77
+ " data_ingestion_config = DataIngestionConfig(\n",
78
+ " root_dir=config.root_dir,\n",
79
+ " source_dir=config.source_dir,\n",
80
+ " local_data_file=config.local_data_file,\n",
81
+ " unzip_dir=config.unzip_dir \n",
82
+ " )\n",
83
+ "\n",
84
+ " return data_ingestion_config\n",
85
+ " \n",
86
+ " \n",
87
+ " "
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": 5,
93
+ "metadata": {},
94
+ "outputs": [],
95
+ "source": [
96
+ "import os\n",
97
+ "import zipfile\n",
98
+ "from src.imagecolorization.logging import logger\n",
99
+ "from tqdm.notebook import tqdm\n",
100
+ "from dataclasses import replace\n",
101
+ "\n",
102
+ "class DataIngestion:\n",
103
+ " def __init__(self, config: DataIngestionConfig):\n",
104
+ " self.config = config\n",
105
+ " \n",
106
+ " def load_file(self):\n",
107
+ " if os.path.exists(self.config.source_dir):\n",
108
+ " self.config = replace(self.config, local_data_file=self.config.source_dir)\n",
109
+ " logger.info(f'File Found at: {self.config.local_data_file}')\n",
110
+ " else:\n",
111
+ " logger.info(f\"File not found at {self.config.source_dir}\")\n",
112
+ " raise FileNotFoundError(f'No file found at {self.config.source_dir}')\n",
113
+ " \n",
114
+ " \n",
115
+ " \n",
116
+ " def extract_zip_file(self):\n",
117
+ " unzip_path = self.config.unzip_dir\n",
118
+ " os.makedirs(unzip_path, exist_ok=True)\n",
119
+ " \n",
120
+ " # open the zip file\n",
121
+ " with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:\n",
122
+ " total_files = len(zip_ref.infolist())\n",
123
+ " \n",
124
+ " for file in tqdm(iterable=zip_ref.infolist(), total=total_files, desc='Extracting files'):\n",
125
+ " zip_ref.extract(member=file, path=unzip_path)\n",
126
+ " \n",
127
+ " logger.info(f'Extacted {self.config.local_data_file} to {unzip_path}')"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": 6,
133
+ "metadata": {},
134
+ "outputs": [
135
+ {
136
+ "name": "stdout",
137
+ "output_type": "stream",
138
+ "text": [
139
+ "[2024-08-18 02:08:07,443: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
140
+ "[2024-08-18 02:08:07,444: INFO: common: yaml file: params.yaml loaded successfully]\n",
141
+ "[2024-08-18 02:08:07,445: INFO: common: created directory at: artifacts]\n",
142
+ "[2024-08-18 02:08:07,446: INFO: common: created directory at: artifacts/data_ingestion]\n",
143
+ "[2024-08-18 02:08:07,446: INFO: 2749353352: File Found at: C:\\\\mlops project\\\\archive.zip]\n"
144
+ ]
145
+ },
146
+ {
147
+ "data": {
148
+ "application/vnd.jupyter.widget-view+json": {
149
+ "model_id": "b91326c533ac4f588a5224910549cd65",
150
+ "version_major": 2,
151
+ "version_minor": 0
152
+ },
153
+ "text/plain": [
154
+ "Extracting files: 0%| | 0/5 [00:00<?, ?it/s]"
155
+ ]
156
+ },
157
+ "metadata": {},
158
+ "output_type": "display_data"
159
+ },
160
+ {
161
+ "name": "stdout",
162
+ "output_type": "stream",
163
+ "text": [
164
+ "[2024-08-18 02:08:34,311: INFO: 2749353352: Extacted C:\\\\mlops project\\\\archive.zip to artifacts/data_ingestion]\n"
165
+ ]
166
+ }
167
+ ],
168
+ "source": [
169
+ "try:\n",
170
+ " config = ConfigurationManager()\n",
171
+ " data_ingestion_config = config.get_data_ingestion_config()\n",
172
+ " data_ingestion = DataIngestion(config=data_ingestion_config)\n",
173
+ " data_ingestion.load_file()\n",
174
+ " data_ingestion.extract_zip_file()\n",
175
+ "except Exception as e:\n",
176
+ " raise e"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": null,
182
+ "metadata": {},
183
+ "outputs": [],
184
+ "source": []
185
+ }
186
+ ],
187
+ "metadata": {
188
+ "kernelspec": {
189
+ "display_name": "Python 3",
190
+ "language": "python",
191
+ "name": "python3"
192
+ },
193
+ "language_info": {
194
+ "codemirror_mode": {
195
+ "name": "ipython",
196
+ "version": 3
197
+ },
198
+ "file_extension": ".py",
199
+ "mimetype": "text/x-python",
200
+ "name": "python",
201
+ "nbconvert_exporter": "python",
202
+ "pygments_lexer": "ipython3",
203
+ "version": "3.11.0"
204
+ }
205
+ },
206
+ "nbformat": 4,
207
+ "nbformat_minor": 2
208
+ }
src/imagecolorization/config/configuration.py CHANGED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.imagecolorization.constants import *
2
+ from src.imagecolorization.utils.common import read_yaml, create_directories
3
+ from src.imagecolorization.entity.config_entity import DataIngestionConfig
4
class ConfigurationManager:
    """Load the project YAML files and build per-stage config objects.

    On construction both the config file and the params file are read with
    ``read_yaml`` and the directory named by ``artifacts_root`` is created.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read the config and params files and create the artifacts root.

        Args:
            config_filepath: path of the main configuration YAML file.
            params_filepath: path of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        The section's ``root_dir`` is created before the config is returned.

        Returns:
            DataIngestionConfig: values copied from the loaded configuration.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_dir=section.source_dir,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )
30
+
31
+
32
+
src/imagecolorization/conponents/__init__.py ADDED
File without changes
src/imagecolorization/conponents/data_ingestion.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import zipfile
3
+ from src.imagecolorization.logging import logger
4
+ from tqdm.notebook import tqdm
5
+ from dataclasses import replace
6
+ from src.imagecolorization.entity.config_entity import DataIngestionConfig
7
+
8
class DataIngestion:
    """Locate a local zip archive and unpack it into the configured directory."""

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings.

        Args:
            config (DataIngestionConfig): paths used by this stage.
        """
        self.config = config

    def load_file(self):
        """Point ``local_data_file`` at the source archive if it exists.

        Raises:
            FileNotFoundError: if ``config.source_dir`` does not exist.
        """
        source = self.config.source_dir
        if not os.path.exists(source):
            logger.info(f"File not found at {source}")
            raise FileNotFoundError(f'No file found at {source}')

        # The archive is used in place (nothing is copied): the config is
        # rebuilt with dataclasses.replace so that local_data_file refers to
        # the source path.
        self.config = replace(self.config, local_data_file=source)
        logger.info(f'File Found at: {self.config.local_data_file}')

    def extract_zip_file(self):
        """Extract every member of ``local_data_file`` into ``unzip_dir``.

        The target directory is created if needed and a progress bar is
        shown while members are extracted one by one.
        """
        # tqdm.auto selects the notebook widget inside Jupyter and the plain
        # console bar elsewhere, so this also works when run from a script.
        from tqdm.auto import tqdm

        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)

        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            # Read the member list once and reuse it for the count.
            members = zip_ref.infolist()

            for member in tqdm(iterable=members, total=len(members), desc='Extracting files'):
                zip_ref.extract(member=member, path=unzip_path)

        logger.info(f'Extacted {self.config.local_data_file} to {unzip_path}')
src/imagecolorization/constants/__init__.py CHANGED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
# Default location of the main configuration file (a relative path, so it is
# resolved against the current working directory when opened).
CONFIG_FILE_PATH = Path('config/config.yaml')
# Default location of the parameters file (also a relative path).
PARAMS_FILE_PATH = Path('params.yaml')
src/imagecolorization/entity/config_entity.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+
4
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of paths used by the data-ingestion stage."""

    # Working directory of the ingestion stage.
    root_dir: Path
    # Location of the source archive.
    source_dir: Path
    # Path of the archive file that gets opened for extraction.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path
src/imagecolorization/logging/__init__.py CHANGED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import logging
4
+
5
# Layout of every log record: timestamp, level, emitting module, message.
logging_str = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"

# Log files live under ./logs; make sure the directory exists before the
# file handler below tries to open the log file.
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_filepath = os.path.join(log_dir, "running_logs.log")

# Send INFO-and-above records both to the log file and to stdout.
logging.basicConfig(
    level=logging.INFO,
    format=logging_str,
    handlers=[
        logging.FileHandler(log_filepath),
        logging.StreamHandler(sys.stdout),
    ],
)

# Shared project logger imported by the other modules.
logger = logging.getLogger("imagecolorizationLogger")
src/imagecolorization/pipeline/stage01_data_ingestion.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.imagecolorization.conponents.data_ingestion import DataIngestion
2
+ from src.imagecolorization.config.configuration import ConfigurationManager
3
+
4
class DataIngestionPipeline:
    """Pipeline stage that wires configuration into the data-ingestion component."""

    def __init__(self):
        pass

    def main(self):
        """Build the ingestion config, then locate and extract the archive."""
        ingestion_config = ConfigurationManager().get_data_ingestion_config()
        ingestion = DataIngestion(config=ingestion_config)
        ingestion.load_file()
        ingestion.extract_zip_file()
src/imagecolorization/utils/common.py CHANGED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from box.exceptions import BoxValueError
3
+ import yaml
4
+ from src.imagecolorization.logging import logger
5
+ from ensure import ensure_annotations
6
+ from box import ConfigBox
7
+ from pathlib import Path
8
+ from typing import Any
9
+ import json
10
+
11
+
12
@ensure_annotations
def read_yaml(path_to_yaml: Path) -> ConfigBox:
    """Read a YAML file and return its content as a ConfigBox.

    Args:
        path_to_yaml (Path): location of the YAML file.

    Raises:
        ValueError: if ConfigBox rejects the parsed content with a
            BoxValueError; this is reported as an empty file.

    Returns:
        ConfigBox: the parsed content, accessible via attributes.
    """
    try:
        with open(path_to_yaml) as yaml_file:
            content = yaml.safe_load(yaml_file)
            logger.info(f"yaml file: {path_to_yaml} loaded successfully")
            return ConfigBox(content)
    except BoxValueError as err:
        # Keep the original error attached as the cause for easier debugging.
        raise ValueError("yaml file is empty") from err
35
+
36
+
37
@ensure_annotations
def create_directories(path_to_directories: list, verbose=True):
    """Create every directory in the given list.

    Directories that already exist are left untouched.

    Args:
        path_to_directories (list): directory paths to create.
        verbose (bool, optional): log each created directory. Defaults to True.
    """
    for directory in path_to_directories:
        os.makedirs(directory, exist_ok=True)
        if not verbose:
            continue
        logger.info(f"created directory at: {directory}")
49
+
50
+
51
@ensure_annotations
def save_json(path: Path, data: dict):
    """Write a dictionary to a JSON file with 4-space indentation.

    Args:
        path (Path): destination JSON file.
        data (dict): content to serialize.
    """
    with path.open('w') as json_file:
        json.dump(data, json_file, indent=4)

    logger.info(f'Json file saved at: {path}')
65
+
66
+
67
@ensure_annotations
def load_json(path: Path) -> ConfigBox:
    """Load a JSON file and return its content as a ConfigBox.

    Args:
        path (Path): path to the JSON file.

    Returns:
        ConfigBox: the parsed data, accessible via attributes instead of
            dictionary keys.
    """
    with open(path, 'r') as f:
        content = json.load(f)

    logger.info(f"Json file loaded successfully from: {path}")
    # Wrap the parsed data; returning the bare ConfigBox class would discard it.
    return ConfigBox(content)
83
+