{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Step up from the notebook folder to the project root so that the `src`\n",
    "# package is importable. The guard keeps this cell idempotent: re-running it\n",
    "# (or starting the kernel at the root) no longer climbs one level too far.\n",
    "if not os.path.isdir('src'):\n",
    "    os.chdir('../')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'e:\\\\bengla text summarization\\\\train-pegasus-model-on-bengali-text-summarization-using-mlops'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataclasses import dataclass\n",
    "from pathlib import Path\n",
    "\n",
    "\n",
    "@dataclass(frozen=True)\n",
    "class DataIngestionConfig:\n",
    "    \"\"\"Immutable settings for the data-ingestion stage.\"\"\"\n",
    "\n",
    "    root_dir: Path         # stage directory, created by ConfigurationManager\n",
    "    source_dir: Path       # path whose existence DataIngestion.load_file checks\n",
    "    local_data_file: Path  # archive opened by DataIngestion.extract_zip_file\n",
    "    unzip_dir: Path        # directory the archive members are extracted into"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explicit names instead of `import *`, so it is clear where they come from.\n",
    "from src.benglasummarization.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH\n",
    "from src.benglasummarization.utils.common import read_yaml, create_directories\n",
    "\n",
    "\n",
    "class ConfigurationManager:\n",
    "    \"\"\"Loads the YAML configuration and builds per-stage config objects.\"\"\"\n",
    "\n",
    "    def __init__(\n",
    "        self,\n",
    "        config_filepath=CONFIG_FILE_PATH,\n",
    "        params_filepath=PARAMS_FILE_PATH,\n",
    "    ):\n",
    "        \"\"\"Read both YAML files and create the artifacts root directory.\n",
    "\n",
    "        Args:\n",
    "            config_filepath: Location of the main configuration YAML.\n",
    "            params_filepath: Location of the parameters YAML.\n",
    "        \"\"\"\n",
    "        self.config = read_yaml(config_filepath)\n",
    "        self.params = read_yaml(params_filepath)\n",
    "\n",
    "        create_directories([self.config.artifacts_root])\n",
    "\n",
    "    def get_data_ingestion_config(self) -> DataIngestionConfig:\n",
    "        \"\"\"Build the ingestion settings from the `data_ingestion` section.\n",
    "\n",
    "        The stage's root directory is created as a side effect.\n",
    "\n",
    "        Returns:\n",
    "            A DataIngestionConfig whose fields are wrapped in `Path`, so the\n",
    "            values agree with the annotations on the dataclass.\n",
    "        \"\"\"\n",
    "        config = self.config.data_ingestion\n",
    "        create_directories([config.root_dir])\n",
    "\n",
    "        return DataIngestionConfig(\n",
    "            root_dir=Path(config.root_dir),\n",
    "            source_dir=Path(config.source_dir),\n",
    "            local_data_file=Path(config.local_data_file),\n",
    "            unzip_dir=Path(config.unzip_dir),\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import zipfile\n",
    "from dataclasses import replace\n",
    "\n",
    "# tqdm.auto picks the widget bar inside Jupyter and a text bar elsewhere.\n",
    "from tqdm.auto import tqdm\n",
    "\n",
    "from src.benglasummarization.logging import logger\n",
    "\n",
    "\n",
"class DataIngestion:\n",
    "    \"\"\"Locates the source archive and unpacks it into the unzip directory.\"\"\"\n",
    "\n",
    "    def __init__(self, config: DataIngestionConfig):\n",
    "        \"\"\"Store the ingestion settings.\n",
    "\n",
    "        Args:\n",
    "            config: Paths read by `load_file` and `extract_zip_file`.\n",
    "        \"\"\"\n",
    "        self.config = config\n",
    "\n",
    "    def load_file(self):\n",
    "        \"\"\"Point `local_data_file` at the source path if that path exists.\n",
    "\n",
    "        An updated copy of the config is built with `dataclasses.replace`\n",
    "        and rebound to `self.config`.\n",
    "\n",
    "        Raises:\n",
    "            FileNotFoundError: If `config.source_dir` does not exist.\n",
    "        \"\"\"\n",
    "        source_path = self.config.source_dir\n",
    "        if not os.path.exists(source_path):\n",
    "            # A missing source is a failure, so report it at ERROR level.\n",
    "            logger.error(f'File not found at: {source_path}')\n",
    "            raise FileNotFoundError(f'No file found at: {source_path}')\n",
    "\n",
    "        self.config = replace(self.config, local_data_file=source_path)\n",
    "        logger.info(f'File found at: {self.config.local_data_file}')\n",
    "\n",
    "    def extract_zip_file(self):\n",
    "        \"\"\"Extract every member of `local_data_file` into `unzip_dir`.\n",
    "\n",
    "        The destination directory is created when missing, and a progress\n",
    "        bar is shown while the members are extracted one at a time.\n",
    "        \"\"\"\n",
    "        unzip_path = self.config.unzip_dir\n",
    "        os.makedirs(unzip_path, exist_ok=True)\n",
    "\n",
    "        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:\n",
    "            # Read the member list once; it feeds both the bar total and the loop.\n",
    "            members = zip_ref.infolist()\n",
    "            for member in tqdm(iterable=members, total=len(members), desc='Extracting Files'):\n",
    "                zip_ref.extract(member=member, path=unzip_path)\n",
    "\n",
    "        logger.info(f\"Extracted {self.config.local_data_file} to {unzip_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-16 02:22:48,187: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
      "[2024-10-16 02:22:48,189: INFO: common: yaml file: params.yaml loaded successfully]\n",
      "[2024-10-16 02:22:48,192: INFO: common: created directory at: artifacts]\n",
      "[2024-10-16 02:22:48,196: INFO: common: created directory at: artifacts/data_ingestion]\n",
      "[2024-10-16 02:22:48,198: INFO: 2796563959: File found at: E:\\\\bengla text summarization\\BanSum.zip]\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e0cd102bb64748cdb7dfe8d840c14a48",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting Files: 0%| | 0/1 [00:00