@dataclass(frozen=True)
class BanTokenizationConfig:
    """Immutable settings for the Bangla tokenization-corpus stage.

    Instances are built by ``ConfigurationManager.get_ben_tokenization_config``
    from the ``ban_tokenization`` section of the config YAML and the
    ``pre_tokenize`` section of the params YAML.
    """

    # Working directory of this stage; created before the config is returned.
    root_dir: Path
    # NOTE(review): despite the "_dir" suffix this looks like a CSV *file* path
    # (the tokenization step hands it to pd.read_csv) - confirm and consider renaming.
    source_dir: Path
    # Directory that receives the combined text file.
    save_dir: Path
    # File name (not a full path) of the combined text file inside save_dir.
    output_file: str


class ConfigurationManager:
    """Load the project YAML files and build typed per-stage config objects.

    Args:
        config_filepath: Location of the main config YAML.
        params_filepath: Location of the params YAML.

    Side effects:
        Creates the ``artifacts_root`` directory named in the config YAML.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_ben_tokenization_config(self) -> BanTokenizationConfig:
        """Return the tokenization-stage config, creating its root directory.

        The YAML values are converted to ``Path`` here so the returned object
        actually matches the ``Path`` annotations on ``BanTokenizationConfig``
        and consumers do not each have to re-wrap them.

        Returns:
            BanTokenizationConfig: frozen config with ``Path`` directories and
            the output file name taken from ``params.pre_tokenize``.
        """
        stage_cfg = self.config.ban_tokenization
        stage_params = self.params.pre_tokenize

        create_directories([stage_cfg.root_dir])

        return BanTokenizationConfig(
            root_dir=Path(stage_cfg.root_dir),
            source_dir=Path(stage_cfg.source_dir),
            save_dir=Path(stage_cfg.save_dir),
            output_file=stage_params.output_file,
        )
class BanTokenization:
    """Flatten selected text columns of a CSV into a one-record-per-line text file.

    Args:
        config: Stage configuration providing ``source_dir`` (passed to
            ``pd.read_csv``), ``save_dir`` and ``output_file``.
    """

    # Quoted annotation: the class can be defined without the config dataclass
    # being in scope yet (e.g. when this cell/module is loaded on its own).
    def __init__(self, config: "BanTokenizationConfig"):
        self.config = config

    def combine_text_columns(self, text_columns=('main',)):
        """Write the chosen columns of the source CSV to a UTF-8 text file.

        Each CSV row becomes one output line: the values of ``text_columns``
        are converted with ``str()`` and joined by a single space. Because of
        the ``str()`` conversion, cells that pandas loaded as missing are
        written as the literal text ``nan`` (unchanged from the previous
        behaviour).

        Args:
            text_columns: Column name, or iterable of column names, to combine
                in the given order. Defaults to the single column ``'main'``.

        Raises:
            ValueError: If ``text_columns`` is empty.
            KeyError: If any requested column is absent from the CSV. Raised
                before the output file is opened, so an existing output file
                is not truncated by a bad call.
        """
        # A lone string would otherwise be iterated character by character.
        if isinstance(text_columns, str):
            text_columns = [text_columns]
        columns = list(text_columns)
        if not columns:
            raise ValueError("text_columns must name at least one column")

        df = pd.read_csv(self.config.source_dir)

        # Validate up front: opening the output with 'w' truncates it, so a
        # failure inside the write loop would destroy earlier results.
        missing = [col for col in columns if col not in df.columns]
        if missing:
            raise KeyError(f"Columns not found in {self.config.source_dir}: {missing}")

        # Ensure save_dir is a Path object and exists
        save_dir = Path(self.config.save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)

        # Combine save_dir and output_file to form the output path
        output_txt_file = save_dir / self.config.output_file

        # Zip only the needed columns instead of df.iterrows(), which builds a
        # full Series for every row.
        rows = zip(*(df[col] for col in columns))
        with open(output_txt_file, 'w', encoding='utf-8') as f:
            for values in tqdm(rows, total=len(df)):
                f.write(' '.join(str(value) for value in values) + '\n')

        # Log the success of the operation
        logger.info(f"All text data has been combined into {output_txt_file}")