Spaces:

Md-Hakim
/

bengali-text-summarization

Sleeping

File size: 5,757 Bytes

9a56158

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.chdir('../')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'e:\\\\bengla text summarization\\\\train-pegasus-model-on-bengali-text-summarization-using-mlops'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataclasses import dataclass\n",
    "from pathlib import Path\n",
    "\n",
    "@dataclass(frozen=True)\n",
    "class BanTokenizationConfig:\n",
    "    root_dir : Path\n",
    "    source_dir : Path\n",
    "    save_dir : Path\n",
    "    output_file : str\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from src.benglasummarization.constants import *\n",
    "from src.benglasummarization.utils.common import  create_directories, read_yaml\n",
    "\n",
    "class ConfigurationManager:\n",
    "    def __init__(\n",
    "        self,\n",
    "        config_filepath = CONFIG_FILE_PATH,\n",
    "        params_filepath = PARAMS_FILE_PATH):\n",
    "\n",
    "        self.config = read_yaml(config_filepath)\n",
    "        self.params = read_yaml(params_filepath)\n",
    "\n",
    "        create_directories([self.config.artifacts_root])\n",
    "\n",
    "    def get_ben_tokenization_config(self) -> BanTokenizationConfig:\n",
    "        config = self.config.ban_tokenization\n",
    "        params = self.params.pre_tokenize\n",
    "        create_directories([config.root_dir])\n",
    "\n",
    "        ben_tokenization_config = BanTokenizationConfig(\n",
    "            root_dir=config.root_dir,\n",
    "            source_dir=config.source_dir,\n",
    "            save_dir= config.save_dir,\n",
    "            output_file= params.output_file\n",
    "        )\n",
    " \n",
    "        return ben_tokenization_config\n",
    "\n",
    "  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "from src.benglasummarization.logging import logger\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "class BanTokenization:\n",
    "    def __init__(self, config: BanTokenizationConfig):\n",
    "        self.config = config\n",
    "\n",
    "    def combine_text_columns(self, text_columns=['main']):\n",
    "        df = pd.read_csv(self.config.source_dir)\n",
    "\n",
    "        # Ensure save_dir is a Path object\n",
    "        save_dir = Path(self.config.save_dir)\n",
    "        \n",
    "        # Create the directory if it doesn't exist\n",
    "        save_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "        # Combine save_dir and output_file to form the output path\n",
    "        output_txt_file = save_dir / self.config.output_file\n",
    "        \n",
    "        # Write the combined text data to the output file\n",
    "        with open(output_txt_file, 'w', encoding='utf-8') as f:\n",
    "            for index, row in tqdm(df.iterrows(), total=len(df)):\n",
    "                combined_text = ' '.join(str(row[col]) for col in text_columns)\n",
    "                f.write(combined_text + '\\n')\n",
    "\n",
    "        # Log the success of the operation\n",
    "        logger.info(f\"All text data has been combined into {output_txt_file}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-16 19:09:09,141: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
      "[2024-10-16 19:09:09,143: INFO: common: yaml file: params.yaml loaded successfully]\n",
      "[2024-10-16 19:09:09,145: INFO: common: created directory at: artifacts]\n",
      "[2024-10-16 19:09:09,146: INFO: common: created directory at: artifacts/ban_tokenization]\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "46422977ab65463695c98b98ece484c2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/160000 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-16 19:10:00,660: INFO: 206824922: All text data has been combined into artifacts\\ban_tokenization\\combined_text.txt]\n"
     ]
    }
   ],
   "source": [
    "try:\n",
    "    config = ConfigurationManager()\n",
    "    prepare_ben_tok_config = config.get_ben_tokenization_config()  \n",
    "    ben_data_tok = BanTokenization(config=prepare_ben_tok_config)\n",
    "    ben_data_tok.combine_text_columns()\n",
    "except Exception as e:\n",
    "    raise e"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}