logicsame committed
Commit a76bfd1 · 1 Parent(s): 9a56158

model training added

config/config.yaml CHANGED
@@ -16,5 +16,11 @@ train_tokenize:
   input_file_dir : artifacts/ban_tokenization/combined_text.txt
   save_file : artifacts/train_tokenization
 
+model_training:
+  root_dir : artifacts/model_training
+  data_dir : artifacts/data_ingestion/BanSum.csv
+  ben_tokenizer_dir : artifacts/train_tokenization/cbengali_tokenizer.model
+  save_trained_model_dir : artifacts/model_training
+
 
 
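The new model_training block points the training stage at artifacts produced by the earlier stages. A quick way to sanity-check the paths outside the pipeline (a minimal sketch using plain PyYAML; the project itself loads this file through read_yaml in ConfigurationManager):

    import yaml
    from pathlib import Path

    with open("config/config.yaml") as f:
        cfg = yaml.safe_load(f)

    mt = cfg["model_training"]
    for key in ("root_dir", "data_dir", "ben_tokenizer_dir", "save_trained_model_dir"):
        # Print each configured path and whether it already exists on disk
        print(key, mt[key], Path(mt[key]).exists())
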
main.py CHANGED
@@ -2,6 +2,9 @@ from src.benglasummarization.logging import logger
 from src.benglasummarization.pipeline.stage01_data_ingestion import DataIngestionPipeline
 from src.benglasummarization.pipeline.stage_02_prepare_ben_tok import BenTokenizationPreparePipeLine
 from src.benglasummarization.pipeline.stage_03_train_ban_token import TrainTokenizePipeLine
+from src.benglasummarization.pipeline.stage_04_model_Training import ModelTrainingPipeline
+
+
 STAGE_NAME = 'Data Ingestion Stage'
 
 try:
@@ -35,3 +38,12 @@ except Exception as e:
     logger.exception(e)
     raise e
 
+STAGE_NAME = 'Model Training PipeLine Stage'
+try:
+    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
+    train_model = ModelTrainingPipeline()
+    train_model.main()
+    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
+except Exception as e:
+    logger.exception(e)
+    raise e
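With the new stage registered here, python main.py runs all four stages in sequence. To run only the model-training stage (a minimal sketch, assuming the earlier stages have already produced their artifacts and that config/config.yaml and params.yaml contain the blocks added in this commit):

    from src.benglasummarization.pipeline.stage_04_model_Training import ModelTrainingPipeline

    if __name__ == "__main__":
        ModelTrainingPipeline().main()
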
params.yaml CHANGED
@@ -4,4 +4,16 @@ pre_tokenize:
 train_tokenize:
   model_prefix : 'cbengali_tokenizer'
   model_type : 'unigram'
-  vocab_size : 91902
+  vocab_size : 91902
+
+training_model:
+  max_input_length : 256
+  max_output_length : 125
+  model_name : 'google/pegasus-large'
+  batch_size : 1
+  num_epochs : 1
+  learning_rate : 1e-4
+  accumulator_steps : 4
+  max_grad_norm : 1.0
+  early_stopping_patience : 3
+  patience_counter : 0
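With batch_size 1 and accumulator_steps 4, gradients are accumulated over four examples before each optimizer update, giving an effective batch size of 4. A rough sketch of what one epoch looks like under these values (the 1000-row cap and 90/10 split are taken from model_training.py below):

    train_rows = int(1000 * 0.9)               # df.head(1000) with test_size=0.1 -> 900 training examples
    steps_per_epoch = train_rows // 1          # batch_size : 1 -> 900 micro-batches
    optimizer_updates = steps_per_epoch // 4   # accumulator_steps : 4 -> ~225 optimizer steps
    print(steps_per_epoch, optimizer_updates)
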
research/model_training.ipynb ADDED
File without changes
src/benglasummarization/components/model_training.py ADDED
@@ -0,0 +1,145 @@
+from torch.utils.data import Dataset
+from transformers import PegasusTokenizer
+import os
+import torch
+from torch.utils.data import DataLoader, random_split
+from transformers import PegasusForConditionalGeneration, PegasusTokenizer
+from tqdm import tqdm
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from src.benglasummarization.logging import logger
+from src.benglasummarization.entity.config_entity import ModelTrainingConfig
+
+
+class BengaliSummaryDataset(Dataset):
+    def __init__(self, texts, summaries, tokenizer: PegasusTokenizer, config: ModelTrainingConfig):
+        self.config = config
+        self.texts = texts
+        self.summaries = summaries
+        self.tokenizer = tokenizer
+
+    def __len__(self):
+        return len(self.texts)
+
+    def __getitem__(self, idx):
+        text = self.texts[idx]
+        summary = self.summaries[idx]
+
+        inputs = self.tokenizer(
+            text,
+            truncation=True,
+            padding="max_length",
+            max_length=self.config.max_input_length,
+            return_tensors="pt"
+        )
+        labels = self.tokenizer(
+            summary,
+            truncation=True,
+            padding="max_length",
+            max_length=self.config.max_output_length,
+            return_tensors="pt"
+        )
+
+        input_ids = inputs['input_ids'].squeeze()
+        attention_mask = inputs['attention_mask'].squeeze()
+        labels = labels['input_ids'].squeeze()
+
+        # Replace padding token ids with -100 to ignore them during loss computation
+        labels[labels == self.tokenizer.pad_token_id] = -100
+
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "labels": labels
+        }
+
+
+class ModelTraining:
+    def __init__(self, config: ModelTrainingConfig):
+        self.config = config
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    def load_data(self):
+        df = pd.read_csv(self.config.data_dir)
+        df = df.head(1000)
+        texts = df['main'].tolist()
+        summaries = df['sum3'].tolist()
+        return train_test_split(texts, summaries, test_size=0.1, random_state=42)
+
+    def create_datasets(self, train_texts, train_summaries, val_texts, val_summaries):
+        tokenizer = PegasusTokenizer.from_pretrained(self.config.ben_tokenizer_dir)
+        train_dataset = BengaliSummaryDataset(train_texts, train_summaries, tokenizer, self.config)
+        val_dataset = BengaliSummaryDataset(val_texts, val_summaries, tokenizer, self.config)
+        return train_dataset, val_dataset, tokenizer
+
+    def train(self):
+        # Load and split data
+        train_texts, val_texts, train_summaries, val_summaries = self.load_data()
+
+        # Create datasets and tokenizer
+        train_dataset, val_dataset, tokenizer = self.create_datasets(train_texts, train_summaries, val_texts, val_summaries)
+
+        # Create data loaders
+        train_dataloader = DataLoader(train_dataset, batch_size=self.config.batch_size, shuffle=True)
+        val_dataloader = DataLoader(val_dataset, batch_size=self.config.batch_size)
+
+        # Initialize model
+        model = PegasusForConditionalGeneration.from_pretrained(self.config.model_name).to(self.device)
+
+        # Optimizer and scheduler
+        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
+        scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.5, total_iters=len(train_dataloader) * self.config.num_epochs)
+
+        # Training loop
+        best_val_loss = float('inf')
+        for epoch in range(self.config.num_epochs):
+            model.train()
+            total_loss = 0
+            progress_bar = tqdm(total=len(train_dataloader), desc=f"Epoch {epoch + 1}")
+
+            for step, batch in enumerate(train_dataloader):
+                input_ids = batch['input_ids'].to(self.device)
+                attention_mask = batch['attention_mask'].to(self.device)
+                labels = batch['labels'].to(self.device)
+
+                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
+                loss = outputs.loss
+                loss = loss / self.config.accumulator_steps
+                loss.backward()
+
+                total_loss += loss.item()
+
+                if (step + 1) % self.config.accumulator_steps == 0 or step == len(train_dataloader) - 1:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.config.max_grad_norm)
+                    optimizer.step()
+                    scheduler.step()
+                    optimizer.zero_grad()
+
+                progress_bar.update(1)
+                progress_bar.set_postfix({'loss': total_loss / (step + 1)})
+
+            progress_bar.close()
+
+            # Validation
+            model.eval()
+            val_loss = 0
+            with torch.no_grad():
+                for batch in tqdm(val_dataloader, desc="Validation"):
+                    input_ids = batch['input_ids'].to(self.device)
+                    attention_mask = batch['attention_mask'].to(self.device)
+                    labels = batch['labels'].to(self.device)
+
+                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
+                    val_loss += outputs.loss.item()
+
+            val_loss /= len(val_dataloader)
+            print(f"Epoch {epoch + 1} - Validation Loss: {val_loss:.4f}")
+
+        logger.info(f"Training Completed")
+        save_path = os.path.join(self.config.save_trained_model_dir)
+        model.save_pretrained(save_path)
+        tokenizer.save_pretrained(save_path)
+        logger.info(f'Model Saved to {self.config.save_trained_model_dir}')
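The -100 substitution in BengaliSummaryDataset matters because Hugging Face seq2seq models ignore label positions equal to -100 when computing the cross-entropy loss, so padded summary tokens do not contribute to it. A minimal illustration (Pegasus uses 0 as its pad token id):

    import torch

    pad_token_id = 0                                  # PegasusTokenizer.pad_token_id
    labels = torch.tensor([15, 27, 4, pad_token_id, pad_token_id])
    labels[labels == pad_token_id] = -100
    print(labels)                                     # tensor([  15,   27,    4, -100, -100])
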
src/benglasummarization/config/configuration.py CHANGED
@@ -2,7 +2,7 @@ from src.benglasummarization.constants import *
 from src.benglasummarization.utils.common import read_yaml, create_directories
 from benglasummarization.entity.config_entity import DataIngestionConfig
 from src.benglasummarization.entity.config_entity import BanTokenizationConfig
-from src.benglasummarization.entity.config_entity import BanTokenTrainConfig
+from src.benglasummarization.entity.config_entity import BanTokenTrainConfig, ModelTrainingConfig
 class ConfigurationManager:
     def __init__(
         self,
@@ -56,4 +56,27 @@ class ConfigurationManager:
             model_type=params.model_type,
             vocab_size=params.vocab_size
         )
-        return train_token_config
+        return train_token_config
+
+    def get_model_trainer_config(self) -> ModelTrainingConfig:
+        config = self.config.model_training
+        param = self.params.training_model
+        create_directories([config.root_dir])
+        model_trainer_config = ModelTrainingConfig(
+            root_dir=config.root_dir,
+            data_dir=config.data_dir,
+            ben_tokenizer_dir=config.ben_tokenizer_dir,
+            save_trained_model_dir=config.save_trained_model_dir,
+            max_input_length=param.max_input_length,
+            max_output_length=param.max_output_length,
+            batch_size=param.batch_size,
+            num_epochs=param.num_epochs,
+            accumulator_steps=param.accumulator_steps,
+            max_grad_norm=param.max_grad_norm,
+            early_stopping_patience=param.early_stopping_patience,
+            patience_counter=param.patience_counter,
+            model_name=param.model_name,
+            learning_rate=param.learning_rate
+        )
+        return model_trainer_config
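get_model_trainer_config assumes the model_training block in config.yaml and the training_model block in params.yaml, both added in this commit. A minimal usage sketch, mirroring what stage_04_model_Training.py does below:

    from src.benglasummarization.config.configuration import ConfigurationManager

    cfg = ConfigurationManager().get_model_trainer_config()
    print(cfg.model_name, cfg.batch_size, cfg.num_epochs)   # values echo params.yaml
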
src/benglasummarization/entity/config_entity.py CHANGED
@@ -24,4 +24,21 @@ class BanTokenTrainConfig:
     model_prefix : str
     model_type : str
     vocab_size : int
+
+@dataclass(frozen=True)
+class ModelTrainingConfig:
+    root_dir : Path
+    data_dir : Path
+    ben_tokenizer_dir : Path
+    save_trained_model_dir : Path
+    max_input_length : int
+    max_output_length : int
+    batch_size : int
+    num_epochs : int
+    accumulator_steps : int
+    max_grad_norm : float
+    early_stopping_patience : int
+    patience_counter : int
+    model_name : str
+    learning_rate : float
 
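Because ModelTrainingConfig is a frozen dataclass, its fields cannot be reassigned after construction. A small sketch with illustrative values copied from config.yaml and params.yaml above:

    from pathlib import Path

    cfg = ModelTrainingConfig(
        root_dir=Path("artifacts/model_training"),
        data_dir=Path("artifacts/data_ingestion/BanSum.csv"),
        ben_tokenizer_dir=Path("artifacts/train_tokenization/cbengali_tokenizer.model"),
        save_trained_model_dir=Path("artifacts/model_training"),
        max_input_length=256, max_output_length=125,
        batch_size=1, num_epochs=1, accumulator_steps=4,
        max_grad_norm=1.0, early_stopping_patience=3, patience_counter=0,
        model_name="google/pegasus-large", learning_rate=1e-4,
    )
    # cfg.batch_size = 2   # would raise dataclasses.FrozenInstanceError
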
src/benglasummarization/pipeline/stage_04_model_Training.py ADDED
@@ -0,0 +1,12 @@
+from src.benglasummarization.components.model_training import ModelTraining
+from src.benglasummarization.config.configuration import ConfigurationManager
+
+class ModelTrainingPipeline:
+    def __init__(self):
+        pass
+
+    def main(self):
+        config_manager = ConfigurationManager()
+        model_training_config = config_manager.get_model_trainer_config()
+        model_trainer = ModelTraining(config=model_training_config)
+        model_trainer.train()