Commit a76bfd1 · "model training added"
Author: logicsame
Parent(s): 9a56158
Files changed:
- config/config.yaml +6 -0
- main.py +12 -0
- params.yaml +13 -1
- research/model_training.ipynb +0 -0
- src/benglasummarization/components/model_training.py +145 -0
- src/benglasummarization/config/configuration.py +25 -2
- src/benglasummarization/entity/config_entity.py +17 -0
- src/benglasummarization/pipeline/stage_04_model_Training.py +12 -0
config/config.yaml
CHANGED
@@ -16,5 +16,11 @@ train_tokenize:
   input_file_dir : artifacts/ban_tokenization/combined_text.txt
   save_file : artifacts/train_tokenization
 
+model_training:
+  root_dir : artifacts/model_training
+  data_dir : artifacts/data_ingestion/BanSum.csv
+  ben_tokenizer_dir : artifacts/train_tokenization/cbengali_tokenizer.model
+  save_trained_model_dir : artifacts/model_training
+
 
 
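The new model_training block points the trainer at artifacts produced by earlier stages (the ingested BanSum.csv and the trained SentencePiece tokenizer) and uses artifacts/model_training both as the working directory and the save location. A minimal sketch (not part of the commit) that loads the file and checks the referenced inputs exist, assuming PyYAML is installed and the working directory is the repository root:

# Sketch: verify the artifacts referenced by model_training before launching training.
from pathlib import Path
import yaml

with open("config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

mt = config["model_training"]
for key in ("data_dir", "ben_tokenizer_dir"):
    path = Path(mt[key])
    print(f"{key}: {path} -> {'found' if path.exists() else 'MISSING'}")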
main.py
CHANGED
@@ -2,6 +2,9 @@ from src.benglasummarization.logging import logger
 from src.benglasummarization.pipeline.stage01_data_ingestion import DataIngestionPipeline
 from src.benglasummarization.pipeline.stage_02_prepare_ben_tok import BenTokenizationPreparePipeLine
 from src.benglasummarization.pipeline.stage_03_train_ban_token import TrainTokenizePipeLine
+from src.benglasummarization.pipeline.stage_04_model_Training import ModelTrainingPipeline
+
+
 STAGE_NAME = 'Data Ingestion Stage'
 
 try:
@@ -35,3 +38,12 @@ except Exception as e:
     logger.exception(e)
     raise e
 
+STAGE_NAME = 'Model Training PipeLine Stage'
+try:
+    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
+    train_model = ModelTrainingPipeline()
+    train_model.main()
+    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
+except Exception as e:
+    logger.exception(e)
+    raise e
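The new block repeats the started/completed/exception logging pattern already used for the first three stages. A small sketch of how that repetition could be factored into a helper (hypothetical, not part of the commit):

# Hypothetical helper: run any pipeline stage with the shared logging pattern from main.py.
from src.benglasummarization.logging import logger

def run_stage(stage_name, pipeline_cls):
    try:
        logger.info(f">>>>>> stage {stage_name} started <<<<<<")
        pipeline_cls().main()
        logger.info(f">>>>>> stage {stage_name} completed <<<<<<\n\nx==========x")
    except Exception as e:
        logger.exception(e)
        raise e

# Usage (imports as in main.py):
# run_stage('Model Training PipeLine Stage', ModelTrainingPipeline)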
params.yaml
CHANGED
@@ -4,4 +4,16 @@ pre_tokenize:
 train_tokenize:
   model_prefix : 'cbengali_tokenizer'
   model_type : 'unigram'
-  vocab_size : 91902
+  vocab_size : 91902
+
+training_model:
+  max_input_length : 256
+  max_output_length : 125
+  model_name : 'google/pegasus-large'
+  batch_size : 1
+  num_epochs : 1
+  learning_rate : 1e-4
+  accumulator_steps : 4
+  max_grad_norm : 1.0
+  early_stopping_patience : 3
+  patience_counter : 0
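One detail worth flagging: PyYAML's YAML 1.1 resolver only recognizes scientific notation with a decimal point, so learning_rate : 1e-4 loads as the string '1e-4' rather than a float (writing 1.0e-4 avoids this). The committed trainer hard-codes lr=1e-4, so nothing breaks here, but any code that forwards learning_rate from the params into an optimizer should coerce it. A small sketch, assuming params.yaml is read with yaml.safe_load:

import yaml

with open("params.yaml", encoding="utf-8") as f:
    params = yaml.safe_load(f)

lr = params["training_model"]["learning_rate"]
print(type(lr), lr)   # -> <class 'str'> 1e-4 under YAML 1.1 resolution
lr = float(lr)        # coerce before handing it to an optimizer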
research/model_training.ipynb
ADDED
File without changes
src/benglasummarization/components/model_training.py
ADDED
@@ -0,0 +1,145 @@
from torch.utils.data import Dataset, DataLoader
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import os
import torch
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from src.benglasummarization.logging import logger
from src.benglasummarization.entity.config_entity import ModelTrainingConfig


class BengaliSummaryDataset(Dataset):
    """Wraps parallel lists of articles and summaries and tokenizes them on access."""

    def __init__(self, texts, summaries, tokenizer: PegasusTokenizer, config: ModelTrainingConfig):
        self.config = config
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        summary = self.summaries[idx]

        inputs = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.config.max_input_length,
            return_tensors="pt"
        )
        labels = self.tokenizer(
            summary,
            truncation=True,
            padding="max_length",
            max_length=self.config.max_output_length,
            return_tensors="pt"
        )

        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        labels = labels['input_ids'].squeeze()

        # Replace padding token ids with -100 so they are ignored during loss computation
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }


class ModelTraining:
    def __init__(self, config: ModelTrainingConfig):
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def load_data(self):
        # Only the first 1000 rows of the dataset are used for this training run
        df = pd.read_csv(self.config.data_dir)
        df = df.head(1000)
        texts = df['main'].tolist()
        summaries = df['sum3'].tolist()
        return train_test_split(texts, summaries, test_size=0.1, random_state=42)

    def create_datasets(self, train_texts, train_summaries, val_texts, val_summaries):
        tokenizer = PegasusTokenizer.from_pretrained(self.config.ben_tokenizer_dir)
        train_dataset = BengaliSummaryDataset(train_texts, train_summaries, tokenizer, self.config)
        val_dataset = BengaliSummaryDataset(val_texts, val_summaries, tokenizer, self.config)
        return train_dataset, val_dataset, tokenizer

    def train(self):
        # Load and split data
        train_texts, val_texts, train_summaries, val_summaries = self.load_data()

        # Create datasets and tokenizer
        train_dataset, val_dataset, tokenizer = self.create_datasets(train_texts, train_summaries, val_texts, val_summaries)

        # Create data loaders
        train_dataloader = DataLoader(train_dataset, batch_size=self.config.batch_size, shuffle=True)
        val_dataloader = DataLoader(val_dataset, batch_size=self.config.batch_size)

        # Initialize model
        model = PegasusForConditionalGeneration.from_pretrained(self.config.model_name).to(self.device)

        # Optimizer and scheduler (learning rate is hard-coded here; config.learning_rate is not used)
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
        scheduler = torch.optim.lr_scheduler.LinearLR(
            optimizer, start_factor=1.0, end_factor=0.5,
            total_iters=len(train_dataloader) * self.config.num_epochs
        )

        # Training loop with gradient accumulation
        best_val_loss = float('inf')  # tracked but not yet used for checkpointing or early stopping
        for epoch in range(self.config.num_epochs):
            model.train()
            total_loss = 0
            progress_bar = tqdm(total=len(train_dataloader), desc=f"Epoch {epoch + 1}")

            for step, batch in enumerate(train_dataloader):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss / self.config.accumulator_steps
                loss.backward()

                total_loss += loss.item()

                # Step the optimizer every accumulator_steps mini-batches (and on the last batch)
                if (step + 1) % self.config.accumulator_steps == 0 or step == len(train_dataloader) - 1:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.config.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()

                progress_bar.update(1)
                progress_bar.set_postfix({'loss': total_loss / (step + 1)})

            progress_bar.close()

            # Validation
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for batch in tqdm(val_dataloader, desc="Validation"):
                    input_ids = batch['input_ids'].to(self.device)
                    attention_mask = batch['attention_mask'].to(self.device)
                    labels = batch['labels'].to(self.device)

                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    val_loss += outputs.loss.item()

            val_loss /= len(val_dataloader)
            print(f"Epoch {epoch + 1} - Validation Loss: {val_loss:.4f}")

        logger.info("Training Completed")
        save_path = os.path.join(self.config.save_trained_model_dir)
        model.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)
        logger.info(f'Model Saved to {self.config.save_trained_model_dir}')
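Note that params.yaml defines early_stopping_patience and patience_counter and the loop initializes best_val_loss, but none of them currently influence training. A self-contained sketch of the bookkeeping an early-stopping check needs (a hypothetical helper, not part of the commit):

# Sketch: per-epoch early-stopping bookkeeping such a check would require.
def check_early_stop(val_loss, best_val_loss, patience_counter, patience):
    """Return (new_best, new_counter, should_stop) for one epoch's validation loss."""
    if val_loss < best_val_loss:
        return val_loss, 0, False
    patience_counter += 1
    return best_val_loss, patience_counter, patience_counter >= patience

# Example: with early_stopping_patience = 3, three non-improving epochs trigger a stop.
best, counter, stop = float("inf"), 0, False
for epoch_loss in [2.1, 1.8, 1.9, 1.85, 1.95]:
    best, counter, stop = check_early_stop(epoch_loss, best, counter, patience=3)
    if stop:
        break
print(best, counter, stop)   # 1.8 3 True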
src/benglasummarization/config/configuration.py
CHANGED
@@ -2,7 +2,7 @@ from src.benglasummarization.constants import *
 from src.benglasummarization.utils.common import read_yaml, create_directories
 from benglasummarization.entity.config_entity import DataIngestionConfig
 from src.benglasummarization.entity.config_entity import BanTokenizationConfig
-from src.benglasummarization.entity.config_entity import BanTokenTrainConfig
+from src.benglasummarization.entity.config_entity import BanTokenTrainConfig, ModelTrainingConfig
 class ConfigurationManager:
     def __init__(
         self,
@@ -56,4 +56,27 @@ class ConfigurationManager:
             model_type= params.model_type,
             vocab_size= params.vocab_size
         )
-        return train_token_config
+        return train_token_config
+
+    def get_model_trainer_config(self) -> ModelTrainingConfig:
+        config = self.config.model_training
+        param = self.params.training_model
+        create_directories([config.root_dir])
+        model_trainer_config = ModelTrainingConfig(
+            root_dir= config.root_dir,
+            data_dir= config.data_dir,
+            ben_tokenizer_dir= config.ben_tokenizer_dir,
+            save_trained_model_dir= config.save_trained_model_dir,
+            max_input_length = param.max_input_length,
+            max_output_length = param.max_output_length,
+            batch_size = param.batch_size,
+            num_epochs = param.num_epochs,
+            accumulator_steps = param.accumulator_steps,
+            max_grad_norm = param.max_grad_norm,
+            early_stopping_patience = param.early_stopping_patience,
+            patience_counter = param.patience_counter,
+            model_name = param.model_name,
+            learning_rate = param.learning_rate
+        )
+        return model_trainer_config
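The new accessor follows the same pattern as the existing get_* methods: it reads the model_training block from config.yaml and the training_model block from params.yaml, creates the root directory, and returns a frozen ModelTrainingConfig. A minimal usage sketch (the new pipeline stage further down does essentially the same):

# Sketch (not part of the commit): quick check of the values resolved by the new accessor.
from src.benglasummarization.config.configuration import ConfigurationManager

config_manager = ConfigurationManager()
model_training_config = config_manager.get_model_trainer_config()
print(model_training_config.model_name, model_training_config.batch_size, model_training_config.num_epochs)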
src/benglasummarization/entity/config_entity.py
CHANGED
@@ -24,4 +24,21 @@ class BanTokenTrainConfig:
     model_prefix : str
     model_type : str
     vocab_size : int
+
+@dataclass(frozen=True)
+class ModelTrainingConfig:
+    root_dir : Path
+    data_dir : Path
+    ben_tokenizer_dir : Path
+    save_trained_model_dir : Path
+    max_input_length : int
+    max_output_length : int
+    batch_size : int
+    num_epochs : int
+    accumulator_steps : int
+    max_grad_norm : float
+    early_stopping_patience : int
+    patience_counter : int
+    model_name : str
+    learning_rate : float
 
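Because ModelTrainingConfig is a frozen dataclass, it can also be constructed by hand (for example in a quick smoke test) without going through the YAML files; a sketch using the values from this commit's config.yaml and params.yaml:

# Sketch (not part of the commit): build the config directly for a smoke test.
from pathlib import Path
from src.benglasummarization.entity.config_entity import ModelTrainingConfig

cfg = ModelTrainingConfig(
    root_dir=Path("artifacts/model_training"),
    data_dir=Path("artifacts/data_ingestion/BanSum.csv"),
    ben_tokenizer_dir=Path("artifacts/train_tokenization/cbengali_tokenizer.model"),
    save_trained_model_dir=Path("artifacts/model_training"),
    max_input_length=256,
    max_output_length=125,
    batch_size=1,
    num_epochs=1,
    accumulator_steps=4,
    max_grad_norm=1.0,
    early_stopping_patience=3,
    patience_counter=0,
    model_name="google/pegasus-large",
    learning_rate=1e-4,
)
# frozen=True means fields cannot be reassigned: cfg.batch_size = 2 raises FrozenInstanceError.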
src/benglasummarization/pipeline/stage_04_model_Training.py
ADDED
@@ -0,0 +1,12 @@
from src.benglasummarization.components.model_training import ModelTraining
from src.benglasummarization.config.configuration import ConfigurationManager

class ModelTrainingPipeline:
    def __init__(self):
        pass

    def main(self):
        config_manager = ConfigurationManager()
        model_training_config = config_manager.get_model_trainer_config()
        model_trainer = ModelTraining(config=model_training_config)
        model_trainer.train()