logicsame committed
Commit 00c0948 · 1 Parent(s): 82c8d9a

dvc update

dvc.lock ADDED
@@ -0,0 +1,27 @@
+schema: '2.0'
+stages:
+  data_ingestion:
+    cmd: python src/benglasummarization/pipeline/stage01_data_ingestion.py
+    deps:
+    - path: config/config.yaml
+      hash: md5
+      md5: 7dd47470935d9dbcbc4f22c08179c358
+      size: 810
+    - path: src/benglasummarization/pipeline/stage01_data_ingestion.py
+      hash: md5
+      md5: 7c1a49bcb041ba18e3ebafe7b0995470
+      size: 499
+  prepare_Ban_tok:
+    cmd: python src/benglasummarization/pipeline/stage_02_prepare_ben_tok.py
+    deps:
+    - path: config/config.yaml
+      hash: md5
+      md5: 7dd47470935d9dbcbc4f22c08179c358
+      size: 810
+    - path: src/benglasummarization/pipeline/stage_02_prepare_ben_tok.py
+      hash: md5
+      md5: 111dd3fd6adf995de51fea3a2a171e9e
+      size: 490
+    params:
+      params.yaml:
+        output_file: combined_text.txt
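Note: each md5 entry in dvc.lock is simply the MD5 digest of the pinned file, which is what dvc repro compares to decide whether a stage's dependencies changed. A minimal sketch of that check using only the standard library (DVC's own implementation additionally handles directories and its cache):

import hashlib

def file_md5(path: str) -> str:
    # Hex MD5 digest of a file's bytes, matching the per-file values recorded in dvc.lock.
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Example: verify the dependency pinned for the data_ingestion stage.
assert file_md5("config/config.yaml") == "7dd47470935d9dbcbc4f22c08179c358"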
dvc.yaml CHANGED
@@ -4,18 +4,16 @@ stages:
     deps:
     - src/benglasummarization/pipeline/stage01_data_ingestion.py
     - config/config.yaml
-    outs:
-    - artifacts/data_ingestion
+
 
-  prepare_base_model:
+  prepare_Ban_tok:
     cmd: python src/benglasummarization/pipeline/stage_02_prepare_ben_tok.py
     deps:
-    - ssrc/benglasummarization/pipeline/stage_02_prepare_ben_tok.py
+    - src/benglasummarization/pipeline/stage_02_prepare_ben_tok.py
     - config/config.yaml
     params:
     - output_file
-    outs:
-    - artifacts/ban_tokenization
+
 
   tokenize_training:
     cmd: python src/benglasummarization/pipeline/stage_03_train_ban_token.py
@@ -23,7 +21,6 @@ stages:
     - src/benglasummarization/pipeline/stage_03_train_ban_token.py
     - config/config.yaml
     - artifacts/ban_tokenization/combined_text.txt
-    - artifacts/train_tokenization
     params:
     - model_prefix # List format for params
     - model_type
@@ -47,5 +44,4 @@ stages:
     - max_grad_norm
     - early_stopping_patience
     - patience_counter
-    outs:
-    - artifacts/model_training
+
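Note: with the outs: blocks removed, these stages no longer declare tracked output directories, so dvc repro decides whether to re-run a stage purely from the deps and params hashes pinned in the dvc.lock added above. A minimal sketch of refreshing the lock file after editing dvc.yaml, assuming the dvc CLI is installed in the environment:

# Re-run the pipeline; DVC skips stages whose pinned deps/params are unchanged
# and rewrites dvc.lock for the stages it actually executes.
import subprocess

subprocess.run(["dvc", "repro"], check=True)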
params.yaml CHANGED
@@ -1,19 +1,19 @@
-pre_tokenize:
-  output_file: "combined_text.txt"
 
-train_tokenize:
-  model_prefix : 'cbengali_tokenizer'
-  model_type : 'unigram'
-  vocab_size : 91902
+output_file: "combined_text.txt"
 
-training_model:
-  max_input_length : 256
-  max_output_length : 125
-  model_name : 'google/pegasus-large'
-  batch_size : 1
-  num_epochs : 1
-  learning_rate : 1e-4
-  accumulator_steps : 4
-  max_grad_norm : 1.0
-  early_stopping_patience : 3
-  patience_counter : 0
+
+model_prefix : 'cbengali_tokenizer'
+model_type : 'unigram'
+vocab_size : 91902
+
+
+max_input_length : 256
+max_output_length : 125
+model_name : 'google/pegasus-large'
+batch_size : 1
+num_epochs : 1
+learning_rate : 1e-4
+accumulator_steps : 4
+max_grad_norm : 1.0
+early_stopping_patience : 3
+patience_counter : 0
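Note: with the pre_tokenize, train_tokenize, and training_model sections removed, every parameter now sits at the top level of params.yaml, which is why the configuration getters below switch from self.params.pre_tokenize (and friends) to plain self.params. A minimal sketch of that flat mapping once loaded, assuming the project reads the file with PyYAML and wraps it in python-box's ConfigBox to get the attribute-style access seen in ConfigurationManager:

import yaml
from box import ConfigBox  # assumption: the helper behind self.params.<key> access

with open("params.yaml") as f:
    params = ConfigBox(yaml.safe_load(f))

# Every getter now reads from the same flat namespace:
params.output_file    # 'combined_text.txt'
params.model_prefix   # 'cbengali_tokenizer'
params.vocab_size     # 91902
params.batch_size     # 1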
src/benglasummarization/components/prepare_ben_token.py CHANGED
@@ -2,7 +2,7 @@ import pandas as pd
 from pathlib import Path
 from benglasummarization.logging import logger
 from tqdm.notebook import tqdm
-from src.benglasummarization.entity.config_entity import BanTokenizationConfig
+from benglasummarization.entity.config_entity import BanTokenizationConfig
 class BanTokenization:
     def __init__(self, config: BanTokenizationConfig):
         self.config = config
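Note: the fix swaps the src.-prefixed import for the installed package path, so the module resolves the same way whether the stage runs through dvc repro or is invoked directly. For context, the stage entry point named in dvc.yaml (stage_02_prepare_ben_tok.py) presumably wires this class to ConfigurationManager roughly as below; the file itself is not part of this diff, and combine_text_files is a placeholder method name:

# Hypothetical sketch of the stage-02 entry point (not shown in this commit).
from benglasummarization.config.configuration import ConfigurationManager
from benglasummarization.components.prepare_ben_token import BanTokenization
from benglasummarization.logging import logger

if __name__ == "__main__":
    try:
        config = ConfigurationManager().get_ben_tokenization_config()
        BanTokenization(config).combine_text_files()  # placeholder method name
        logger.info("stage 02 (prepare_Ban_tok) completed")
    except Exception as e:
        logger.exception(e)
        raise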
src/benglasummarization/config/configuration.py CHANGED
@@ -30,7 +30,7 @@ class ConfigurationManager:
 
     def get_ben_tokenization_config(self) -> BanTokenizationConfig:
         config = self.config.ban_tokenization
-        params = self.params.pre_tokenize
+        params = self.params
         create_directories([config.root_dir])
 
         ben_tokenization_config = BanTokenizationConfig(
@@ -45,7 +45,7 @@ class ConfigurationManager:
 
     def get_train_token_config(self) -> BanTokenTrainConfig:
         config = self.config.train_tokenize
-        params = self.params.train_tokenize
+        params = self.params
         create_directories([config.root_dir])
 
         train_token_config = BanTokenTrainConfig(
@@ -60,7 +60,7 @@ class ConfigurationManager:
 
     def get_model_trainer_config(self) -> ModelTrainingConfig:
         config = self.config.model_training
-        param = self.params.training_model
+        param = self.params
         create_directories([config.root_dir])
         model_trainer_config = ModelTrainingConfig(
             root_dir= config.root_dir,
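Note: after this change all three getters read the same flat self.params object instead of per-section blocks. A rough sketch of how the first getter presumably consumes it; the constructor arguments of BanTokenizationConfig fall outside the hunks shown here, so the output_file field is an assumption that mirrors the flattened params.yaml key:

# Fragment: approximate shape of the updated method inside ConfigurationManager.
def get_ben_tokenization_config(self) -> BanTokenizationConfig:
    config = self.config.ban_tokenization
    params = self.params                 # flat params.yaml, no pre_tokenize section
    create_directories([config.root_dir])
    return BanTokenizationConfig(
        root_dir=config.root_dir,
        output_file=params.output_file,  # assumed field; matches the flat params.yaml key
    )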