logicsame committed
Commit 00c0948 · 1 Parent(s): 82c8d9a

dvc update
Files changed:
- dvc.lock +27 -0
- dvc.yaml +5 -9
- params.yaml +17 -17
- src/benglasummarization/components/prepare_ben_token.py +1 -1
- src/benglasummarization/config/configuration.py +3 -3
dvc.lock ADDED (new file, 27 lines):

schema: '2.0'
stages:
  data_ingestion:
    cmd: python src/benglasummarization/pipeline/stage01_data_ingestion.py
    deps:
    - path: config/config.yaml
      hash: md5
      md5: 7dd47470935d9dbcbc4f22c08179c358
      size: 810
    - path: src/benglasummarization/pipeline/stage01_data_ingestion.py
      hash: md5
      md5: 7c1a49bcb041ba18e3ebafe7b0995470
      size: 499
  prepare_Ban_tok:
    cmd: python src/benglasummarization/pipeline/stage_02_prepare_ben_tok.py
    deps:
    - path: config/config.yaml
      hash: md5
      md5: 7dd47470935d9dbcbc4f22c08179c358
      size: 810
    - path: src/benglasummarization/pipeline/stage_02_prepare_ben_tok.py
      hash: md5
      md5: 111dd3fd6adf995de51fea3a2a171e9e
      size: 490
    params:
      params.yaml:
        output_file: combined_text.txt
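Each dependency in dvc.lock is pinned by md5 hash and size; on the next dvc repro, DVC rehashes the deps and skips a stage whose entries still match. Below is a rough sketch of that check (an illustration only, not DVC's actual code), using the hashes recorded above:

# Illustration of the freshness check that dvc.lock enables (not DVC's code).
# Hash/size values are copied from the dvc.lock entries above.
import hashlib
from pathlib import Path

LOCKED_DEPS = {
    "config/config.yaml": ("7dd47470935d9dbcbc4f22c08179c358", 810),
    "src/benglasummarization/pipeline/stage01_data_ingestion.py":
        ("7c1a49bcb041ba18e3ebafe7b0995470", 499),
}

def stage_is_up_to_date(locked_deps):
    """Return True if every dependency still matches its locked md5 and size."""
    for rel_path, (locked_md5, locked_size) in locked_deps.items():
        path = Path(rel_path)
        if not path.exists():
            return False
        data = path.read_bytes()
        if len(data) != locked_size or hashlib.md5(data).hexdigest() != locked_md5:
            return False
    return True

if __name__ == "__main__":
    print("data_ingestion up to date:", stage_is_up_to_date(LOCKED_DEPS))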
dvc.yaml CHANGED

@@ -4,18 +4,16 @@ stages:
     deps:
     - src/benglasummarization/pipeline/stage01_data_ingestion.py
     - config/config.yaml
-
-    - artifacts/data_ingestion
+
 
-
+  prepare_Ban_tok:
     cmd: python src/benglasummarization/pipeline/stage_02_prepare_ben_tok.py
     deps:
-
+    - src/benglasummarization/pipeline/stage_02_prepare_ben_tok.py
     - config/config.yaml
     params:
     - output_file
-
-    - artifacts/ban_tokenization
+
 
   tokenize_training:
     cmd: python src/benglasummarization/pipeline/stage_03_train_ban_token.py
@@ -23,7 +21,6 @@ stages:
     - src/benglasummarization/pipeline/stage_03_train_ban_token.py
     - config/config.yaml
    - artifacts/ban_tokenization/combined_text.txt
-    - artifacts/train_tokenization
     params:
     - model_prefix # List format for params
     - model_type
@@ -47,5 +44,4 @@ stages:
     - max_grad_norm
     - early_stopping_patience
     - patience_counter
-
-    - artifacts/model_training
+
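The prepare_Ban_tok stage now lists its own pipeline script as a dep and declares params: - output_file, which makes DVC track the top-level output_file key of params.yaml (this is also why params.yaml is flattened in the next file). A quick way to see what that declaration resolves to, assuming PyYAML is installed and the working directory is the repo root:

# Sketch: what the `- output_file` params entry resolves to after this commit.
import yaml

with open("params.yaml") as f:
    params = yaml.safe_load(f)

# output_file is now a top-level key instead of being nested under pre_tokenize:
print(params["output_file"])  # combined_text.txt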
params.yaml CHANGED

@@ -1,19 +1,19 @@
-pre_tokenize:
-output_file: "combined_text.txt"
 
-
-model_prefix : 'cbengali_tokenizer'
-model_type : 'unigram'
-vocab_size : 91902
+output_file: "combined_text.txt"
 
-
-
-
-
-
-
-
-
-
-
-
+
+model_prefix : 'cbengali_tokenizer'
+model_type : 'unigram'
+vocab_size : 91902
+
+
+max_input_length : 256
+max_output_length : 125
+model_name : 'google/pegasus-large'
+batch_size : 1
+num_epochs : 1
+learning_rate : 1e-4
+accumulator_steps : 4
+max_grad_norm : 1.0
+early_stopping_patience : 3
+patience_counter : 0
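The new keys added at the bottom of params.yaml (batch_size, accumulator_steps, max_grad_norm, early_stopping_patience, patience_counter, ...) point at gradient accumulation, gradient clipping and early stopping in the model-training stage. That trainer is not part of this diff; the following is only a hypothetical sketch of how such parameters are usually wired together, with a dummy linear model standing in for google/pegasus-large so the snippet runs on its own:

# Hypothetical training-loop sketch driven by the new params.yaml keys.
# The real model_training stage is not shown in this commit; model and data
# here are dummies, and google/pegasus-large is only referenced by name.
import torch
import yaml

with open("params.yaml") as f:
    params = yaml.safe_load(f)

model = torch.nn.Linear(8, 1)                # stand-in for the Pegasus model
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=float(params["learning_rate"]),       # float() in case YAML reads 1e-4 as a string
)
loss_fn = torch.nn.MSELoss()

best_val_loss = float("inf")
patience_counter = params["patience_counter"]   # starts at 0

for epoch in range(params["num_epochs"]):
    optimizer.zero_grad()
    for step in range(8):                        # dummy batches
        x = torch.randn(params["batch_size"], 8)
        y = torch.randn(params["batch_size"], 1)
        loss = loss_fn(model(x), y) / params["accumulator_steps"]
        loss.backward()                          # accumulate gradients
        if (step + 1) % params["accumulator_steps"] == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), params["max_grad_norm"])
            optimizer.step()
            optimizer.zero_grad()

    val_loss = loss_fn(model(torch.randn(4, 8)), torch.randn(4, 1)).item()
    if val_loss < best_val_loss:
        best_val_loss, patience_counter = val_loss, 0
    else:
        patience_counter += 1
        if patience_counter >= params["early_stopping_patience"]:
            break                                # early stopping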
src/benglasummarization/components/prepare_ben_token.py CHANGED

@@ -2,7 +2,7 @@ import pandas as pd
 from pathlib import Path
 from benglasummarization.logging import logger
 from tqdm.notebook import tqdm
-from
+from benglasummarization.entity.config_entity import BanTokenizationConfig
 class BanTokenization:
     def __init__(self, config: BanTokenizationConfig):
         self.config = config
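The one-line fix completes an import that was previously cut off at "from", so the BanTokenizationConfig annotation in __init__ now resolves. The entity itself (src/benglasummarization/entity/config_entity.py) is not part of this diff; a purely hypothetical shape, inferred from the fields used elsewhere in this commit (root_dir in configuration.py, output_file in params.yaml), would be a frozen dataclass:

# Hypothetical reconstruction of BanTokenizationConfig for illustration only;
# the real definition in entity/config_entity.py is not shown in this commit.
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class BanTokenizationConfig:
    root_dir: Path      # created via create_directories([config.root_dir])
    output_file: str    # "combined_text.txt", tracked through params.yaml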
src/benglasummarization/config/configuration.py CHANGED

@@ -30,7 +30,7 @@ class ConfigurationManager:
 
     def get_ben_tokenization_config(self) -> BanTokenizationConfig:
         config = self.config.ban_tokenization
-        params = self.params
+        params = self.params
         create_directories([config.root_dir])
 
         ben_tokenization_config = BanTokenizationConfig(
@@ -45,7 +45,7 @@ class ConfigurationManager:
 
     def get_train_token_config(self) -> BanTokenTrainConfig:
         config = self.config.train_tokenize
-        params = self.params
+        params = self.params
         create_directories([config.root_dir])
 
         train_token_config = BanTokenTrainConfig(
@@ -60,7 +60,7 @@ class ConfigurationManager:
 
     def get_model_trainer_config(self) -> ModelTrainingConfig:
         config = self.config.model_training
-        param = self.params
+        param = self.params
         create_directories([config.root_dir])
         model_trainer_config = ModelTrainingConfig(
             root_dir= config.root_dir,
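All three getters in ConfigurationManager follow the same pattern: read the stage block from config.yaml, keep the params.yaml values at hand, create the stage's root_dir, and return a typed config object that the matching component consumes. The pipeline scripts referenced in dvc.yaml are not shown here; below is a hypothetical sketch of how stage_02_prepare_ben_tok.py would tie the pieces together (the run() method name is an assumption):

# Hypothetical pipeline-stage sketch; stage_02_prepare_ben_tok.py itself is not
# part of this diff, and BanTokenization's entry-point method name is assumed.
from benglasummarization.config.configuration import ConfigurationManager
from benglasummarization.components.prepare_ben_token import BanTokenization
from benglasummarization.logging import logger

STAGE_NAME = "Prepare Bengali Tokenization"

def main() -> None:
    config_manager = ConfigurationManager()
    ban_tokenization_config = config_manager.get_ben_tokenization_config()
    BanTokenization(config=ban_tokenization_config).run()  # assumed method name

if __name__ == "__main__":
    try:
        logger.info(f">>> {STAGE_NAME} started <<<")
        main()
        logger.info(f">>> {STAGE_NAME} completed <<<")
    except Exception as e:
        logger.exception(e)
        raise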