Add dataset creation script
- src/__pycache__/data_utils.cpython-38.pyc +0 -0
- src/__pycache__/dictionary.cpython-38.pyc +0 -0
- src/__pycache__/normalizer.cpython-38.pyc +0 -0
- src/create_dataset.py +136 -0
- src/data_utils.py +5 -8
- src/regexes/__pycache__/__init__.cpython-38.pyc +0 -0
- src/regexes/__pycache__/currency.cpython-38.pyc +0 -0
- src/regexes/__pycache__/email.cpython-38.pyc +0 -0
- src/regexes/__pycache__/latin.cpython-38.pyc +0 -0
- src/regexes/__pycache__/number.cpython-38.pyc +0 -0
- src/regexes/__pycache__/persian.cpython-38.pyc +0 -0
- src/regexes/__pycache__/phone.cpython-38.pyc +0 -0
- src/regexes/__pycache__/punk.cpython-38.pyc +0 -0
- src/regexes/__pycache__/quote.cpython-38.pyc +0 -0
- src/regexes/__pycache__/url.cpython-38.pyc +0 -0
- src/run.sh +37 -18
- src/run_clm_flax.py +9 -8
- src/run_dataset.sh +13 -0
src/__pycache__/data_utils.cpython-38.pyc
CHANGED
Binary files a/src/__pycache__/data_utils.cpython-38.pyc and b/src/__pycache__/data_utils.cpython-38.pyc differ

src/__pycache__/dictionary.cpython-38.pyc
CHANGED
Binary files a/src/__pycache__/dictionary.cpython-38.pyc and b/src/__pycache__/dictionary.cpython-38.pyc differ

src/__pycache__/normalizer.cpython-38.pyc
CHANGED
Binary files a/src/__pycache__/normalizer.cpython-38.pyc and b/src/__pycache__/normalizer.cpython-38.pyc differ
src/create_dataset.py
ADDED
@@ -0,0 +1,136 @@
+import ast
+import logging
+import os
+import sys
+from dataclasses import dataclass, field
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+from typing import Dict, List, Optional, Tuple
+from datasets import load_dataset
+from transformers import (
+    HfArgumentParser,
+)
+from data_utils import (
+    filter_by_lang_regex,
+    filter_by_num_tokens,
+    filter_by_num_sents,
+    filter_by_adv,
+    normalizer
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DataArguments:
+    """
+    Arguments to which dataset we are going to set up.
+    """
+    output_dir: str = field(
+        default=".",
+        metadata={"help": "The output directory where the config will be written."},
+    )
+    dataset_name: str = field(
+        default=None,
+        metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+def main():
+    parser = HfArgumentParser([DataArguments])
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        data_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))[0]
+    else:
+        data_args = parser.parse_args_into_dataclasses()[0]
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    logger.setLevel(logging.INFO)
+    logger.info(f"Preparing the dataset")
+    if data_args.dataset_name is not None:
+        dataset = load_dataset(
+            data_args.dataset_name,
+            data_args.dataset_config_name,
+            cache_dir=data_args.cache_dir,
+            split="train"
+        )
+    else:
+        data_files = {"train": data_args.train_file}
+        extension = data_args.train_file.split(".")[-1]
+        if extension == "txt":
+            extension = "text"
+
+        dataset = load_dataset(
+            extension,
+            data_files=data_files,
+            delimiter="\t",
+            cache_dir=data_args.cache_dir,
+        )
+
+    logger.info(f"dataset: {dataset}")
+
+    def data_preparation(item_dict):
+        if "text" not in item_dict:
+            return None
+
+        text = item_dict["text"]
+
+        status = filter_by_lang_regex(text, ratio=0.75)
+        if not status:
+            return None
+
+        status = filter_by_num_tokens(text, gt=64)
+        if not status:
+            return None
+
+        status = filter_by_num_sents(text, gt=2)
+        if not status:
+            return None
+
+        status = filter_by_adv(text, ratio=50)
+        if not status:
+            return None
+
+        text = normalizer(text)
+        return {"text": text}
+
+    data_dict = []
+    for item in tqdm(dataset, position=0, total=len(dataset)):
+        item = data_preparation(item)
+
+        if item:
+            data_dict.append(item)
+
+    data_df = pd.DataFrame(data_dict)
+
+    logger.info(f"Preparation - [before] consists of {len(dataset)} records!")
+    logger.info(f"Preparation - [after] consists of {len(data_df)} records!")
+
+    train, test = train_test_split(data_df, test_size=0.01, random_state=101)
+
+    train = train.reset_index(drop=True)
+    test = test.reset_index(drop=True)
+
+    logger.info(f"Preparation of [train] set consists of {len(train)} records!")
+    logger.info(f"Preparation of [test] set consists of {len(test)} records!")
+
+    os.makedirs(data_args.output_dir, exist_ok=True)
+    train.to_csv(os.path.join(data_args.output_dir, "train.csv"), sep="\t", encoding="utf-8", index=False)
+    test.to_csv(os.path.join(data_args.output_dir, "test.csv"), sep="\t", encoding="utf-8", index=False)
+    logger.info(f"Data saved here {data_args.output_dir}")
+
+if __name__ == '__main__':
+    main()
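Note: create_dataset.py only imports the filter helpers from data_utils; apart from normalizer, their implementations are not part of this commit. A minimal sketch of the assumed interface is below — the signatures come from the imports above, but the bodies are illustrative placeholders, not the actual code in src/data_utils.py.

# Hedged sketch of the data_utils helpers used by create_dataset.py.
# Signatures follow the imports above; bodies are placeholders, not the real ones.
import re

# Assumption: the language filter checks the share of Persian-script characters.
_PERSIAN = re.compile(r"[\u0600-\u06FF]")

def filter_by_lang_regex(text, ratio=0.75):
    """Keep a record when at least `ratio` of its non-space characters match the target script."""
    chars = re.sub(r"\s+", "", text)
    return bool(chars) and len(_PERSIAN.findall(chars)) / len(chars) >= ratio

def filter_by_num_tokens(text, gt=64):
    """Keep a record with more than `gt` whitespace-separated tokens."""
    return len(text.split()) > gt

def filter_by_num_sents(text, gt=2):
    """Keep a record with more than `gt` sentences (naive split on . ! ? ؟)."""
    return len([s for s in re.split(r"[.!?؟]+", text) if s.strip()]) > gt

def filter_by_adv(text, ratio=50):
    """Keep a record when the count of ad-like tokens stays below `ratio`
    (mirrors the `length_add < ratio` check visible in the data_utils diff below)."""
    ad_words = {"تبلیغ", "تبلیغات"}  # placeholder keyword list
    length_add = sum(1 for tok in text.split() if tok in ad_words)
    return length_add < ratio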
src/data_utils.py
CHANGED
@@ -32,14 +32,11 @@ def filter_by_adv(text, ratio=50):
     return length_add < ratio
 
 
-
-
+def normalizer(text, do_lowercase=False):
+    text = normalize(text)
 
-
-
+    if do_lowercase:
+        text = text.lower()
 
-
-def normalizer(example):
-    example["text"] = normalize(example["text"])
-    return example
+    return text
 
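Note: this refactor changes normalizer from a datasets.map-style function (example dict in, dict out) to a plain string function, which is how create_dataset.py now calls it. If it were still needed inside a dataset.map call, a small adapter would be required; a sketch:

# Old signature could be mapped directly:
#     dataset = dataset.map(normalizer)
# New text -> text signature needs a thin wrapper for the same mapping:
from data_utils import normalizer  # new signature: normalizer(text, do_lowercase=False)

def normalize_example(example):
    """Adapter so the string-based normalizer can still be used with datasets.map."""
    example["text"] = normalizer(example["text"])
    return example

# dataset = dataset.map(normalize_example)  # equivalent of the previous dataset.map(normalizer)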
src/regexes/__pycache__/__init__.cpython-38.pyc
CHANGED
Binary files a/src/regexes/__pycache__/__init__.cpython-38.pyc and b/src/regexes/__pycache__/__init__.cpython-38.pyc differ

src/regexes/__pycache__/currency.cpython-38.pyc
CHANGED
Binary files a/src/regexes/__pycache__/currency.cpython-38.pyc and b/src/regexes/__pycache__/currency.cpython-38.pyc differ

src/regexes/__pycache__/email.cpython-38.pyc
CHANGED
Binary files a/src/regexes/__pycache__/email.cpython-38.pyc and b/src/regexes/__pycache__/email.cpython-38.pyc differ

src/regexes/__pycache__/latin.cpython-38.pyc
CHANGED
Binary files a/src/regexes/__pycache__/latin.cpython-38.pyc and b/src/regexes/__pycache__/latin.cpython-38.pyc differ

src/regexes/__pycache__/number.cpython-38.pyc
CHANGED
Binary files a/src/regexes/__pycache__/number.cpython-38.pyc and b/src/regexes/__pycache__/number.cpython-38.pyc differ

src/regexes/__pycache__/persian.cpython-38.pyc
CHANGED
Binary files a/src/regexes/__pycache__/persian.cpython-38.pyc and b/src/regexes/__pycache__/persian.cpython-38.pyc differ

src/regexes/__pycache__/phone.cpython-38.pyc
CHANGED
Binary files a/src/regexes/__pycache__/phone.cpython-38.pyc and b/src/regexes/__pycache__/phone.cpython-38.pyc differ

src/regexes/__pycache__/punk.cpython-38.pyc
CHANGED
Binary files a/src/regexes/__pycache__/punk.cpython-38.pyc and b/src/regexes/__pycache__/punk.cpython-38.pyc differ

src/regexes/__pycache__/quote.cpython-38.pyc
CHANGED
Binary files a/src/regexes/__pycache__/quote.cpython-38.pyc and b/src/regexes/__pycache__/quote.cpython-38.pyc differ

src/regexes/__pycache__/url.cpython-38.pyc
CHANGED
Binary files a/src/regexes/__pycache__/url.cpython-38.pyc and b/src/regexes/__pycache__/url.cpython-38.pyc differ

src/run.sh
CHANGED
@@ -3,17 +3,17 @@
 export LC_ALL=C.UTF-8
 export LANG=C.UTF-8
 
-
-export OUTPUT_DIR=/home/
-export MODEL_TYPE=gpt2
-export CONFIG_NAME=/home/
-export TOKENIZER_NAME=/home/
+export MODEL_NAME_OR_PATH=/home/m3hrdadfi/code/gpt2-medium-persian
+export OUTPUT_DIR=/home/m3hrdadfi/code/gpt2-medium-persian
+# export MODEL_TYPE=gpt2
+# export CONFIG_NAME=/home/m3hrdadfi/code/gpt2-medium-persian
+# export TOKENIZER_NAME=/home/m3hrdadfi/code/gpt2-medium-persian
 
-
-
-#export TEST_FILE=/home/
-export DATASET_NAME=oscar
-export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
+export TRAIN_FILE=/home/m3hrdadfi/data/train.csv
+export VALIDATION_FILE=/home/m3hrdadfi/data/test.csv
+#export TEST_FILE=/home/m3hrdadfi/code/data/...csv
+# export DATASET_NAME=oscar
+# export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
 export MAX_SEQUENCE_LENGTH=512
 
 #export MAX_TRAIN_SAMPLE=5000
@@ -21,8 +21,8 @@ export MAX_SEQUENCE_LENGTH=512
 
 export PER_DEVICE_TRAIN_BATCH_SIZE=16
 export PER_DEVICE_EVAL_BATCH_SIZE=16
-export NUM_TRAIN_EPOCHS=
-export LEARNING_RATE=
+export NUM_TRAIN_EPOCHS=9.0
+export LEARNING_RATE=8e-4
 export WARMUP_STEPS=5000
 export LOGGING_STEPS=500
 export EVAL_STEPS=2500
@@ -30,11 +30,9 @@ export SAVE_STEPS=2500
 
 python src/run_clm_flax.py \
     --output_dir="$OUTPUT_DIR" \
-    --
-    --
-    --
-    --dataset_name="$DATASET_NAME" \
-    --dataset_config_name="$DATASET_CONFIG_NAME" \
+    --model_name_or_path="$MODEL_NAME_OR_PATH" \
+    --train_file="$TRAIN_FILE" \
+    --validation_file="$VALIDATION_FILE" \
     --block_size=$MAX_SEQUENCE_LENGTH \
    --per_device_train_batch_size=$PER_DEVICE_TRAIN_BATCH_SIZE \
     --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \
@@ -47,4 +45,25 @@ python src/run_clm_flax.py \
     --do_train \
     --do_eval \
     --overwrite_output_dir \
-    --push_to_hub
+    --push_to_hub
+
+# python src/run_clm_flax.py \
+#     --output_dir="$OUTPUT_DIR" \
+#     --model_type="$MODEL_TYPE" \
+#     --config_name="$CONFIG_NAME" \
+#     --tokenizer_name="$TOKENIZER_NAME" \
+#     --dataset_name="$DATASET_NAME" \
+#     --dataset_config_name="$DATASET_CONFIG_NAME" \
+#     --block_size=$MAX_SEQUENCE_LENGTH \
+#     --per_device_train_batch_size=$PER_DEVICE_TRAIN_BATCH_SIZE \
+#     --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \
+#     --num_train_epochs=$NUM_TRAIN_EPOCHS \
+#     --learning_rate=$LEARNING_RATE \
+#     --warmup_steps=$WARMUP_STEPS \
+#     --logging_step=$LOGGING_STEPS \
+#     --eval_steps=$EVAL_STEPS \
+#     --save_steps=$SAVE_STEPS \
+#     --do_train \
+#     --do_eval \
+#     --overwrite_output_dir \
+#     --push_to_hub
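Note: a quick back-of-the-envelope check of the settings above, assuming 8 local devices (e.g. a single TPU v3-8 host; the device count is not part of this diff):

# Rough throughput numbers implied by run.sh (device count is an assumption).
num_devices = 8
per_device_train_batch_size = 16
block_size = 512

effective_batch_size = per_device_train_batch_size * num_devices  # 128 sequences per step
tokens_per_step = effective_batch_size * block_size               # 65,536 tokens per step
print(effective_batch_size, tokens_per_step)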
src/run_clm_flax.py
CHANGED
@@ -358,14 +358,15 @@ def main():
 
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
-    logger.info("Preprocessing the dataset")
-    dataset = raw_dataset.filter(lambda example: filter_by_lang_regex(example["text"], ratio=0.75))
-    dataset = dataset.filter(lambda example: filter_by_num_tokens(example["text"], gt=64))
-    dataset = dataset.filter(lambda example: filter_by_num_sents(example["text"], gt=2))
-    dataset = dataset.filter(lambda example: filter_by_adv(example["text"], ratio=50))
-    dataset = dataset.map(normalizer)
-    logger.info(f"Preprocessed dataset kept {len(dataset)} out of {len(raw_dataset)}")
-
+    # logger.info("Preprocessing the dataset")
+    # dataset = raw_dataset.filter(lambda example: filter_by_lang_regex(example["text"], ratio=0.75))
+    # dataset = dataset.filter(lambda example: filter_by_num_tokens(example["text"], gt=64))
+    # dataset = dataset.filter(lambda example: filter_by_num_sents(example["text"], gt=2))
+    # dataset = dataset.filter(lambda example: filter_by_adv(example["text"], ratio=50))
+    # dataset = dataset.map(normalizer)
+    # logger.info(f"Preprocessed dataset kept {len(dataset)} out of {len(raw_dataset)}")
+    dataset = raw_dataset
+
     # Load pretrained model and tokenizer
 
     # Distributed training:
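Note: with the on-the-fly filtering commented out, run_clm_flax.py is expected to receive the already-cleaned data via --train_file/--validation_file (see run.sh above). One detail worth flagging: create_dataset.py writes train.csv and test.csv with a tab separator, so a CSV loader needs the delimiter passed explicitly. A hedged sketch of how raw_dataset would be built, assuming the standard datasets CSV builder is used; the paths are the ones exported in run.sh:

# How the tab-separated files written by create_dataset.py would be loaded.
from datasets import load_dataset

raw_dataset = load_dataset(
    "csv",
    data_files={
        "train": "/home/m3hrdadfi/data/train.csv",
        "validation": "/home/m3hrdadfi/data/test.csv",
    },
    delimiter="\t",  # the files are tab-separated despite the .csv extension
)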
src/run_dataset.sh
ADDED
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+export LC_ALL=C.UTF-8
+export LANG=C.UTF-8
+
+export OUTPUT_DIR=/home/m3hrdadfi/data/
+export DATASET_NAME=oscar
+export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
+
+python src/create_dataset.py \
+    --output_dir="$OUTPUT_DIR" \
+    --dataset_name="$DATASET_NAME" \
+    --dataset_config_name="$DATASET_CONFIG_NAME"
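Note: after this script finishes, the output can be sanity-checked quickly. A hedged example; the paths come from OUTPUT_DIR above, and the tab separator matches what create_dataset.py uses when writing the files:

# Quick sanity check of the files produced by src/create_dataset.py.
import pandas as pd

train = pd.read_csv("/home/m3hrdadfi/data/train.csv", sep="\t", encoding="utf-8")
test = pd.read_csv("/home/m3hrdadfi/data/test.csv", sep="\t", encoding="utf-8")

print(f"train records: {len(train)}, test records: {len(test)}")
print(train["text"].str.len().describe())  # rough length distribution of the cleaned texts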