## IO
save_data: en-zh/data_spm
overwrite: true
seed: 1234
report_every: 100
valid_metrics: ["BLEU"]
tensorboard: true
tensorboard_log_dir: tensorboard
### Vocab
src_vocab: en-zh/src.eole.vocab
tgt_vocab: en-zh/tgt.eole.vocab
src_vocab_size: 32000
tgt_vocab_size: 32000
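# vocab_size_multiple pads the effective vocab up to a multiple of 8 so the
# embedding and output-projection dims stay aligned with GPU tensor-core tiles.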
vocab_size_multiple: 8
share_vocab: false
n_sample: 0
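# The training corpus streams straight from the Hugging Face Hub via hf:// paths;
# the sco file carries the per-pair quality scores shipped with the quickmt
# dataset (how path_sco is consumed depends on the eole version in use).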
data:
    corpus_1:
        path_src: hf://quickmt/quickmt-train-zh-en/en
        path_tgt: hf://quickmt/quickmt-train-zh-en/zh
        path_sco: hf://quickmt/quickmt-train-zh-en/sco
    valid:
        path_src: en-zh/dev.eng
        path_tgt: en-zh/dev.zho
transforms: [sentencepiece, filtertoolong]
transforms_configs:
    sentencepiece:
        src_subword_model: "en-zh/src.spm.model"
        tgt_subword_model: "en-zh/tgt.spm.model"
    filtertoolong:
        src_seq_length: 256
        tgt_seq_length: 256
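# sentencepiece applies the subword models above on the fly at data-loading time;
# filtertoolong then drops any pair longer than 256 subword tokens on either side.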
training:
    # Run configuration
    model_path: en-zh/model
    keep_checkpoint: 4
    save_checkpoint_steps: 2000
    train_steps: 100000
    valid_steps: 2000
    # Train on a single GPU
    world_size: 1
    gpu_ranks: [0]
    # Batching
    batch_type: "tokens"
    batch_size: 16384
    valid_batch_size: 16384
    batch_size_multiple: 8
    accum_count: [8]
    accum_steps: [0]
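    # Effective batch: 16384 tokens x 8 gradient-accumulation steps ≈ 131k tokens
    # per optimizer update (batch_type "tokens" counts tokens, not sentences).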
    # Optimizer & Compute
    compute_dtype: "bf16"
    optim: "pagedadamw8bit"
    learning_rate: 2.0
    warmup_steps: 10000
    decay_method: "noam"
    adam_beta2: 0.998
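    # pagedadamw8bit is the paged 8-bit AdamW from bitsandbytes, so that package
    # must be installed. With "noam" decay, learning_rate is a scale factor on
    # hidden_size^-0.5 * min(step^-0.5, step * warmup_steps^-1.5), not a literal
    # LR; here the peak at step 10000 is 2.0 / (sqrt(1024) * sqrt(10000)) = 6.25e-4.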
    # Data loading
    bucket_size: 128000
    num_workers: 8
    prefetch_factor: 100
    # Hyperparams
    dropout_steps: [0]
    dropout: [0.1]
    attention_dropout: [0.1]
    max_grad_norm: 2
    label_smoothing: 0.1
    average_decay: 0.0001
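    # A non-zero average_decay maintains an exponential moving average of the
    # parameters, smoothing the weights that end up in saved checkpoints.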
    param_init_method: xavier_uniform
    normalization: "tokens"
model:
    architecture: "transformer"
    layer_norm: standard
    share_embeddings: false
    share_decoder_embeddings: true
    add_ffnbias: true
    mlp_activation_fn: gelu
    add_estimator: false
    add_qkvbias: false
    norm_eps: 1e-6
    hidden_size: 1024
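    # Deep-encoder/shallow-decoder split (8 encoder vs 2 decoder layers): most of
    # the compute moves into the parallelizable encoder, which speeds up
    # autoregressive decoding at little cost in quality.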
    encoder:
        layers: 8
    decoder:
        layers: 2
    heads: 16
    transformer_ff: 4096
    embeddings:
        word_vec_size: 1024
        position_encoding_type: "SinusoidalInterleaved"
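# Typical launch, assuming this file is saved as eole-config.yaml and the
# standard eole CLI (verify the flag spelling against your installed version):
#   eole train -config eole-config.yaml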