---
# Hyperparameter configuration stored under the single top-level key `cfg`.
#
# NOTE(review): this file was reconstructed from a whitespace-collapsed copy
# in which every key sat on one line. The nesting below (tokenizer, data,
# optim, optim.sched) is inferred from key order -- confirm it against the
# schema of the tool that consumes this file.
cfg:
  # Batch sizes and model-parallel layout.
  micro_batch_size: 20
  global_batch_size: 8000
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1

  # Model architecture.
  encoder_seq_length: 512
  max_position_embeddings: 512
  num_layers: 24
  hidden_size: 1024
  ffn_hidden_size: 4096
  num_attention_heads: 16
  init_method_std: 0.02
  hidden_dropout: 0.1
  kv_channels: null
  apply_query_key_layer_scaling: true
  layernorm_epsilon: 1.0e-05
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  bert_binary_head: true

  tokenizer:
    library: huggingface
    type: KBLab/wordpiece-32k-pretok-small_data-tokenizer
    model: null
    vocab_file: null
    merge_file: null

  # Mixed-precision and gradient-handling switches.
  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: false
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: false

  # Miscellaneous.
  seed: 666
  use_cpu_initialization: false
  onnx_safe: false
  gradient_as_bucket_view: true

  # Activation checkpointing (every option is unset).
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: false

  data:
    # Flat list alternating a number and a dataset path -- presumably
    # (weight, path-prefix) pairs; verify against the data loader.
    # `$PID` is kept verbatim; YAML itself performs no variable expansion.
    data_prefix:
      - 1
      - /project/scratch/$PID/data/wordpiece-32k-pretok-small_data/wikipedia-wordpiece-32k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/wordpiece-32k-pretok-small_data/edepos_html-wordpiece-32k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/wordpiece-32k-pretok-small_data/oscar-wordpiece-32k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/wordpiece-32k-pretok-small_data/kw3-2017-wordpiece-32k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/wordpiece-32k-pretok-small_data/issues-wordpiece-32k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/wordpiece-32k-pretok-small_data/mc4-wordpiece-32k-pretok-small_data_text_sentence
    index_mapping_dir: /project/scratch/$PID/data/wordpiece-32k-pretok-small_data/npy_files/
    data_impl: mmap
    # Quoted so the comma-separated digits always load as a string.
    splits_string: '980,10,10'
    seq_length: 512
    skip_warmup: true
    num_workers: 32
    dataloader_type: single
    reset_position_ids: false
    reset_attention_mask: false
    eod_mask_loss: false
    masked_lm_prob: 0.15
    short_seq_prob: 0.1

  optim:
    name: fused_adam
    lr: 0.0006
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 500
      constant_steps: 500
      min_lr: 2.0e-05

  # NOTE(review): in the collapsed source `precision` followed the scheduler
  # keys; it is placed at `cfg` level here on the assumption that it is a
  # model-level setting rather than a scheduler option -- confirm.
  precision: 16