Ayushk44's picture
Add files using upload-large-folder tool
213e2b5 verified
raw
history blame
37.7 kB
# Top-level run settings.
# NOTE(review): key names match an OLMo-style training config — confirm the
# consuming loader before renaming or retyping any of them.
run_name: OLMo-99M-17values
seed: 6198
# Explicit nulls/booleans are written canonically (`null`, `false`).
epoch: null
dry_run: false
# Model settings. Every key below is a child of `model`; the two-space
# indentation is what makes it so. At column 0 these keys would be parsed as
# top-level entries and `model` itself would be null.
model:
  d_model: 512
  n_heads: 8
  n_kv_heads: null
  clip_qkv: null
  n_layers: 16
  # NOTE(review): mlp_ratio * d_model = 4 * 512 = 2048, which differs from
  # mlp_hidden_size (2560) — confirm which of the two the consumer honours.
  mlp_ratio: 4
  mlp_hidden_size: 2560
  activation_type: swiglu
  block_type: sequential
  block_group_size: 1
  alibi: false
  alibi_bias_max: 8.0
  rope: true
  rope_full_precision: true
  rope_theta: 10000
  flash_attention: false
  attention_dropout: 0.0
  multi_query_attention: false
  attention_layer_norm: false
  residual_dropout: 0.0
  embedding_dropout: 0.0
  embedding_layer_norm: false
  layer_norm_type: rms
  layer_norm_with_affine: true
  layer_norm_eps: 1.0e-05
  attention_layer_norm_with_affine: false
  max_sequence_length: 2048
  include_bias: false
  bias_for_layer_norm: false
  scale_logits: false
  # NOTE(review): eos_token_id (50277) equals vocab_size, i.e. one past the
  # last id if token ids are 0-based; it is still below embedding_size
  # (50304). Confirm this is intentional.
  vocab_size: 50277
  embedding_size: 50304
  weight_tying: false
  eos_token_id: 50277
  pad_token_id: 1
  init_device: meta
  init_fn: mitchell
  init_std: 0.02
  init_cutoff_factor: null
  precision: amp_bf16
  scale_emb_init: false
  emb_init_std: null
  norm_after: false
  linear_type: values17
  num_trilm_matrix_scales: 1
# Optimizer settings. The keys are nested under `optimizer` so that generic
# names such as `name` and `eps` stay scoped to this section instead of
# becoming (possibly duplicated) top-level keys.
optimizer:
  name: adamw
  learning_rate: 0.0004
  weight_decay: 0.1
  # Two-element sequence, written in the flush block-sequence style.
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-05
  # NOTE(review): both `no_decay_norm_and_bias` and `decay_norm_and_bias` are
  # present — confirm which one the consumer reads.
  no_decay_norm_and_bias: null
  selective_updates: false
  decay_norm_and_bias: false
  decay_embeddings: false
  metrics_log_interval: 10
  record_update_metrics: false
# Learning-rate scheduler settings. The keys are nested under `scheduler` so
# that `name` stays scoped to this section rather than being a top-level key.
scheduler:
  name: cosine_with_warmup
  # `units: steps` — presumably t_warmup / t_max are expressed in this unit;
  # verify against the consumer.
  units: steps
  t_warmup: 375
  t_max: null
  alpha_f: 0.1
  grad_clip_warmup_steps: null
  grad_clip_warmup_factor: null
  warmup_min_lr: null
data:
paths:
- ../slimp/train/all_combined/combined_1/redpajamaarxiv_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamabook_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamabook_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamac4_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamac4_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamac4_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamac4_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamac4_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamac4_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamac4_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamac4_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamacommoncrawl_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamacommoncrawl_10.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamacommoncrawl_11.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamacommoncrawl_12.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamacommoncrawl_13.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamacommoncrawl_14.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamacommoncrawl_15.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamacommoncrawl_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamacommoncrawl_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamacommoncrawl_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamacommoncrawl_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamacommoncrawl_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamacommoncrawl_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamacommoncrawl_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamacommoncrawl_9.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamagithub_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamagithub_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamastackexchange_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamawikipedia_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_1/redpajamawikipedia_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamaarxiv_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamabook_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamabook_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamac4_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamac4_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamac4_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamac4_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamac4_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamac4_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamac4_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamac4_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamacommoncrawl_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamacommoncrawl_10.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamacommoncrawl_11.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamacommoncrawl_12.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamacommoncrawl_13.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamacommoncrawl_14.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamacommoncrawl_15.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamacommoncrawl_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamacommoncrawl_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamacommoncrawl_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamacommoncrawl_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamacommoncrawl_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamacommoncrawl_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamacommoncrawl_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamacommoncrawl_9.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamagithub_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamagithub_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamastackexchange_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamawikipedia_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_2/redpajamawikipedia_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamaarxiv_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamabook_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamabook_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamac4_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamac4_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamac4_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamac4_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamac4_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamac4_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamac4_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamac4_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamacommoncrawl_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamacommoncrawl_10.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamacommoncrawl_11.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamacommoncrawl_12.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamacommoncrawl_13.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamacommoncrawl_14.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamacommoncrawl_15.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamacommoncrawl_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamacommoncrawl_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamacommoncrawl_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamacommoncrawl_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamacommoncrawl_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamacommoncrawl_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamacommoncrawl_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamacommoncrawl_9.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamagithub_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamagithub_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamastackexchange_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamawikipedia_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_3/redpajamawikipedia_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamaarxiv_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamabook_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamabook_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamac4_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamac4_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamac4_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamac4_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamac4_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamac4_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamac4_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamac4_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamacommoncrawl_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamacommoncrawl_10.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamacommoncrawl_11.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamacommoncrawl_12.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamacommoncrawl_13.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamacommoncrawl_14.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamacommoncrawl_15.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamacommoncrawl_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamacommoncrawl_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamacommoncrawl_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamacommoncrawl_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamacommoncrawl_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamacommoncrawl_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamacommoncrawl_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamacommoncrawl_9.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamagithub_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamagithub_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamastackexchange_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamawikipedia_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_4/redpajamawikipedia_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamaarxiv_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamabook_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamabook_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamac4_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamac4_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamac4_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamac4_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamac4_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamac4_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamac4_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamac4_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamacommoncrawl_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamacommoncrawl_10.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamacommoncrawl_11.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamacommoncrawl_12.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamacommoncrawl_13.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamacommoncrawl_14.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamacommoncrawl_15.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamacommoncrawl_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamacommoncrawl_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamacommoncrawl_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamacommoncrawl_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamacommoncrawl_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamacommoncrawl_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamacommoncrawl_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamacommoncrawl_9.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamagithub_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamagithub_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamastackexchange_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamawikipedia_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_5/redpajamawikipedia_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamaarxiv_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamabook_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamabook_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamac4_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamac4_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamac4_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamac4_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamac4_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamac4_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamac4_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamac4_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamacommoncrawl_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamacommoncrawl_10.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamacommoncrawl_11.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamacommoncrawl_12.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamacommoncrawl_13.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamacommoncrawl_14.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamacommoncrawl_15.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamacommoncrawl_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamacommoncrawl_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamacommoncrawl_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamacommoncrawl_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamacommoncrawl_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamacommoncrawl_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamacommoncrawl_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamacommoncrawl_9.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamagithub_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamagithub_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamastackexchange_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamawikipedia_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_6/redpajamawikipedia_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamaarxiv_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamabook_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamabook_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamac4_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamac4_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamac4_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamac4_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamac4_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamac4_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamac4_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamac4_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamacommoncrawl_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamacommoncrawl_10.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamacommoncrawl_11.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamacommoncrawl_12.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamacommoncrawl_13.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamacommoncrawl_14.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamacommoncrawl_15.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamacommoncrawl_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamacommoncrawl_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamacommoncrawl_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamacommoncrawl_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamacommoncrawl_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamacommoncrawl_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamacommoncrawl_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamacommoncrawl_9.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamagithub_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamagithub_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamastackexchange_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamawikipedia_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_7/redpajamawikipedia_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamaarxiv_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamabook_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamabook_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamac4_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamac4_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamac4_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamac4_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamac4_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamac4_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamac4_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamac4_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamacommoncrawl_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamacommoncrawl_10.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamacommoncrawl_11.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamacommoncrawl_12.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamacommoncrawl_13.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamacommoncrawl_14.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamacommoncrawl_15.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamacommoncrawl_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamacommoncrawl_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamacommoncrawl_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamacommoncrawl_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamacommoncrawl_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamacommoncrawl_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamacommoncrawl_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamacommoncrawl_9.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamagithub_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamagithub_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamastackexchange_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamawikipedia_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_8/redpajamawikipedia_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamaarxiv_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamabook_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamabook_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamac4_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamac4_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamac4_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamac4_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamac4_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamac4_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamac4_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamac4_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamacommoncrawl_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamacommoncrawl_10.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamacommoncrawl_11.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamacommoncrawl_12.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamacommoncrawl_13.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamacommoncrawl_14.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamacommoncrawl_15.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamacommoncrawl_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamacommoncrawl_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamacommoncrawl_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamacommoncrawl_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamacommoncrawl_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamacommoncrawl_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamacommoncrawl_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamacommoncrawl_9.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamagithub_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamagithub_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamastackexchange_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamawikipedia_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_9/redpajamawikipedia_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamaarxiv_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamabook_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamabook_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamac4_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamac4_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamac4_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamac4_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamac4_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamac4_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamac4_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamac4_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamacommoncrawl_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamacommoncrawl_10.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamacommoncrawl_11.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamacommoncrawl_12.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamacommoncrawl_13.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamacommoncrawl_14.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamacommoncrawl_15.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamacommoncrawl_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamacommoncrawl_3.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamacommoncrawl_4.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamacommoncrawl_5.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamacommoncrawl_6.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamacommoncrawl_7.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamacommoncrawl_8.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamacommoncrawl_9.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamagithub_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamagithub_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamastackexchange_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamawikipedia_1.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
- ../slimp/train/all_combined/combined_10/redpajamawikipedia_2.jsonl.gz.tokenized.spectra.out.npy/0_00000.npy
memmap_dtype: uint16
datasets: null
label_mask_paths: null
pad_direction: right
generate_attention_mask: false
generate_doc_lengths: false
num_workers: 0
drop_last: true
pin_memory: true
prefetch_factor: 16
persistent_workers: true
timeout: 0
seed: null
instance_filter: null
# Dataloader resume and evaluation cadence settings.
# NOTE(review): the meaning of each key is inferred from its name; confirm
# against the config schema of the trainer that consumes this file.
restore_dataloader: true
fast_forward_batches: null
# Explicit empty list: no evaluators are configured in this file.
evaluators: []
# NOTE(review): presumably has no effect while `evaluators` is empty; confirm.
eval_interval: 1000
# Tokenizer settings. The two keys are nested under `tokenizer` so they load
# as its fields; written flush at column 0, `tokenizer` parses as null and
# `identifier` / `truncate_direction` become unrelated top-level keys.
tokenizer:
  # Relative path to a tokenizer JSON file.
  # NOTE(review): confirm which directory this is resolved against.
  identifier: ../spectra_tokenizer/tokenizer.json
  truncate_direction: right
# Checkpoint saving and loading.
# NOTE(review): key meanings in this stanza are inferred from their names;
# confirm against the config schema of the trainer that consumes this file.
save_folder: checkpoints/olmo-99M-17values/
remote_save_folder: null
canceled_check_interval: 50
save_interval: 1000
save_interval_unsharded: 10000
save_interval_ephemeral: null
save_num_checkpoints_to_keep: 4
# NOTE(review): -1 is presumably a "no limit" sentinel; confirm.
save_num_unsharded_checkpoints_to_keep: -1
save_overwrite: true
force_save_unsharded: false
no_pre_train_checkpoint: false
# No checkpoint path is given here (null).
load_path: null
load_path_sharded_checkpointer: null
try_load_latest_save: false
reset_optimizer_state: false
reset_trainer_state: false
sharded_checkpointer: torch_legacy
new_style_checkpoints: null
# Run length. NOTE(review): no unit is written here; presumably steps, to
# match `units: steps` in the scheduler section. Confirm.
max_duration: 150000
# Batch sizing. NOTE(review): 1024 / 8 = 128, so these values are mutually
# consistent only for 128 devices at grad-accum 1; the per-device values look
# derived at launch time. Confirm before reusing on different hardware.
global_train_batch_size: 1024
device_train_batch_size: 8
device_train_microbatch_size: 8
device_eval_batch_size: 8
# NOTE(review): -1 is presumably a "use all batches" sentinel; confirm.
eval_subset_num_batches: -1
eval_on_load: false
device_train_grad_accum: 1
max_grad_norm: 1.0
max_grad_norm_ratio: null
# NOTE(review): `precision` also appears among the fsdp settings later in this
# file with value `pure`. Make sure the two are not at the same nesting level,
# otherwise one silently shadows the other when the file is parsed.
precision: amp_bf16
# No wandb settings are given (null).
wandb: null
# Speed-monitor settings. The two keys are nested under `speed_monitor` so
# they load as its fields; written flush at column 0, `speed_monitor` parses
# as null and both keys become unrelated top-level entries.
speed_monitor:
  window_size: 20
  gpu_flops_available: null
# Logging, garbage-collection and distribution settings.
# NOTE(review): key meanings are inferred from their names; confirm against
# the config schema of the trainer that consumes this file.
console_log_interval: 1
gen1_gc_interval: 1
compile: null
# Selects fsdp; the `fsdp` settings follow this key and `ddp` is null.
distributed_strategy: fsdp
# FSDP settings (`distributed_strategy` in this file is set to fsdp).
# The keys are nested under `fsdp` so they load as its fields; written flush
# at column 0, `fsdp` parses as null and `precision` below collides with the
# top-level `precision` key, silently overriding it in most parsers.
fsdp:
  use_orig_params: true
  sharding_strategy: _HYBRID_SHARD_ZERO2
  wrapping_strategy: null
  # FSDP-level precision; a separate setting from the top-level
  # `precision: amp_bf16`.
  precision: pure
  # NOTE(review): left null while a hybrid sharding strategy is selected;
  # presumably the trainer picks a default. Confirm.
  hybrid_sharding_num_model_replicas: null
# Remaining run-control, loss, profiling and cache settings.
# NOTE(review): key meanings are inferred from their names; confirm against
# the config schema of the trainer that consumes this file.
# No ddp settings are given; `distributed_strategy` in this file is fsdp.
ddp: null
softmax_auxiliary_loss: false
# NOTE(review): presumably unused while softmax_auxiliary_loss is false.
auxiliary_loss_multiplier: 0.0001
time_limit: null
extra_steps_after_cancel: 10
early_stopping_factor: null
save_data_indices: true
python_profiling: false
torch_profiling: false
# 10 above `max_duration` (150000) set earlier in this file.
# NOTE(review): confirm this margin is intentional.
stop_at: 150010
stop_after: null
activation_checkpointing: null
fused_loss: null
hf_datasets_cache_dir: null
module_outputs_save_steps: null