|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.1968503937007874, |
|
"eval_steps": 500, |
|
"global_step": 300, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 9.97353458404541, |
|
"learning_rate": 2.424749163879599e-05, |
|
"loss": 2.1561, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 10.946741104125977, |
|
"learning_rate": 2.3411371237458197e-05, |
|
"loss": 1.6451, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 10.761913299560547, |
|
"learning_rate": 2.2575250836120402e-05, |
|
"loss": 1.4919, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 15.36636734008789, |
|
"learning_rate": 2.173913043478261e-05, |
|
"loss": 1.5464, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 9.350459098815918, |
|
"learning_rate": 2.090301003344482e-05, |
|
"loss": 1.5388, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 10.499181747436523, |
|
"learning_rate": 2.0066889632107023e-05, |
|
"loss": 1.5963, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 10.65820598602295, |
|
"learning_rate": 1.923076923076923e-05, |
|
"loss": 1.5128, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 9.002087593078613, |
|
"learning_rate": 1.8394648829431436e-05, |
|
"loss": 1.609, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 8.280915260314941, |
|
"learning_rate": 1.7558528428093644e-05, |
|
"loss": 1.4611, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 8.664236068725586, |
|
"learning_rate": 1.6722408026755853e-05, |
|
"loss": 1.4765, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 8.205462455749512, |
|
"learning_rate": 1.588628762541806e-05, |
|
"loss": 1.3903, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 8.982340812683105, |
|
"learning_rate": 1.5050167224080269e-05, |
|
"loss": 1.3134, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 7.957040786743164, |
|
"learning_rate": 1.4214046822742474e-05, |
|
"loss": 1.3256, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 8.92203426361084, |
|
"learning_rate": 1.3377926421404682e-05, |
|
"loss": 1.5535, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 6.650272369384766, |
|
"learning_rate": 1.254180602006689e-05, |
|
"loss": 1.5439, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 12.875391006469727, |
|
"learning_rate": 1.1705685618729099e-05, |
|
"loss": 1.3592, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 11.99698257446289, |
|
"learning_rate": 1.0869565217391305e-05, |
|
"loss": 1.4613, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 6.687454700469971, |
|
"learning_rate": 1.0033444816053512e-05, |
|
"loss": 1.2539, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 7.108691692352295, |
|
"learning_rate": 9.197324414715718e-06, |
|
"loss": 1.4818, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 7.420890808105469, |
|
"learning_rate": 8.361204013377926e-06, |
|
"loss": 1.332, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 7.554871082305908, |
|
"learning_rate": 7.5250836120401346e-06, |
|
"loss": 1.3705, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 8.120746612548828, |
|
"learning_rate": 6.688963210702341e-06, |
|
"loss": 1.3652, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 7.490564823150635, |
|
"learning_rate": 5.852842809364549e-06, |
|
"loss": 1.3594, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 9.032505989074707, |
|
"learning_rate": 5.016722408026756e-06, |
|
"loss": 1.2791, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 7.20292329788208, |
|
"learning_rate": 4.180602006688963e-06, |
|
"loss": 1.4569, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 9.031627655029297, |
|
"learning_rate": 3.3444816053511705e-06, |
|
"loss": 1.3965, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 7.822471618652344, |
|
"learning_rate": 2.508361204013378e-06, |
|
"loss": 1.418, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 15.259939193725586, |
|
"learning_rate": 1.6722408026755853e-06, |
|
"loss": 1.3933, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 9.465089797973633, |
|
"learning_rate": 8.361204013377926e-07, |
|
"loss": 1.3814, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 6.676379680633545, |
|
"learning_rate": 0.0, |
|
"loss": 1.1996, |
|
"step": 300 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 300, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"total_flos": 3315779135078400.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|