{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1968503937007874, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 9.97353458404541, "learning_rate": 2.424749163879599e-05, "loss": 2.1561, "step": 10 }, { "epoch": 0.01, "grad_norm": 10.946741104125977, "learning_rate": 2.3411371237458197e-05, "loss": 1.6451, "step": 20 }, { "epoch": 0.02, "grad_norm": 10.761913299560547, "learning_rate": 2.2575250836120402e-05, "loss": 1.4919, "step": 30 }, { "epoch": 0.03, "grad_norm": 15.36636734008789, "learning_rate": 2.173913043478261e-05, "loss": 1.5464, "step": 40 }, { "epoch": 0.03, "grad_norm": 9.350459098815918, "learning_rate": 2.090301003344482e-05, "loss": 1.5388, "step": 50 }, { "epoch": 0.04, "grad_norm": 10.499181747436523, "learning_rate": 2.0066889632107023e-05, "loss": 1.5963, "step": 60 }, { "epoch": 0.05, "grad_norm": 10.65820598602295, "learning_rate": 1.923076923076923e-05, "loss": 1.5128, "step": 70 }, { "epoch": 0.05, "grad_norm": 9.002087593078613, "learning_rate": 1.8394648829431436e-05, "loss": 1.609, "step": 80 }, { "epoch": 0.06, "grad_norm": 8.280915260314941, "learning_rate": 1.7558528428093644e-05, "loss": 1.4611, "step": 90 }, { "epoch": 0.07, "grad_norm": 8.664236068725586, "learning_rate": 1.6722408026755853e-05, "loss": 1.4765, "step": 100 }, { "epoch": 0.07, "grad_norm": 8.205462455749512, "learning_rate": 1.588628762541806e-05, "loss": 1.3903, "step": 110 }, { "epoch": 0.08, "grad_norm": 8.982340812683105, "learning_rate": 1.5050167224080269e-05, "loss": 1.3134, "step": 120 }, { "epoch": 0.09, "grad_norm": 7.957040786743164, "learning_rate": 1.4214046822742474e-05, "loss": 1.3256, "step": 130 }, { "epoch": 0.09, "grad_norm": 8.92203426361084, "learning_rate": 1.3377926421404682e-05, "loss": 1.5535, "step": 140 }, { "epoch": 0.1, "grad_norm": 6.650272369384766, "learning_rate": 1.254180602006689e-05, "loss": 1.5439, "step": 150 }, { "epoch": 0.1, "grad_norm": 12.875391006469727, "learning_rate": 1.1705685618729099e-05, "loss": 1.3592, "step": 160 }, { "epoch": 0.11, "grad_norm": 11.99698257446289, "learning_rate": 1.0869565217391305e-05, "loss": 1.4613, "step": 170 }, { "epoch": 0.12, "grad_norm": 6.687454700469971, "learning_rate": 1.0033444816053512e-05, "loss": 1.2539, "step": 180 }, { "epoch": 0.12, "grad_norm": 7.108691692352295, "learning_rate": 9.197324414715718e-06, "loss": 1.4818, "step": 190 }, { "epoch": 0.13, "grad_norm": 7.420890808105469, "learning_rate": 8.361204013377926e-06, "loss": 1.332, "step": 200 }, { "epoch": 0.14, "grad_norm": 7.554871082305908, "learning_rate": 7.5250836120401346e-06, "loss": 1.3705, "step": 210 }, { "epoch": 0.14, "grad_norm": 8.120746612548828, "learning_rate": 6.688963210702341e-06, "loss": 1.3652, "step": 220 }, { "epoch": 0.15, "grad_norm": 7.490564823150635, "learning_rate": 5.852842809364549e-06, "loss": 1.3594, "step": 230 }, { "epoch": 0.16, "grad_norm": 9.032505989074707, "learning_rate": 5.016722408026756e-06, "loss": 1.2791, "step": 240 }, { "epoch": 0.16, "grad_norm": 7.20292329788208, "learning_rate": 4.180602006688963e-06, "loss": 1.4569, "step": 250 }, { "epoch": 0.17, "grad_norm": 9.031627655029297, "learning_rate": 3.3444816053511705e-06, "loss": 1.3965, "step": 260 }, { "epoch": 0.18, "grad_norm": 7.822471618652344, "learning_rate": 2.508361204013378e-06, "loss": 1.418, "step": 270 }, { "epoch": 0.18, "grad_norm": 15.259939193725586, "learning_rate": 1.6722408026755853e-06, "loss": 1.3933, "step": 280 }, { "epoch": 0.19, "grad_norm": 9.465089797973633, "learning_rate": 8.361204013377926e-07, "loss": 1.3814, "step": 290 }, { "epoch": 0.2, "grad_norm": 6.676379680633545, "learning_rate": 0.0, "loss": 1.1996, "step": 300 } ], "logging_steps": 10, "max_steps": 300, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "total_flos": 3315779135078400.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }