{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.8243488794669895, "eval_steps": 500, "global_step": 4518, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04037956793862306, "grad_norm": 3.5625, "learning_rate": 5.878836833602585e-05, "loss": 0.1271, "step": 100 }, { "epoch": 0.08075913587724612, "grad_norm": 2.953125, "learning_rate": 5.7576736672051694e-05, "loss": 0.0977, "step": 200 }, { "epoch": 0.12113870381586916, "grad_norm": 1.7265625, "learning_rate": 5.636510500807755e-05, "loss": 0.077, "step": 300 }, { "epoch": 0.16151827175449224, "grad_norm": 4.90625, "learning_rate": 5.5153473344103394e-05, "loss": 0.0692, "step": 400 }, { "epoch": 0.20189783969311528, "grad_norm": 4.15625, "learning_rate": 5.394184168012924e-05, "loss": 0.0566, "step": 500 }, { "epoch": 0.24227740763173833, "grad_norm": 2.421875, "learning_rate": 5.2730210016155086e-05, "loss": 0.0594, "step": 600 }, { "epoch": 0.2826569755703614, "grad_norm": 2.109375, "learning_rate": 5.1518578352180936e-05, "loss": 0.0591, "step": 700 }, { "epoch": 0.3230365435089845, "grad_norm": 3.21875, "learning_rate": 5.030694668820679e-05, "loss": 0.068, "step": 800 }, { "epoch": 0.3634161114476075, "grad_norm": 2.921875, "learning_rate": 4.9095315024232635e-05, "loss": 0.0563, "step": 900 }, { "epoch": 0.40379567938623057, "grad_norm": 1.0, "learning_rate": 4.7883683360258485e-05, "loss": 0.0562, "step": 1000 }, { "epoch": 0.44417524732485364, "grad_norm": 2.46875, "learning_rate": 4.667205169628433e-05, "loss": 0.0601, "step": 1100 }, { "epoch": 0.48455481526347666, "grad_norm": 1.6953125, "learning_rate": 4.546042003231018e-05, "loss": 0.0455, "step": 1200 }, { "epoch": 0.5249343832020997, "grad_norm": 1.3359375, "learning_rate": 4.424878836833603e-05, "loss": 0.0472, "step": 1300 }, { "epoch": 0.5653139511407228, "grad_norm": 1.78125, "learning_rate": 4.303715670436188e-05, "loss": 0.0561, "step": 1400 }, { "epoch": 0.6056935190793459, "grad_norm": 1.390625, "learning_rate": 4.1825525040387727e-05, "loss": 0.0517, "step": 1500 }, { "epoch": 0.646073087017969, "grad_norm": 0.90625, "learning_rate": 4.061389337641357e-05, "loss": 0.0449, "step": 1600 }, { "epoch": 0.6864526549565919, "grad_norm": 3.28125, "learning_rate": 3.940226171243942e-05, "loss": 0.0425, "step": 1700 }, { "epoch": 0.726832222895215, "grad_norm": 1.546875, "learning_rate": 3.819063004846526e-05, "loss": 0.0462, "step": 1800 }, { "epoch": 0.7672117908338381, "grad_norm": 1.53125, "learning_rate": 3.697899838449112e-05, "loss": 0.0488, "step": 1900 }, { "epoch": 0.8075913587724611, "grad_norm": 5.28125, "learning_rate": 3.576736672051697e-05, "loss": 0.0447, "step": 2000 }, { "epoch": 0.8479709267110842, "grad_norm": 2.859375, "learning_rate": 3.455573505654281e-05, "loss": 0.0477, "step": 2100 }, { "epoch": 0.8883504946497073, "grad_norm": 2.4375, "learning_rate": 3.334410339256866e-05, "loss": 0.0489, "step": 2200 }, { "epoch": 0.9287300625883304, "grad_norm": 0.69921875, "learning_rate": 3.2132471728594504e-05, "loss": 0.0508, "step": 2300 }, { "epoch": 0.9691096305269533, "grad_norm": 2.34375, "learning_rate": 3.092084006462036e-05, "loss": 0.0396, "step": 2400 }, { "epoch": 1.0094891984655765, "grad_norm": 1.6015625, "learning_rate": 2.9709208400646203e-05, "loss": 0.0486, "step": 2500 }, { "epoch": 1.0498687664041995, "grad_norm": 4.6875, "learning_rate": 2.8497576736672053e-05, "loss": 0.0349, "step": 2600 }, { "epoch": 1.0902483343428226, "grad_norm": 0.671875, "learning_rate": 2.72859450726979e-05, "loss": 0.0277, "step": 2700 }, { "epoch": 1.1306279022814456, "grad_norm": 1.1484375, "learning_rate": 2.607431340872375e-05, "loss": 0.0291, "step": 2800 }, { "epoch": 1.1710074702200686, "grad_norm": 3.890625, "learning_rate": 2.4862681744749595e-05, "loss": 0.0327, "step": 2900 }, { "epoch": 1.2113870381586918, "grad_norm": 1.2109375, "learning_rate": 2.3651050080775445e-05, "loss": 0.0324, "step": 3000 }, { "epoch": 1.2517666060973147, "grad_norm": 1.953125, "learning_rate": 2.2439418416801295e-05, "loss": 0.0337, "step": 3100 }, { "epoch": 1.292146174035938, "grad_norm": 3.46875, "learning_rate": 2.122778675282714e-05, "loss": 0.0318, "step": 3200 }, { "epoch": 1.3325257419745609, "grad_norm": 0.703125, "learning_rate": 2.0016155088852987e-05, "loss": 0.0347, "step": 3300 }, { "epoch": 1.3729053099131838, "grad_norm": 2.875, "learning_rate": 1.8804523424878837e-05, "loss": 0.0299, "step": 3400 }, { "epoch": 1.413284877851807, "grad_norm": 2.078125, "learning_rate": 1.7592891760904683e-05, "loss": 0.0292, "step": 3500 }, { "epoch": 1.45366444579043, "grad_norm": 0.80078125, "learning_rate": 1.6381260096930536e-05, "loss": 0.0321, "step": 3600 }, { "epoch": 1.4940440137290532, "grad_norm": 2.359375, "learning_rate": 1.5169628432956381e-05, "loss": 0.0305, "step": 3700 }, { "epoch": 1.5344235816676761, "grad_norm": 0.6171875, "learning_rate": 1.395799676898223e-05, "loss": 0.0289, "step": 3800 }, { "epoch": 1.574803149606299, "grad_norm": 2.765625, "learning_rate": 1.2746365105008077e-05, "loss": 0.0304, "step": 3900 }, { "epoch": 1.6151827175449223, "grad_norm": 1.78125, "learning_rate": 1.1534733441033925e-05, "loss": 0.0346, "step": 4000 }, { "epoch": 1.6555622854835454, "grad_norm": 1.0, "learning_rate": 1.0323101777059775e-05, "loss": 0.033, "step": 4100 }, { "epoch": 1.6959418534221684, "grad_norm": 1.8828125, "learning_rate": 9.111470113085623e-06, "loss": 0.0328, "step": 4200 }, { "epoch": 1.7363214213607914, "grad_norm": 1.40625, "learning_rate": 7.89983844911147e-06, "loss": 0.0306, "step": 4300 }, { "epoch": 1.7767009892994143, "grad_norm": 3.375, "learning_rate": 6.6882067851373186e-06, "loss": 0.0303, "step": 4400 }, { "epoch": 1.8170805572380375, "grad_norm": 1.328125, "learning_rate": 5.4765751211631666e-06, "loss": 0.0312, "step": 4500 } ], "logging_steps": 100, "max_steps": 4952, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 502, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }