{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08680555555555555, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008680555555555555, "eval_loss": 1.937089443206787, "eval_runtime": 131.5855, "eval_samples_per_second": 14.743, "eval_steps_per_second": 1.847, "step": 1 }, { "epoch": 0.0026041666666666665, "grad_norm": 23.593791961669922, "learning_rate": 1.5e-05, "loss": 7.6661, "step": 3 }, { "epoch": 0.005208333333333333, "grad_norm": 19.38722801208496, "learning_rate": 3e-05, "loss": 7.2214, "step": 6 }, { "epoch": 0.0078125, "grad_norm": 8.94090747833252, "learning_rate": 4.5e-05, "loss": 5.5255, "step": 9 }, { "epoch": 0.0078125, "eval_loss": 1.2398158311843872, "eval_runtime": 133.0104, "eval_samples_per_second": 14.585, "eval_steps_per_second": 1.827, "step": 9 }, { "epoch": 0.010416666666666666, "grad_norm": 7.774077892303467, "learning_rate": 4.993910125649561e-05, "loss": 4.6268, "step": 12 }, { "epoch": 0.013020833333333334, "grad_norm": 4.864662170410156, "learning_rate": 4.962019382530521e-05, "loss": 3.9421, "step": 15 }, { "epoch": 0.015625, "grad_norm": 3.9902665615081787, "learning_rate": 4.9031542398457974e-05, "loss": 3.4764, "step": 18 }, { "epoch": 0.015625, "eval_loss": 0.8059016466140747, "eval_runtime": 133.0467, "eval_samples_per_second": 14.581, "eval_steps_per_second": 1.826, "step": 18 }, { "epoch": 0.018229166666666668, "grad_norm": 4.092312812805176, "learning_rate": 4.817959636416969e-05, "loss": 3.1563, "step": 21 }, { "epoch": 0.020833333333333332, "grad_norm": 4.029233455657959, "learning_rate": 4.707368982147318e-05, "loss": 2.8557, "step": 24 }, { "epoch": 0.0234375, "grad_norm": 3.739703416824341, "learning_rate": 4.572593931387604e-05, "loss": 2.7301, "step": 27 }, { "epoch": 0.0234375, "eval_loss": 0.627487063407898, "eval_runtime": 133.024, "eval_samples_per_second": 14.584, "eval_steps_per_second": 1.827, "step": 27 }, { "epoch": 0.026041666666666668, "grad_norm": 3.982668161392212, "learning_rate": 4.415111107797445e-05, "loss": 2.4342, "step": 30 }, { "epoch": 0.028645833333333332, "grad_norm": 3.4705731868743896, "learning_rate": 4.2366459261474933e-05, "loss": 2.236, "step": 33 }, { "epoch": 0.03125, "grad_norm": 3.903174638748169, "learning_rate": 4.039153688314145e-05, "loss": 2.1105, "step": 36 }, { "epoch": 0.03125, "eval_loss": 0.5284584760665894, "eval_runtime": 133.1079, "eval_samples_per_second": 14.575, "eval_steps_per_second": 1.826, "step": 36 }, { "epoch": 0.033854166666666664, "grad_norm": 3.955035448074341, "learning_rate": 3.824798160583012e-05, "loss": 2.1069, "step": 39 }, { "epoch": 0.036458333333333336, "grad_norm": 4.207739353179932, "learning_rate": 3.5959278669726935e-05, "loss": 1.9208, "step": 42 }, { "epoch": 0.0390625, "grad_norm": 4.347591400146484, "learning_rate": 3.355050358314172e-05, "loss": 1.9558, "step": 45 }, { "epoch": 0.0390625, "eval_loss": 0.46904227137565613, "eval_runtime": 133.003, "eval_samples_per_second": 14.586, "eval_steps_per_second": 1.827, "step": 45 }, { "epoch": 0.041666666666666664, "grad_norm": 4.274533271789551, "learning_rate": 3.104804738999169e-05, "loss": 1.8352, "step": 48 }, { "epoch": 0.044270833333333336, "grad_norm": 3.8483474254608154, "learning_rate": 2.8479327524001636e-05, "loss": 1.7612, "step": 51 }, { "epoch": 0.046875, "grad_norm": 4.284761428833008, "learning_rate": 2.587248741756253e-05, "loss": 1.8031, "step": 54 }, { "epoch": 0.046875, "eval_loss": 0.43528732657432556, "eval_runtime": 132.9965, "eval_samples_per_second": 14.587, "eval_steps_per_second": 1.827, "step": 54 }, { "epoch": 0.049479166666666664, "grad_norm": 4.528855323791504, "learning_rate": 2.3256088156396868e-05, "loss": 1.6669, "step": 57 }, { "epoch": 0.052083333333333336, "grad_norm": 4.008581638336182, "learning_rate": 2.0658795558326743e-05, "loss": 1.7404, "step": 60 }, { "epoch": 0.0546875, "grad_norm": 4.258989334106445, "learning_rate": 1.8109066104575023e-05, "loss": 1.6657, "step": 63 }, { "epoch": 0.0546875, "eval_loss": 0.4117554724216461, "eval_runtime": 133.0085, "eval_samples_per_second": 14.586, "eval_steps_per_second": 1.827, "step": 63 }, { "epoch": 0.057291666666666664, "grad_norm": 4.279396057128906, "learning_rate": 1.56348351646022e-05, "loss": 1.6728, "step": 66 }, { "epoch": 0.059895833333333336, "grad_norm": 4.159554958343506, "learning_rate": 1.3263210930352737e-05, "loss": 1.6461, "step": 69 }, { "epoch": 0.0625, "grad_norm": 4.362893104553223, "learning_rate": 1.1020177413231334e-05, "loss": 1.5333, "step": 72 }, { "epoch": 0.0625, "eval_loss": 0.39763975143432617, "eval_runtime": 132.9302, "eval_samples_per_second": 14.594, "eval_steps_per_second": 1.828, "step": 72 }, { "epoch": 0.06510416666666667, "grad_norm": 4.511038780212402, "learning_rate": 8.930309757836517e-06, "loss": 1.5358, "step": 75 }, { "epoch": 0.06770833333333333, "grad_norm": 4.454962253570557, "learning_rate": 7.016504991533726e-06, "loss": 1.5103, "step": 78 }, { "epoch": 0.0703125, "grad_norm": 4.400078773498535, "learning_rate": 5.299731159831953e-06, "loss": 1.478, "step": 81 }, { "epoch": 0.0703125, "eval_loss": 0.3869146704673767, "eval_runtime": 133.0114, "eval_samples_per_second": 14.585, "eval_steps_per_second": 1.827, "step": 81 }, { "epoch": 0.07291666666666667, "grad_norm": 4.320717811584473, "learning_rate": 3.798797596089351e-06, "loss": 1.4953, "step": 84 }, { "epoch": 0.07552083333333333, "grad_norm": 4.3952789306640625, "learning_rate": 2.5301488425208296e-06, "loss": 1.5565, "step": 87 }, { "epoch": 0.078125, "grad_norm": 3.9055593013763428, "learning_rate": 1.5076844803522922e-06, "loss": 1.5535, "step": 90 }, { "epoch": 0.078125, "eval_loss": 0.3823155462741852, "eval_runtime": 132.9844, "eval_samples_per_second": 14.588, "eval_steps_per_second": 1.827, "step": 90 }, { "epoch": 0.08072916666666667, "grad_norm": 4.043277740478516, "learning_rate": 7.426068431000882e-07, "loss": 1.5073, "step": 93 }, { "epoch": 0.08333333333333333, "grad_norm": 4.137203693389893, "learning_rate": 2.4329828146074095e-07, "loss": 1.5309, "step": 96 }, { "epoch": 0.0859375, "grad_norm": 4.494436740875244, "learning_rate": 1.522932452260595e-08, "loss": 1.4219, "step": 99 }, { "epoch": 0.0859375, "eval_loss": 0.3812783360481262, "eval_runtime": 132.9892, "eval_samples_per_second": 14.588, "eval_steps_per_second": 1.827, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.406258997362688e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }