{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0012921482610914776, "eval_steps": 5, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.5842965221829554e-05, "grad_norm": 3.3958845138549805, "learning_rate": 2e-05, "loss": 1.4244, "step": 1 }, { "epoch": 2.5842965221829554e-05, "eval_loss": 2.1125328540802, "eval_runtime": 1391.8533, "eval_samples_per_second": 11.706, "eval_steps_per_second": 5.853, "step": 1 }, { "epoch": 5.168593044365911e-05, "grad_norm": 2.459746837615967, "learning_rate": 4e-05, "loss": 1.5737, "step": 2 }, { "epoch": 7.752889566548866e-05, "grad_norm": 2.5988481044769287, "learning_rate": 6e-05, "loss": 1.2215, "step": 3 }, { "epoch": 0.00010337186088731822, "grad_norm": 2.9697864055633545, "learning_rate": 8e-05, "loss": 1.3948, "step": 4 }, { "epoch": 0.00012921482610914775, "grad_norm": 3.4510738849639893, "learning_rate": 0.0001, "loss": 3.3687, "step": 5 }, { "epoch": 0.00012921482610914775, "eval_loss": 1.8981174230575562, "eval_runtime": 1395.5569, "eval_samples_per_second": 11.675, "eval_steps_per_second": 5.838, "step": 5 }, { "epoch": 0.0001550577913309773, "grad_norm": 2.285410165786743, "learning_rate": 0.00012, "loss": 1.0475, "step": 6 }, { "epoch": 0.00018090075655280687, "grad_norm": 3.462702751159668, "learning_rate": 0.00014, "loss": 1.0077, "step": 7 }, { "epoch": 0.00020674372177463643, "grad_norm": 2.645705223083496, "learning_rate": 0.00016, "loss": 0.8847, "step": 8 }, { "epoch": 0.00023258668699646597, "grad_norm": 2.0876657962799072, "learning_rate": 0.00018, "loss": 0.7847, "step": 9 }, { "epoch": 0.0002584296522182955, "grad_norm": 2.068669319152832, "learning_rate": 0.0002, "loss": 0.6889, "step": 10 }, { "epoch": 0.0002584296522182955, "eval_loss": 1.0108299255371094, "eval_runtime": 1395.6064, "eval_samples_per_second": 11.674, "eval_steps_per_second": 5.838, "step": 10 }, { "epoch": 0.0002842726174401251, "grad_norm": 1.778637170791626, "learning_rate": 0.0001996917333733128, "loss": 0.6103, "step": 11 }, { "epoch": 0.0003101155826619546, "grad_norm": 1.7560285329818726, "learning_rate": 0.00019876883405951377, "loss": 0.4017, "step": 12 }, { "epoch": 0.0003359585478837842, "grad_norm": 1.798984169960022, "learning_rate": 0.00019723699203976766, "loss": 0.4021, "step": 13 }, { "epoch": 0.00036180151310561374, "grad_norm": 2.582660436630249, "learning_rate": 0.00019510565162951537, "loss": 0.4864, "step": 14 }, { "epoch": 0.0003876444783274433, "grad_norm": 2.107647180557251, "learning_rate": 0.0001923879532511287, "loss": 0.3102, "step": 15 }, { "epoch": 0.0003876444783274433, "eval_loss": 0.4638846814632416, "eval_runtime": 1395.4516, "eval_samples_per_second": 11.676, "eval_steps_per_second": 5.838, "step": 15 }, { "epoch": 0.00041348744354927287, "grad_norm": 2.237151622772217, "learning_rate": 0.0001891006524188368, "loss": 0.3547, "step": 16 }, { "epoch": 0.0004393304087711024, "grad_norm": 2.265132427215576, "learning_rate": 0.00018526401643540922, "loss": 0.3212, "step": 17 }, { "epoch": 0.00046517337399293193, "grad_norm": 3.5674264430999756, "learning_rate": 0.00018090169943749476, "loss": 0.3152, "step": 18 }, { "epoch": 0.0004910163392147615, "grad_norm": 1.4233182668685913, "learning_rate": 0.0001760405965600031, "loss": 0.2144, "step": 19 }, { "epoch": 0.000516859304436591, "grad_norm": 1.286632776260376, "learning_rate": 0.00017071067811865476, "loss": 0.1448, "step": 20 }, { "epoch": 0.000516859304436591, "eval_loss": 0.27176007628440857, "eval_runtime": 1395.268, "eval_samples_per_second": 11.677, "eval_steps_per_second": 5.839, "step": 20 }, { "epoch": 0.0005427022696584206, "grad_norm": 1.321890950202942, "learning_rate": 0.00016494480483301836, "loss": 0.1948, "step": 21 }, { "epoch": 0.0005685452348802502, "grad_norm": 2.3737189769744873, "learning_rate": 0.00015877852522924732, "loss": 0.2611, "step": 22 }, { "epoch": 0.0005943882001020797, "grad_norm": 1.2037267684936523, "learning_rate": 0.0001522498564715949, "loss": 0.1581, "step": 23 }, { "epoch": 0.0006202311653239092, "grad_norm": 1.1055679321289062, "learning_rate": 0.00014539904997395468, "loss": 0.1362, "step": 24 }, { "epoch": 0.0006460741305457388, "grad_norm": 0.9288361668586731, "learning_rate": 0.000138268343236509, "loss": 0.1606, "step": 25 }, { "epoch": 0.0006460741305457388, "eval_loss": 0.19134792685508728, "eval_runtime": 1395.0425, "eval_samples_per_second": 11.679, "eval_steps_per_second": 5.84, "step": 25 }, { "epoch": 0.0006719170957675684, "grad_norm": 0.7735410928726196, "learning_rate": 0.00013090169943749476, "loss": 0.1043, "step": 26 }, { "epoch": 0.000697760060989398, "grad_norm": 1.195586085319519, "learning_rate": 0.00012334453638559057, "loss": 0.1089, "step": 27 }, { "epoch": 0.0007236030262112275, "grad_norm": 1.4720776081085205, "learning_rate": 0.0001156434465040231, "loss": 0.173, "step": 28 }, { "epoch": 0.000749445991433057, "grad_norm": 1.69827401638031, "learning_rate": 0.0001078459095727845, "loss": 0.1277, "step": 29 }, { "epoch": 0.0007752889566548866, "grad_norm": 0.697060227394104, "learning_rate": 0.0001, "loss": 0.104, "step": 30 }, { "epoch": 0.0007752889566548866, "eval_loss": 0.16827471554279327, "eval_runtime": 1395.0717, "eval_samples_per_second": 11.679, "eval_steps_per_second": 5.84, "step": 30 }, { "epoch": 0.0008011319218767161, "grad_norm": 1.0948371887207031, "learning_rate": 9.215409042721552e-05, "loss": 0.1049, "step": 31 }, { "epoch": 0.0008269748870985457, "grad_norm": 2.184497594833374, "learning_rate": 8.435655349597689e-05, "loss": 0.1007, "step": 32 }, { "epoch": 0.0008528178523203753, "grad_norm": 1.1975798606872559, "learning_rate": 7.66554636144095e-05, "loss": 0.1052, "step": 33 }, { "epoch": 0.0008786608175422048, "grad_norm": 1.8608812093734741, "learning_rate": 6.909830056250527e-05, "loss": 0.1533, "step": 34 }, { "epoch": 0.0009045037827640343, "grad_norm": 0.8412917256355286, "learning_rate": 6.173165676349103e-05, "loss": 0.0917, "step": 35 }, { "epoch": 0.0009045037827640343, "eval_loss": 0.15119507908821106, "eval_runtime": 1394.8383, "eval_samples_per_second": 11.681, "eval_steps_per_second": 5.841, "step": 35 }, { "epoch": 0.0009303467479858639, "grad_norm": 0.6619991064071655, "learning_rate": 5.4600950026045326e-05, "loss": 0.1093, "step": 36 }, { "epoch": 0.0009561897132076934, "grad_norm": 0.6209729313850403, "learning_rate": 4.7750143528405126e-05, "loss": 0.06, "step": 37 }, { "epoch": 0.000982032678429523, "grad_norm": 0.650658905506134, "learning_rate": 4.12214747707527e-05, "loss": 0.0841, "step": 38 }, { "epoch": 0.0010078756436513526, "grad_norm": 0.9135183095932007, "learning_rate": 3.5055195166981645e-05, "loss": 0.1116, "step": 39 }, { "epoch": 0.001033718608873182, "grad_norm": 0.8710368275642395, "learning_rate": 2.9289321881345254e-05, "loss": 0.0838, "step": 40 }, { "epoch": 0.001033718608873182, "eval_loss": 0.142450749874115, "eval_runtime": 1394.9906, "eval_samples_per_second": 11.68, "eval_steps_per_second": 5.84, "step": 40 }, { "epoch": 0.0010595615740950116, "grad_norm": 0.9050641655921936, "learning_rate": 2.3959403439996907e-05, "loss": 0.1035, "step": 41 }, { "epoch": 0.0010854045393168413, "grad_norm": 0.877323567867279, "learning_rate": 1.9098300562505266e-05, "loss": 0.1082, "step": 42 }, { "epoch": 0.0011112475045386707, "grad_norm": 1.5448343753814697, "learning_rate": 1.4735983564590783e-05, "loss": 0.1315, "step": 43 }, { "epoch": 0.0011370904697605004, "grad_norm": 0.8033886551856995, "learning_rate": 1.0899347581163221e-05, "loss": 0.0732, "step": 44 }, { "epoch": 0.0011629334349823298, "grad_norm": 1.3498198986053467, "learning_rate": 7.612046748871327e-06, "loss": 0.1228, "step": 45 }, { "epoch": 0.0011629334349823298, "eval_loss": 0.1376151740550995, "eval_runtime": 1395.1018, "eval_samples_per_second": 11.679, "eval_steps_per_second": 5.84, "step": 45 }, { "epoch": 0.0011887764002041594, "grad_norm": 1.1750056743621826, "learning_rate": 4.8943483704846475e-06, "loss": 0.1394, "step": 46 }, { "epoch": 0.001214619365425989, "grad_norm": 2.630324363708496, "learning_rate": 2.7630079602323442e-06, "loss": 0.0545, "step": 47 }, { "epoch": 0.0012404623306478185, "grad_norm": 0.5797359347343445, "learning_rate": 1.231165940486234e-06, "loss": 0.0958, "step": 48 }, { "epoch": 0.0012663052958696481, "grad_norm": 1.0320329666137695, "learning_rate": 3.0826662668720364e-07, "loss": 0.0997, "step": 49 }, { "epoch": 0.0012921482610914776, "grad_norm": 0.5513184070587158, "learning_rate": 0.0, "loss": 0.0691, "step": 50 }, { "epoch": 0.0012921482610914776, "eval_loss": 0.1370484083890915, "eval_runtime": 1398.1486, "eval_samples_per_second": 11.653, "eval_steps_per_second": 5.827, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 70, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.84956424814592e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }