{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.11363636363636363, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011363636363636363, "grad_norm": 0.5031628012657166, "learning_rate": 1e-05, "loss": 1.6175, "step": 1 }, { "epoch": 0.0011363636363636363, "eval_loss": 1.7302374839782715, "eval_runtime": 88.1623, "eval_samples_per_second": 8.405, "eval_steps_per_second": 1.055, "step": 1 }, { "epoch": 0.0022727272727272726, "grad_norm": 0.4654340147972107, "learning_rate": 2e-05, "loss": 2.1047, "step": 2 }, { "epoch": 0.003409090909090909, "grad_norm": 0.5036187171936035, "learning_rate": 3e-05, "loss": 1.7235, "step": 3 }, { "epoch": 0.004545454545454545, "grad_norm": 0.41818612813949585, "learning_rate": 4e-05, "loss": 1.3914, "step": 4 }, { "epoch": 0.005681818181818182, "grad_norm": 0.4234386086463928, "learning_rate": 5e-05, "loss": 2.0059, "step": 5 }, { "epoch": 0.006818181818181818, "grad_norm": 0.45015767216682434, "learning_rate": 6e-05, "loss": 1.6938, "step": 6 }, { "epoch": 0.007954545454545454, "grad_norm": 0.4883180856704712, "learning_rate": 7e-05, "loss": 1.866, "step": 7 }, { "epoch": 0.00909090909090909, "grad_norm": 0.8753936886787415, "learning_rate": 8e-05, "loss": 1.735, "step": 8 }, { "epoch": 0.010227272727272727, "grad_norm": 0.43279698491096497, "learning_rate": 9e-05, "loss": 1.1927, "step": 9 }, { "epoch": 0.010227272727272727, "eval_loss": 1.6918625831604004, "eval_runtime": 87.4761, "eval_samples_per_second": 8.471, "eval_steps_per_second": 1.063, "step": 9 }, { "epoch": 0.011363636363636364, "grad_norm": 0.5074693560600281, "learning_rate": 0.0001, "loss": 1.6828, "step": 10 }, { "epoch": 0.0125, "grad_norm": 0.5221801996231079, "learning_rate": 9.99695413509548e-05, "loss": 1.9807, "step": 11 }, { "epoch": 0.013636363636363636, "grad_norm": 1.1368098258972168, "learning_rate": 9.987820251299122e-05, "loss": 1.5947, "step": 12 }, { "epoch": 0.014772727272727272, "grad_norm": 0.7387529611587524, "learning_rate": 9.972609476841367e-05, "loss": 1.7278, "step": 13 }, { "epoch": 0.015909090909090907, "grad_norm": 0.5394375920295715, "learning_rate": 9.951340343707852e-05, "loss": 1.3158, "step": 14 }, { "epoch": 0.017045454545454544, "grad_norm": 0.4821315109729767, "learning_rate": 9.924038765061042e-05, "loss": 1.822, "step": 15 }, { "epoch": 0.01818181818181818, "grad_norm": 0.38951918482780457, "learning_rate": 9.890738003669029e-05, "loss": 1.5332, "step": 16 }, { "epoch": 0.019318181818181818, "grad_norm": 0.7511935234069824, "learning_rate": 9.851478631379982e-05, "loss": 1.5436, "step": 17 }, { "epoch": 0.020454545454545454, "grad_norm": 0.6894105672836304, "learning_rate": 9.806308479691595e-05, "loss": 0.9722, "step": 18 }, { "epoch": 0.020454545454545454, "eval_loss": 1.539093017578125, "eval_runtime": 87.4243, "eval_samples_per_second": 8.476, "eval_steps_per_second": 1.064, "step": 18 }, { "epoch": 0.02159090909090909, "grad_norm": 0.5717121362686157, "learning_rate": 9.755282581475769e-05, "loss": 1.3682, "step": 19 }, { "epoch": 0.022727272727272728, "grad_norm": 0.49914515018463135, "learning_rate": 9.698463103929542e-05, "loss": 1.5274, "step": 20 }, { "epoch": 0.023863636363636365, "grad_norm": 0.5184929370880127, "learning_rate": 9.635919272833938e-05, "loss": 1.4419, "step": 21 }, { "epoch": 0.025, "grad_norm": 0.8946555852890015, "learning_rate": 9.567727288213005e-05, "loss": 1.7122, "step": 22 }, { "epoch": 0.026136363636363635, "grad_norm": 0.6487534046173096, "learning_rate": 9.493970231495835e-05, "loss": 1.5038, "step": 23 }, { "epoch": 0.02727272727272727, "grad_norm": 0.6374855637550354, "learning_rate": 9.414737964294636e-05, "loss": 1.5951, "step": 24 }, { "epoch": 0.028409090909090908, "grad_norm": 0.7161133289337158, "learning_rate": 9.330127018922194e-05, "loss": 1.6346, "step": 25 }, { "epoch": 0.029545454545454545, "grad_norm": 0.48917874693870544, "learning_rate": 9.24024048078213e-05, "loss": 1.154, "step": 26 }, { "epoch": 0.03068181818181818, "grad_norm": 0.6666057705879211, "learning_rate": 9.145187862775209e-05, "loss": 1.778, "step": 27 }, { "epoch": 0.03068181818181818, "eval_loss": 1.4715248346328735, "eval_runtime": 87.4639, "eval_samples_per_second": 8.472, "eval_steps_per_second": 1.063, "step": 27 }, { "epoch": 0.031818181818181815, "grad_norm": 0.5019454956054688, "learning_rate": 9.045084971874738e-05, "loss": 1.7373, "step": 28 }, { "epoch": 0.03295454545454545, "grad_norm": 0.514089047908783, "learning_rate": 8.940053768033609e-05, "loss": 0.9394, "step": 29 }, { "epoch": 0.03409090909090909, "grad_norm": 0.4315960705280304, "learning_rate": 8.83022221559489e-05, "loss": 1.6312, "step": 30 }, { "epoch": 0.035227272727272725, "grad_norm": 0.6210121512413025, "learning_rate": 8.715724127386972e-05, "loss": 1.6856, "step": 31 }, { "epoch": 0.03636363636363636, "grad_norm": 0.86749267578125, "learning_rate": 8.596699001693255e-05, "loss": 1.6499, "step": 32 }, { "epoch": 0.0375, "grad_norm": 0.5921334624290466, "learning_rate": 8.473291852294987e-05, "loss": 1.5508, "step": 33 }, { "epoch": 0.038636363636363635, "grad_norm": 0.5376932621002197, "learning_rate": 8.345653031794292e-05, "loss": 2.0017, "step": 34 }, { "epoch": 0.03977272727272727, "grad_norm": 0.762406587600708, "learning_rate": 8.213938048432697e-05, "loss": 1.372, "step": 35 }, { "epoch": 0.04090909090909091, "grad_norm": 0.5415381193161011, "learning_rate": 8.07830737662829e-05, "loss": 1.5537, "step": 36 }, { "epoch": 0.04090909090909091, "eval_loss": 1.4426989555358887, "eval_runtime": 87.3876, "eval_samples_per_second": 8.479, "eval_steps_per_second": 1.064, "step": 36 }, { "epoch": 0.042045454545454546, "grad_norm": 0.7806885242462158, "learning_rate": 7.938926261462366e-05, "loss": 1.5561, "step": 37 }, { "epoch": 0.04318181818181818, "grad_norm": 0.7586813569068909, "learning_rate": 7.795964517353735e-05, "loss": 1.3102, "step": 38 }, { "epoch": 0.04431818181818182, "grad_norm": 0.6596840620040894, "learning_rate": 7.649596321166024e-05, "loss": 1.2605, "step": 39 }, { "epoch": 0.045454545454545456, "grad_norm": 0.5476572513580322, "learning_rate": 7.500000000000001e-05, "loss": 1.4152, "step": 40 }, { "epoch": 0.04659090909090909, "grad_norm": 0.6413193345069885, "learning_rate": 7.347357813929454e-05, "loss": 1.8048, "step": 41 }, { "epoch": 0.04772727272727273, "grad_norm": 0.4673216938972473, "learning_rate": 7.191855733945387e-05, "loss": 1.6345, "step": 42 }, { "epoch": 0.048863636363636366, "grad_norm": 0.5007254481315613, "learning_rate": 7.033683215379002e-05, "loss": 1.247, "step": 43 }, { "epoch": 0.05, "grad_norm": 0.5497366189956665, "learning_rate": 6.873032967079561e-05, "loss": 1.7314, "step": 44 }, { "epoch": 0.05113636363636364, "grad_norm": 0.4534544348716736, "learning_rate": 6.710100716628344e-05, "loss": 1.3849, "step": 45 }, { "epoch": 0.05113636363636364, "eval_loss": 1.4240630865097046, "eval_runtime": 87.401, "eval_samples_per_second": 8.478, "eval_steps_per_second": 1.064, "step": 45 }, { "epoch": 0.05227272727272727, "grad_norm": 0.4592258036136627, "learning_rate": 6.545084971874738e-05, "loss": 1.7077, "step": 46 }, { "epoch": 0.053409090909090906, "grad_norm": 0.6697846055030823, "learning_rate": 6.378186779084995e-05, "loss": 1.6171, "step": 47 }, { "epoch": 0.05454545454545454, "grad_norm": 0.5928575396537781, "learning_rate": 6.209609477998338e-05, "loss": 1.7667, "step": 48 }, { "epoch": 0.05568181818181818, "grad_norm": 0.6087995767593384, "learning_rate": 6.0395584540887963e-05, "loss": 1.4706, "step": 49 }, { "epoch": 0.056818181818181816, "grad_norm": 0.728308379650116, "learning_rate": 5.868240888334653e-05, "loss": 0.9524, "step": 50 }, { "epoch": 0.05795454545454545, "grad_norm": 0.45047634840011597, "learning_rate": 5.695865504800327e-05, "loss": 1.4548, "step": 51 }, { "epoch": 0.05909090909090909, "grad_norm": 1.1415973901748657, "learning_rate": 5.522642316338268e-05, "loss": 1.2522, "step": 52 }, { "epoch": 0.060227272727272727, "grad_norm": 0.5474623441696167, "learning_rate": 5.348782368720626e-05, "loss": 1.5022, "step": 53 }, { "epoch": 0.06136363636363636, "grad_norm": 0.5780206322669983, "learning_rate": 5.174497483512506e-05, "loss": 1.3892, "step": 54 }, { "epoch": 0.06136363636363636, "eval_loss": 1.4119524955749512, "eval_runtime": 87.4373, "eval_samples_per_second": 8.475, "eval_steps_per_second": 1.064, "step": 54 }, { "epoch": 0.0625, "grad_norm": 0.5161088109016418, "learning_rate": 5e-05, "loss": 1.3492, "step": 55 }, { "epoch": 0.06363636363636363, "grad_norm": 0.461467981338501, "learning_rate": 4.825502516487497e-05, "loss": 1.5803, "step": 56 }, { "epoch": 0.06477272727272727, "grad_norm": 0.7066351771354675, "learning_rate": 4.6512176312793736e-05, "loss": 1.4825, "step": 57 }, { "epoch": 0.0659090909090909, "grad_norm": 0.7307711839675903, "learning_rate": 4.477357683661734e-05, "loss": 1.7179, "step": 58 }, { "epoch": 0.06704545454545455, "grad_norm": 0.383287250995636, "learning_rate": 4.3041344951996746e-05, "loss": 1.4907, "step": 59 }, { "epoch": 0.06818181818181818, "grad_norm": 0.7558887004852295, "learning_rate": 4.131759111665349e-05, "loss": 1.5637, "step": 60 }, { "epoch": 0.06931818181818182, "grad_norm": 0.34086090326309204, "learning_rate": 3.960441545911204e-05, "loss": 1.5766, "step": 61 }, { "epoch": 0.07045454545454545, "grad_norm": 0.6929436922073364, "learning_rate": 3.790390522001662e-05, "loss": 1.5548, "step": 62 }, { "epoch": 0.0715909090909091, "grad_norm": 0.4506044089794159, "learning_rate": 3.6218132209150045e-05, "loss": 1.7352, "step": 63 }, { "epoch": 0.0715909090909091, "eval_loss": 1.4060667753219604, "eval_runtime": 87.4035, "eval_samples_per_second": 8.478, "eval_steps_per_second": 1.064, "step": 63 }, { "epoch": 0.07272727272727272, "grad_norm": 0.47386038303375244, "learning_rate": 3.4549150281252636e-05, "loss": 1.4088, "step": 64 }, { "epoch": 0.07386363636363637, "grad_norm": 0.8597636818885803, "learning_rate": 3.289899283371657e-05, "loss": 1.0096, "step": 65 }, { "epoch": 0.075, "grad_norm": 0.5897632241249084, "learning_rate": 3.12696703292044e-05, "loss": 1.6263, "step": 66 }, { "epoch": 0.07613636363636364, "grad_norm": 0.800680935382843, "learning_rate": 2.9663167846209998e-05, "loss": 1.4472, "step": 67 }, { "epoch": 0.07727272727272727, "grad_norm": 0.8336355090141296, "learning_rate": 2.8081442660546125e-05, "loss": 1.4691, "step": 68 }, { "epoch": 0.07840909090909091, "grad_norm": 0.8036491870880127, "learning_rate": 2.6526421860705473e-05, "loss": 1.573, "step": 69 }, { "epoch": 0.07954545454545454, "grad_norm": 0.551456093788147, "learning_rate": 2.500000000000001e-05, "loss": 1.8588, "step": 70 }, { "epoch": 0.08068181818181819, "grad_norm": 0.5478857755661011, "learning_rate": 2.350403678833976e-05, "loss": 1.8197, "step": 71 }, { "epoch": 0.08181818181818182, "grad_norm": 1.3312339782714844, "learning_rate": 2.2040354826462668e-05, "loss": 0.9631, "step": 72 }, { "epoch": 0.08181818181818182, "eval_loss": 1.4008089303970337, "eval_runtime": 87.397, "eval_samples_per_second": 8.479, "eval_steps_per_second": 1.064, "step": 72 }, { "epoch": 0.08295454545454546, "grad_norm": 0.47865477204322815, "learning_rate": 2.061073738537635e-05, "loss": 1.6158, "step": 73 }, { "epoch": 0.08409090909090909, "grad_norm": 0.5435898900032043, "learning_rate": 1.9216926233717085e-05, "loss": 1.6159, "step": 74 }, { "epoch": 0.08522727272727272, "grad_norm": 0.589959442615509, "learning_rate": 1.7860619515673033e-05, "loss": 1.1458, "step": 75 }, { "epoch": 0.08636363636363636, "grad_norm": 0.6192647218704224, "learning_rate": 1.6543469682057106e-05, "loss": 1.3072, "step": 76 }, { "epoch": 0.0875, "grad_norm": 0.555339515209198, "learning_rate": 1.526708147705013e-05, "loss": 1.7459, "step": 77 }, { "epoch": 0.08863636363636364, "grad_norm": 0.402619868516922, "learning_rate": 1.4033009983067452e-05, "loss": 1.3941, "step": 78 }, { "epoch": 0.08977272727272727, "grad_norm": 0.4111030697822571, "learning_rate": 1.2842758726130283e-05, "loss": 1.1667, "step": 79 }, { "epoch": 0.09090909090909091, "grad_norm": 0.6310803294181824, "learning_rate": 1.1697777844051105e-05, "loss": 1.5854, "step": 80 }, { "epoch": 0.09204545454545454, "grad_norm": 0.4028031527996063, "learning_rate": 1.0599462319663905e-05, "loss": 1.4636, "step": 81 }, { "epoch": 0.09204545454545454, "eval_loss": 1.3986175060272217, "eval_runtime": 87.4361, "eval_samples_per_second": 8.475, "eval_steps_per_second": 1.064, "step": 81 }, { "epoch": 0.09318181818181819, "grad_norm": 0.5828585624694824, "learning_rate": 9.549150281252633e-06, "loss": 1.9201, "step": 82 }, { "epoch": 0.09431818181818181, "grad_norm": 0.5383720397949219, "learning_rate": 8.548121372247918e-06, "loss": 1.3849, "step": 83 }, { "epoch": 0.09545454545454546, "grad_norm": 0.5691014528274536, "learning_rate": 7.597595192178702e-06, "loss": 0.9816, "step": 84 }, { "epoch": 0.09659090909090909, "grad_norm": 0.4643211364746094, "learning_rate": 6.698729810778065e-06, "loss": 1.5215, "step": 85 }, { "epoch": 0.09772727272727273, "grad_norm": 0.5514132380485535, "learning_rate": 5.852620357053651e-06, "loss": 1.0381, "step": 86 }, { "epoch": 0.09886363636363636, "grad_norm": 0.3321930766105652, "learning_rate": 5.060297685041659e-06, "loss": 1.0676, "step": 87 }, { "epoch": 0.1, "grad_norm": 0.7636975646018982, "learning_rate": 4.322727117869951e-06, "loss": 1.4628, "step": 88 }, { "epoch": 0.10113636363636364, "grad_norm": 0.6969127655029297, "learning_rate": 3.6408072716606346e-06, "loss": 1.4346, "step": 89 }, { "epoch": 0.10227272727272728, "grad_norm": 0.6535035371780396, "learning_rate": 3.0153689607045845e-06, "loss": 1.2264, "step": 90 }, { "epoch": 0.10227272727272728, "eval_loss": 1.3976737260818481, "eval_runtime": 87.4171, "eval_samples_per_second": 8.477, "eval_steps_per_second": 1.064, "step": 90 }, { "epoch": 0.10340909090909091, "grad_norm": 0.4957861006259918, "learning_rate": 2.4471741852423237e-06, "loss": 1.5694, "step": 91 }, { "epoch": 0.10454545454545454, "grad_norm": 0.5834468007087708, "learning_rate": 1.9369152030840556e-06, "loss": 1.157, "step": 92 }, { "epoch": 0.10568181818181818, "grad_norm": 0.6812298893928528, "learning_rate": 1.4852136862001764e-06, "loss": 1.6588, "step": 93 }, { "epoch": 0.10681818181818181, "grad_norm": 0.46779105067253113, "learning_rate": 1.0926199633097157e-06, "loss": 1.7968, "step": 94 }, { "epoch": 0.10795454545454546, "grad_norm": 0.5582041144371033, "learning_rate": 7.596123493895991e-07, "loss": 1.7294, "step": 95 }, { "epoch": 0.10909090909090909, "grad_norm": 0.5326647758483887, "learning_rate": 4.865965629214819e-07, "loss": 1.5638, "step": 96 }, { "epoch": 0.11022727272727273, "grad_norm": 0.5757723450660706, "learning_rate": 2.7390523158633554e-07, "loss": 1.5188, "step": 97 }, { "epoch": 0.11136363636363636, "grad_norm": 0.8716908693313599, "learning_rate": 1.2179748700879012e-07, "loss": 1.819, "step": 98 }, { "epoch": 0.1125, "grad_norm": 0.588882327079773, "learning_rate": 3.04586490452119e-08, "loss": 1.8313, "step": 99 }, { "epoch": 0.1125, "eval_loss": 1.3972293138504028, "eval_runtime": 87.4494, "eval_samples_per_second": 8.473, "eval_steps_per_second": 1.063, "step": 99 }, { "epoch": 0.11363636363636363, "grad_norm": 0.522475004196167, "learning_rate": 0.0, "loss": 1.5229, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.470363520794624e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }