{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2518891687657431, "eval_steps": 4, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005037783375314861, "grad_norm": 0.1343812644481659, "learning_rate": 2e-05, "loss": 11.7643, "step": 1 }, { "epoch": 0.005037783375314861, "eval_loss": 11.775632858276367, "eval_runtime": 0.6521, "eval_samples_per_second": 128.824, "eval_steps_per_second": 64.412, "step": 1 }, { "epoch": 0.010075566750629723, "grad_norm": 0.17478109896183014, "learning_rate": 4e-05, "loss": 11.7898, "step": 2 }, { "epoch": 0.015113350125944584, "grad_norm": 0.17163191735744476, "learning_rate": 6e-05, "loss": 11.7762, "step": 3 }, { "epoch": 0.020151133501259445, "grad_norm": 0.1776304692029953, "learning_rate": 8e-05, "loss": 11.7739, "step": 4 }, { "epoch": 0.020151133501259445, "eval_loss": 11.775409698486328, "eval_runtime": 0.6545, "eval_samples_per_second": 128.341, "eval_steps_per_second": 64.17, "step": 4 }, { "epoch": 0.02518891687657431, "grad_norm": 0.20767977833747864, "learning_rate": 0.0001, "loss": 11.7652, "step": 5 }, { "epoch": 0.030226700251889168, "grad_norm": 0.1783723384141922, "learning_rate": 0.00012, "loss": 11.7811, "step": 6 }, { "epoch": 0.03526448362720403, "grad_norm": 0.2075798362493515, "learning_rate": 0.00014, "loss": 11.7613, "step": 7 }, { "epoch": 0.04030226700251889, "grad_norm": 0.17259609699249268, "learning_rate": 0.00016, "loss": 11.7766, "step": 8 }, { "epoch": 0.04030226700251889, "eval_loss": 11.7745943069458, "eval_runtime": 0.6545, "eval_samples_per_second": 128.344, "eval_steps_per_second": 64.172, "step": 8 }, { "epoch": 0.04534005037783375, "grad_norm": 0.16603446006774902, "learning_rate": 0.00018, "loss": 11.7747, "step": 9 }, { "epoch": 0.05037783375314862, "grad_norm": 0.16154415905475616, "learning_rate": 0.0002, "loss": 11.7738, "step": 10 }, { "epoch": 0.055415617128463476, "grad_norm": 0.21826648712158203, "learning_rate": 0.0001996917333733128, "loss": 11.7761, "step": 11 }, { "epoch": 0.060453400503778336, "grad_norm": 0.2048770934343338, "learning_rate": 0.00019876883405951377, "loss": 11.7763, "step": 12 }, { "epoch": 0.060453400503778336, "eval_loss": 11.773273468017578, "eval_runtime": 0.6781, "eval_samples_per_second": 123.871, "eval_steps_per_second": 61.935, "step": 12 }, { "epoch": 0.0654911838790932, "grad_norm": 0.12033865600824356, "learning_rate": 0.00019723699203976766, "loss": 11.7726, "step": 13 }, { "epoch": 0.07052896725440806, "grad_norm": 0.1583612710237503, "learning_rate": 0.00019510565162951537, "loss": 11.772, "step": 14 }, { "epoch": 0.07556675062972293, "grad_norm": 0.1649715006351471, "learning_rate": 0.0001923879532511287, "loss": 11.7676, "step": 15 }, { "epoch": 0.08060453400503778, "grad_norm": 0.1502542793750763, "learning_rate": 0.0001891006524188368, "loss": 11.7657, "step": 16 }, { "epoch": 0.08060453400503778, "eval_loss": 11.771785736083984, "eval_runtime": 0.6579, "eval_samples_per_second": 127.688, "eval_steps_per_second": 63.844, "step": 16 }, { "epoch": 0.08564231738035265, "grad_norm": 0.15804621577262878, "learning_rate": 0.00018526401643540922, "loss": 11.7648, "step": 17 }, { "epoch": 0.0906801007556675, "grad_norm": 0.1290988177061081, "learning_rate": 0.00018090169943749476, "loss": 11.7736, "step": 18 }, { "epoch": 0.09571788413098237, "grad_norm": 0.15611040592193604, "learning_rate": 0.0001760405965600031, "loss": 11.7751, "step": 19 }, { "epoch": 0.10075566750629723, "grad_norm": 0.1558377742767334, "learning_rate": 0.00017071067811865476, "loss": 11.7821, "step": 20 }, { "epoch": 0.10075566750629723, "eval_loss": 11.77033805847168, "eval_runtime": 0.6652, "eval_samples_per_second": 126.274, "eval_steps_per_second": 63.137, "step": 20 }, { "epoch": 0.10579345088161209, "grad_norm": 0.14136968553066254, "learning_rate": 0.00016494480483301836, "loss": 11.7598, "step": 21 }, { "epoch": 0.11083123425692695, "grad_norm": 0.1827390193939209, "learning_rate": 0.00015877852522924732, "loss": 11.7637, "step": 22 }, { "epoch": 0.11586901763224182, "grad_norm": 0.146404430270195, "learning_rate": 0.0001522498564715949, "loss": 11.7784, "step": 23 }, { "epoch": 0.12090680100755667, "grad_norm": 0.27841717004776, "learning_rate": 0.00014539904997395468, "loss": 11.7707, "step": 24 }, { "epoch": 0.12090680100755667, "eval_loss": 11.768901824951172, "eval_runtime": 0.6686, "eval_samples_per_second": 125.642, "eval_steps_per_second": 62.821, "step": 24 }, { "epoch": 0.12594458438287154, "grad_norm": 0.15636301040649414, "learning_rate": 0.000138268343236509, "loss": 11.7764, "step": 25 }, { "epoch": 0.1309823677581864, "grad_norm": 0.17114318907260895, "learning_rate": 0.00013090169943749476, "loss": 11.7703, "step": 26 }, { "epoch": 0.13602015113350127, "grad_norm": 0.20548300445079803, "learning_rate": 0.00012334453638559057, "loss": 11.7773, "step": 27 }, { "epoch": 0.14105793450881612, "grad_norm": 0.15028426051139832, "learning_rate": 0.0001156434465040231, "loss": 11.7642, "step": 28 }, { "epoch": 0.14105793450881612, "eval_loss": 11.767555236816406, "eval_runtime": 0.6574, "eval_samples_per_second": 127.78, "eval_steps_per_second": 63.89, "step": 28 }, { "epoch": 0.14609571788413098, "grad_norm": 0.13849467039108276, "learning_rate": 0.0001078459095727845, "loss": 11.7754, "step": 29 }, { "epoch": 0.15113350125944586, "grad_norm": 0.24436236917972565, "learning_rate": 0.0001, "loss": 11.777, "step": 30 }, { "epoch": 0.1561712846347607, "grad_norm": 0.12146396189928055, "learning_rate": 9.215409042721552e-05, "loss": 11.7583, "step": 31 }, { "epoch": 0.16120906801007556, "grad_norm": 0.13603943586349487, "learning_rate": 8.435655349597689e-05, "loss": 11.7767, "step": 32 }, { "epoch": 0.16120906801007556, "eval_loss": 11.76652717590332, "eval_runtime": 0.6557, "eval_samples_per_second": 128.109, "eval_steps_per_second": 64.054, "step": 32 }, { "epoch": 0.16624685138539042, "grad_norm": 0.1841493397951126, "learning_rate": 7.66554636144095e-05, "loss": 11.7572, "step": 33 }, { "epoch": 0.1712846347607053, "grad_norm": 0.1587410420179367, "learning_rate": 6.909830056250527e-05, "loss": 11.7721, "step": 34 }, { "epoch": 0.17632241813602015, "grad_norm": 0.1748318076133728, "learning_rate": 6.173165676349103e-05, "loss": 11.7697, "step": 35 }, { "epoch": 0.181360201511335, "grad_norm": 0.15539947152137756, "learning_rate": 5.4600950026045326e-05, "loss": 11.7722, "step": 36 }, { "epoch": 0.181360201511335, "eval_loss": 11.765742301940918, "eval_runtime": 0.6575, "eval_samples_per_second": 127.75, "eval_steps_per_second": 63.875, "step": 36 }, { "epoch": 0.18639798488664988, "grad_norm": 0.1843356043100357, "learning_rate": 4.7750143528405126e-05, "loss": 11.7612, "step": 37 }, { "epoch": 0.19143576826196473, "grad_norm": 0.14512236416339874, "learning_rate": 4.12214747707527e-05, "loss": 11.7649, "step": 38 }, { "epoch": 0.1964735516372796, "grad_norm": 0.14143380522727966, "learning_rate": 3.5055195166981645e-05, "loss": 11.7659, "step": 39 }, { "epoch": 0.20151133501259447, "grad_norm": 0.15115749835968018, "learning_rate": 2.9289321881345254e-05, "loss": 11.7692, "step": 40 }, { "epoch": 0.20151133501259447, "eval_loss": 11.76524829864502, "eval_runtime": 0.6559, "eval_samples_per_second": 128.077, "eval_steps_per_second": 64.039, "step": 40 }, { "epoch": 0.20654911838790932, "grad_norm": 0.16189849376678467, "learning_rate": 2.3959403439996907e-05, "loss": 11.7604, "step": 41 }, { "epoch": 0.21158690176322417, "grad_norm": 0.15517611801624298, "learning_rate": 1.9098300562505266e-05, "loss": 11.7661, "step": 42 }, { "epoch": 0.21662468513853905, "grad_norm": 0.15952840447425842, "learning_rate": 1.4735983564590783e-05, "loss": 11.7654, "step": 43 }, { "epoch": 0.2216624685138539, "grad_norm": 0.18206751346588135, "learning_rate": 1.0899347581163221e-05, "loss": 11.7605, "step": 44 }, { "epoch": 0.2216624685138539, "eval_loss": 11.764971733093262, "eval_runtime": 0.6686, "eval_samples_per_second": 125.638, "eval_steps_per_second": 62.819, "step": 44 }, { "epoch": 0.22670025188916876, "grad_norm": 0.14965149760246277, "learning_rate": 7.612046748871327e-06, "loss": 11.7675, "step": 45 }, { "epoch": 0.23173803526448364, "grad_norm": 0.16042684018611908, "learning_rate": 4.8943483704846475e-06, "loss": 11.7729, "step": 46 }, { "epoch": 0.2367758186397985, "grad_norm": 0.2848449945449829, "learning_rate": 2.7630079602323442e-06, "loss": 11.7682, "step": 47 }, { "epoch": 0.24181360201511334, "grad_norm": 0.16414609551429749, "learning_rate": 1.231165940486234e-06, "loss": 11.7582, "step": 48 }, { "epoch": 0.24181360201511334, "eval_loss": 11.764909744262695, "eval_runtime": 0.6556, "eval_samples_per_second": 128.135, "eval_steps_per_second": 64.068, "step": 48 }, { "epoch": 0.24685138539042822, "grad_norm": 0.16592900454998016, "learning_rate": 3.0826662668720364e-07, "loss": 11.7754, "step": 49 }, { "epoch": 0.2518891687657431, "grad_norm": 0.19010961055755615, "learning_rate": 0.0, "loss": 11.7793, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 4, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 10185002188800.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }