{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.2518891687657431,
  "eval_steps": 4,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005037783375314861,
      "grad_norm": 0.1343812644481659,
      "learning_rate": 2e-05,
      "loss": 11.7643,
      "step": 1
    },
    {
      "epoch": 0.005037783375314861,
      "eval_loss": 11.775632858276367,
      "eval_runtime": 0.6521,
      "eval_samples_per_second": 128.824,
      "eval_steps_per_second": 64.412,
      "step": 1
    },
    {
      "epoch": 0.010075566750629723,
      "grad_norm": 0.17478109896183014,
      "learning_rate": 4e-05,
      "loss": 11.7898,
      "step": 2
    },
    {
      "epoch": 0.015113350125944584,
      "grad_norm": 0.17163191735744476,
      "learning_rate": 6e-05,
      "loss": 11.7762,
      "step": 3
    },
    {
      "epoch": 0.020151133501259445,
      "grad_norm": 0.1776304692029953,
      "learning_rate": 8e-05,
      "loss": 11.7739,
      "step": 4
    },
    {
      "epoch": 0.020151133501259445,
      "eval_loss": 11.775409698486328,
      "eval_runtime": 0.6545,
      "eval_samples_per_second": 128.341,
      "eval_steps_per_second": 64.17,
      "step": 4
    },
    {
      "epoch": 0.02518891687657431,
      "grad_norm": 0.20767977833747864,
      "learning_rate": 0.0001,
      "loss": 11.7652,
      "step": 5
    },
    {
      "epoch": 0.030226700251889168,
      "grad_norm": 0.1783723384141922,
      "learning_rate": 0.00012,
      "loss": 11.7811,
      "step": 6
    },
    {
      "epoch": 0.03526448362720403,
      "grad_norm": 0.2075798362493515,
      "learning_rate": 0.00014,
      "loss": 11.7613,
      "step": 7
    },
    {
      "epoch": 0.04030226700251889,
      "grad_norm": 0.17259609699249268,
      "learning_rate": 0.00016,
      "loss": 11.7766,
      "step": 8
    },
    {
      "epoch": 0.04030226700251889,
      "eval_loss": 11.7745943069458,
      "eval_runtime": 0.6545,
      "eval_samples_per_second": 128.344,
      "eval_steps_per_second": 64.172,
      "step": 8
    },
    {
      "epoch": 0.04534005037783375,
      "grad_norm": 0.16603446006774902,
      "learning_rate": 0.00018,
      "loss": 11.7747,
      "step": 9
    },
    {
      "epoch": 0.05037783375314862,
      "grad_norm": 0.16154415905475616,
      "learning_rate": 0.0002,
      "loss": 11.7738,
      "step": 10
    },
    {
      "epoch": 0.055415617128463476,
      "grad_norm": 0.21826648712158203,
      "learning_rate": 0.0001996917333733128,
      "loss": 11.7761,
      "step": 11
    },
    {
      "epoch": 0.060453400503778336,
      "grad_norm": 0.2048770934343338,
      "learning_rate": 0.00019876883405951377,
      "loss": 11.7763,
      "step": 12
    },
    {
      "epoch": 0.060453400503778336,
      "eval_loss": 11.773273468017578,
      "eval_runtime": 0.6781,
      "eval_samples_per_second": 123.871,
      "eval_steps_per_second": 61.935,
      "step": 12
    },
    {
      "epoch": 0.0654911838790932,
      "grad_norm": 0.12033865600824356,
      "learning_rate": 0.00019723699203976766,
      "loss": 11.7726,
      "step": 13
    },
    {
      "epoch": 0.07052896725440806,
      "grad_norm": 0.1583612710237503,
      "learning_rate": 0.00019510565162951537,
      "loss": 11.772,
      "step": 14
    },
    {
      "epoch": 0.07556675062972293,
      "grad_norm": 0.1649715006351471,
      "learning_rate": 0.0001923879532511287,
      "loss": 11.7676,
      "step": 15
    },
    {
      "epoch": 0.08060453400503778,
      "grad_norm": 0.1502542793750763,
      "learning_rate": 0.0001891006524188368,
      "loss": 11.7657,
      "step": 16
    },
    {
      "epoch": 0.08060453400503778,
      "eval_loss": 11.771785736083984,
      "eval_runtime": 0.6579,
      "eval_samples_per_second": 127.688,
      "eval_steps_per_second": 63.844,
      "step": 16
    },
    {
      "epoch": 0.08564231738035265,
      "grad_norm": 0.15804621577262878,
      "learning_rate": 0.00018526401643540922,
      "loss": 11.7648,
      "step": 17
    },
    {
      "epoch": 0.0906801007556675,
      "grad_norm": 0.1290988177061081,
      "learning_rate": 0.00018090169943749476,
      "loss": 11.7736,
      "step": 18
    },
    {
      "epoch": 0.09571788413098237,
      "grad_norm": 0.15611040592193604,
      "learning_rate": 0.0001760405965600031,
      "loss": 11.7751,
      "step": 19
    },
    {
      "epoch": 0.10075566750629723,
      "grad_norm": 0.1558377742767334,
      "learning_rate": 0.00017071067811865476,
      "loss": 11.7821,
      "step": 20
    },
    {
      "epoch": 0.10075566750629723,
      "eval_loss": 11.77033805847168,
      "eval_runtime": 0.6652,
      "eval_samples_per_second": 126.274,
      "eval_steps_per_second": 63.137,
      "step": 20
    },
    {
      "epoch": 0.10579345088161209,
      "grad_norm": 0.14136968553066254,
      "learning_rate": 0.00016494480483301836,
      "loss": 11.7598,
      "step": 21
    },
    {
      "epoch": 0.11083123425692695,
      "grad_norm": 0.1827390193939209,
      "learning_rate": 0.00015877852522924732,
      "loss": 11.7637,
      "step": 22
    },
    {
      "epoch": 0.11586901763224182,
      "grad_norm": 0.146404430270195,
      "learning_rate": 0.0001522498564715949,
      "loss": 11.7784,
      "step": 23
    },
    {
      "epoch": 0.12090680100755667,
      "grad_norm": 0.27841717004776,
      "learning_rate": 0.00014539904997395468,
      "loss": 11.7707,
      "step": 24
    },
    {
      "epoch": 0.12090680100755667,
      "eval_loss": 11.768901824951172,
      "eval_runtime": 0.6686,
      "eval_samples_per_second": 125.642,
      "eval_steps_per_second": 62.821,
      "step": 24
    },
    {
      "epoch": 0.12594458438287154,
      "grad_norm": 0.15636301040649414,
      "learning_rate": 0.000138268343236509,
      "loss": 11.7764,
      "step": 25
    },
    {
      "epoch": 0.1309823677581864,
      "grad_norm": 0.17114318907260895,
      "learning_rate": 0.00013090169943749476,
      "loss": 11.7703,
      "step": 26
    },
    {
      "epoch": 0.13602015113350127,
      "grad_norm": 0.20548300445079803,
      "learning_rate": 0.00012334453638559057,
      "loss": 11.7773,
      "step": 27
    },
    {
      "epoch": 0.14105793450881612,
      "grad_norm": 0.15028426051139832,
      "learning_rate": 0.0001156434465040231,
      "loss": 11.7642,
      "step": 28
    },
    {
      "epoch": 0.14105793450881612,
      "eval_loss": 11.767555236816406,
      "eval_runtime": 0.6574,
      "eval_samples_per_second": 127.78,
      "eval_steps_per_second": 63.89,
      "step": 28
    },
    {
      "epoch": 0.14609571788413098,
      "grad_norm": 0.13849467039108276,
      "learning_rate": 0.0001078459095727845,
      "loss": 11.7754,
      "step": 29
    },
    {
      "epoch": 0.15113350125944586,
      "grad_norm": 0.24436236917972565,
      "learning_rate": 0.0001,
      "loss": 11.777,
      "step": 30
    },
    {
      "epoch": 0.1561712846347607,
      "grad_norm": 0.12146396189928055,
      "learning_rate": 9.215409042721552e-05,
      "loss": 11.7583,
      "step": 31
    },
    {
      "epoch": 0.16120906801007556,
      "grad_norm": 0.13603943586349487,
      "learning_rate": 8.435655349597689e-05,
      "loss": 11.7767,
      "step": 32
    },
    {
      "epoch": 0.16120906801007556,
      "eval_loss": 11.76652717590332,
      "eval_runtime": 0.6557,
      "eval_samples_per_second": 128.109,
      "eval_steps_per_second": 64.054,
      "step": 32
    },
    {
      "epoch": 0.16624685138539042,
      "grad_norm": 0.1841493397951126,
      "learning_rate": 7.66554636144095e-05,
      "loss": 11.7572,
      "step": 33
    },
    {
      "epoch": 0.1712846347607053,
      "grad_norm": 0.1587410420179367,
      "learning_rate": 6.909830056250527e-05,
      "loss": 11.7721,
      "step": 34
    },
    {
      "epoch": 0.17632241813602015,
      "grad_norm": 0.1748318076133728,
      "learning_rate": 6.173165676349103e-05,
      "loss": 11.7697,
      "step": 35
    },
    {
      "epoch": 0.181360201511335,
      "grad_norm": 0.15539947152137756,
      "learning_rate": 5.4600950026045326e-05,
      "loss": 11.7722,
      "step": 36
    },
    {
      "epoch": 0.181360201511335,
      "eval_loss": 11.765742301940918,
      "eval_runtime": 0.6575,
      "eval_samples_per_second": 127.75,
      "eval_steps_per_second": 63.875,
      "step": 36
    },
    {
      "epoch": 0.18639798488664988,
      "grad_norm": 0.1843356043100357,
      "learning_rate": 4.7750143528405126e-05,
      "loss": 11.7612,
      "step": 37
    },
    {
      "epoch": 0.19143576826196473,
      "grad_norm": 0.14512236416339874,
      "learning_rate": 4.12214747707527e-05,
      "loss": 11.7649,
      "step": 38
    },
    {
      "epoch": 0.1964735516372796,
      "grad_norm": 0.14143380522727966,
      "learning_rate": 3.5055195166981645e-05,
      "loss": 11.7659,
      "step": 39
    },
    {
      "epoch": 0.20151133501259447,
      "grad_norm": 0.15115749835968018,
      "learning_rate": 2.9289321881345254e-05,
      "loss": 11.7692,
      "step": 40
    },
    {
      "epoch": 0.20151133501259447,
      "eval_loss": 11.76524829864502,
      "eval_runtime": 0.6559,
      "eval_samples_per_second": 128.077,
      "eval_steps_per_second": 64.039,
      "step": 40
    },
    {
      "epoch": 0.20654911838790932,
      "grad_norm": 0.16189849376678467,
      "learning_rate": 2.3959403439996907e-05,
      "loss": 11.7604,
      "step": 41
    },
    {
      "epoch": 0.21158690176322417,
      "grad_norm": 0.15517611801624298,
      "learning_rate": 1.9098300562505266e-05,
      "loss": 11.7661,
      "step": 42
    },
    {
      "epoch": 0.21662468513853905,
      "grad_norm": 0.15952840447425842,
      "learning_rate": 1.4735983564590783e-05,
      "loss": 11.7654,
      "step": 43
    },
    {
      "epoch": 0.2216624685138539,
      "grad_norm": 0.18206751346588135,
      "learning_rate": 1.0899347581163221e-05,
      "loss": 11.7605,
      "step": 44
    },
    {
      "epoch": 0.2216624685138539,
      "eval_loss": 11.764971733093262,
      "eval_runtime": 0.6686,
      "eval_samples_per_second": 125.638,
      "eval_steps_per_second": 62.819,
      "step": 44
    },
    {
      "epoch": 0.22670025188916876,
      "grad_norm": 0.14965149760246277,
      "learning_rate": 7.612046748871327e-06,
      "loss": 11.7675,
      "step": 45
    },
    {
      "epoch": 0.23173803526448364,
      "grad_norm": 0.16042684018611908,
      "learning_rate": 4.8943483704846475e-06,
      "loss": 11.7729,
      "step": 46
    },
    {
      "epoch": 0.2367758186397985,
      "grad_norm": 0.2848449945449829,
      "learning_rate": 2.7630079602323442e-06,
      "loss": 11.7682,
      "step": 47
    },
    {
      "epoch": 0.24181360201511334,
      "grad_norm": 0.16414609551429749,
      "learning_rate": 1.231165940486234e-06,
      "loss": 11.7582,
      "step": 48
    },
    {
      "epoch": 0.24181360201511334,
      "eval_loss": 11.764909744262695,
      "eval_runtime": 0.6556,
      "eval_samples_per_second": 128.135,
      "eval_steps_per_second": 64.068,
      "step": 48
    },
    {
      "epoch": 0.24685138539042822,
      "grad_norm": 0.16592900454998016,
      "learning_rate": 3.0826662668720364e-07,
      "loss": 11.7754,
      "step": 49
    },
    {
      "epoch": 0.2518891687657431,
      "grad_norm": 0.19010961055755615,
      "learning_rate": 0.0,
      "loss": 11.7793,
      "step": 50
    }
  ],
  "logging_steps": 1,
  "max_steps": 50,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 4,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 10185002188800.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}