{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.971014492753623,
  "eval_steps": 12,
  "global_step": 92,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 0.9959812474764307,
      "learning_rate": 0.0001999417022366174,
      "loss": 2.8926,
      "step": 1
    },
    {
      "epoch": 0.02,
      "eval_loss": 2.7616989612579346,
      "eval_runtime": 62.0214,
      "eval_samples_per_second": 322.227,
      "eval_steps_per_second": 20.154,
      "step": 1
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.9553562375664082,
      "learning_rate": 0.00019976687691905393,
      "loss": 2.7178,
      "step": 2
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.074690670043139,
      "learning_rate": 0.00019947572788580947,
      "loss": 2.3477,
      "step": 3
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.2607308881446835,
      "learning_rate": 0.00019906859460363307,
      "loss": 2.0869,
      "step": 4
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.5197691743409516,
      "learning_rate": 0.00019854595177171968,
      "loss": 1.8477,
      "step": 5
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.7112299977659389,
      "learning_rate": 0.00019790840876823232,
      "loss": 1.7227,
      "step": 6
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.7680093267515216,
      "learning_rate": 0.00019715670893979414,
      "loss": 1.6553,
      "step": 7
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.7744797513985666,
      "learning_rate": 0.00019629172873477995,
      "loss": 1.6455,
      "step": 8
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.735518802506217,
      "learning_rate": 0.00019531447668141608,
      "loss": 1.5508,
      "step": 9
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.4535717241942791,
      "learning_rate": 0.00019422609221188207,
      "loss": 1.5039,
      "step": 10
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.29477652700703394,
      "learning_rate": 0.0001930278443337833,
      "loss": 1.502,
      "step": 11
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.24434475566463373,
      "learning_rate": 0.00019172113015054532,
      "loss": 1.4502,
      "step": 12
    },
    {
      "epoch": 0.26,
      "eval_loss": 1.456370234489441,
      "eval_runtime": 63.0041,
      "eval_samples_per_second": 317.202,
      "eval_steps_per_second": 19.84,
      "step": 12
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.45228370307484317,
      "learning_rate": 0.00019030747323245327,
      "loss": 1.4277,
      "step": 13
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.46523254227617566,
      "learning_rate": 0.0001887885218402375,
      "loss": 1.4097,
      "step": 14
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.2855872854921807,
      "learning_rate": 0.00018716604700327514,
      "loss": 1.3833,
      "step": 15
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.25447863105741503,
      "learning_rate": 0.00018544194045464886,
      "loss": 1.3652,
      "step": 16
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.3619444462250106,
      "learning_rate": 0.0001836182124254711,
      "loss": 1.3677,
      "step": 17
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.36635956863370217,
      "learning_rate": 0.0001816969893010442,
      "loss": 1.332,
      "step": 18
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.39509330975297474,
      "learning_rate": 0.00017968051114159047,
      "loss": 1.3086,
      "step": 19
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.3249913054000147,
      "learning_rate": 0.000177571129070442,
      "loss": 1.3071,
      "step": 20
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.30914838660163285,
      "learning_rate": 0.00017537130253273613,
      "loss": 1.2729,
      "step": 21
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.42289317853499514,
      "learning_rate": 0.00017308359642781242,
      "loss": 1.2773,
      "step": 22
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.35414226064550436,
      "learning_rate": 0.00017071067811865476,
      "loss": 1.2417,
      "step": 23
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.5532132927649234,
      "learning_rate": 0.00016825531432186543,
      "loss": 1.7617,
      "step": 24
    },
    {
      "epoch": 0.52,
      "eval_loss": 1.314684271812439,
      "eval_runtime": 63.1261,
      "eval_samples_per_second": 316.588,
      "eval_steps_per_second": 19.802,
      "step": 24
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.12961326680655122,
      "learning_rate": 0.00016572036788179727,
      "loss": 1.2183,
      "step": 25
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.1963943275104468,
      "learning_rate": 0.00016310879443260528,
      "loss": 1.2222,
      "step": 26
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.21010667932420385,
      "learning_rate": 0.00016042363895210946,
      "loss": 1.2163,
      "step": 27
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.2259721381332128,
      "learning_rate": 0.00015766803221148673,
      "loss": 1.2217,
      "step": 28
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.19095866752454876,
      "learning_rate": 0.00015484518712493187,
      "loss": 1.2212,
      "step": 29
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.11496636776764421,
      "learning_rate": 0.00015195839500354335,
      "loss": 1.2212,
      "step": 30
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.12075046364081166,
      "learning_rate": 0.00014901102171780174,
      "loss": 1.1616,
      "step": 31
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.21352779335608432,
      "learning_rate": 0.00014600650377311522,
      "loss": 1.2002,
      "step": 32
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.20188789347169253,
      "learning_rate": 0.0001429483443030082,
      "loss": 1.2236,
      "step": 33
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.1547188722220508,
      "learning_rate": 0.00013984010898462416,
      "loss": 1.1826,
      "step": 34
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.11634196980431331,
      "learning_rate": 0.00013668542188130566,
      "loss": 1.209,
      "step": 35
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.11144391112143442,
      "learning_rate": 0.00013348796121709862,
      "loss": 1.2051,
      "step": 36
    },
    {
      "epoch": 0.78,
      "eval_loss": 1.2780728340148926,
      "eval_runtime": 63.136,
      "eval_samples_per_second": 316.539,
      "eval_steps_per_second": 19.799,
      "step": 36
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.16076857117392804,
      "learning_rate": 0.0001302514550881076,
      "loss": 1.2046,
      "step": 37
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.15687324678725567,
      "learning_rate": 0.00012697967711570242,
      "loss": 1.21,
      "step": 38
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.4130016628721442,
      "learning_rate": 0.00012367644204664468,
      "loss": 1.5093,
      "step": 39
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.1343313559360696,
      "learning_rate": 0.0001203456013052634,
      "loss": 1.2051,
      "step": 40
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.11359323094812154,
      "learning_rate": 0.00011699103850286669,
      "loss": 1.2031,
      "step": 41
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.09939941549415923,
      "learning_rate": 0.00011361666490962468,
      "loss": 1.1929,
      "step": 42
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.09916599553279144,
      "learning_rate": 0.00011022641489420342,
      "loss": 1.1748,
      "step": 43
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.14209942605129547,
      "learning_rate": 0.0001068242413364671,
      "loss": 1.1665,
      "step": 44
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.12442345202001188,
      "learning_rate": 0.00010341411101859679,
      "loss": 1.1768,
      "step": 45
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.26217962366852043,
      "learning_rate": 0.0001,
      "loss": 1.5122,
      "step": 46
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.09589512696932487,
      "learning_rate": 9.658588898140322e-05,
      "loss": 1.1147,
      "step": 47
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.08883256552206654,
      "learning_rate": 9.317575866353292e-05,
      "loss": 1.1353,
      "step": 48
    },
    {
      "epoch": 1.01,
      "eval_loss": 1.2603201866149902,
      "eval_runtime": 63.1562,
      "eval_samples_per_second": 316.438,
      "eval_steps_per_second": 19.792,
      "step": 48
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.12476996026721204,
      "learning_rate": 8.977358510579657e-05,
      "loss": 1.1499,
      "step": 49
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.1410594758409736,
      "learning_rate": 8.638333509037536e-05,
      "loss": 1.1636,
      "step": 50
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.13805158665360767,
      "learning_rate": 8.300896149713334e-05,
      "loss": 1.1592,
      "step": 51
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.11182820786743164,
      "learning_rate": 7.965439869473664e-05,
      "loss": 1.189,
      "step": 52
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.08415581979953264,
      "learning_rate": 7.632355795335533e-05,
      "loss": 1.1362,
      "step": 53
    },
    {
      "epoch": 1.14,
      "grad_norm": 0.08283198190447906,
      "learning_rate": 7.302032288429756e-05,
      "loss": 1.1543,
      "step": 54
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.09566331541455832,
      "learning_rate": 6.974854491189243e-05,
      "loss": 1.1475,
      "step": 55
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.12102788241266879,
      "learning_rate": 6.651203878290139e-05,
      "loss": 1.123,
      "step": 56
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.12120546678963659,
      "learning_rate": 6.331457811869437e-05,
      "loss": 1.1235,
      "step": 57
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.10373804201590558,
      "learning_rate": 6.015989101537586e-05,
      "loss": 1.1987,
      "step": 58
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.08553749701513035,
      "learning_rate": 5.7051655696991826e-05,
      "loss": 1.1616,
      "step": 59
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.07986407048448123,
      "learning_rate": 5.399349622688479e-05,
      "loss": 1.1787,
      "step": 60
    },
    {
      "epoch": 1.28,
      "eval_loss": 1.2497843503952026,
      "eval_runtime": 63.2115,
      "eval_samples_per_second": 316.161,
      "eval_steps_per_second": 19.775,
      "step": 60
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.07539804797614923,
      "learning_rate": 5.0988978282198305e-05,
      "loss": 1.125,
      "step": 61
    },
    {
      "epoch": 1.32,
      "grad_norm": 0.09159666914568447,
      "learning_rate": 4.804160499645667e-05,
      "loss": 1.1128,
      "step": 62
    },
    {
      "epoch": 1.34,
      "grad_norm": 0.1010418577495551,
      "learning_rate": 4.515481287506811e-05,
      "loss": 1.1729,
      "step": 63
    },
    {
      "epoch": 1.36,
      "grad_norm": 0.11598213897408995,
      "learning_rate": 4.2331967788513295e-05,
      "loss": 1.1606,
      "step": 64
    },
    {
      "epoch": 1.38,
      "grad_norm": 0.09017416588433445,
      "learning_rate": 3.9576361047890554e-05,
      "loss": 1.1924,
      "step": 65
    },
    {
      "epoch": 1.41,
      "grad_norm": 0.07860813360953343,
      "learning_rate": 3.689120556739475e-05,
      "loss": 1.1636,
      "step": 66
    },
    {
      "epoch": 1.43,
      "grad_norm": 0.07385649237022004,
      "learning_rate": 3.427963211820274e-05,
      "loss": 1.1133,
      "step": 67
    },
    {
      "epoch": 1.45,
      "grad_norm": 0.07323818513742975,
      "learning_rate": 3.174468567813461e-05,
      "loss": 1.1528,
      "step": 68
    },
    {
      "epoch": 1.47,
      "grad_norm": 0.07313504011051983,
      "learning_rate": 2.9289321881345254e-05,
      "loss": 1.1528,
      "step": 69
    },
    {
      "epoch": 1.49,
      "grad_norm": 0.2493580533486283,
      "learning_rate": 2.691640357218759e-05,
      "loss": 1.4531,
      "step": 70
    },
    {
      "epoch": 1.51,
      "grad_norm": 0.08492071745345914,
      "learning_rate": 2.4628697467263918e-05,
      "loss": 1.1367,
      "step": 71
    },
    {
      "epoch": 1.54,
      "grad_norm": 0.09336318243714266,
      "learning_rate": 2.242887092955801e-05,
      "loss": 1.1416,
      "step": 72
    },
    {
      "epoch": 1.54,
      "eval_loss": 1.2444965839385986,
      "eval_runtime": 63.1509,
      "eval_samples_per_second": 316.464,
      "eval_steps_per_second": 19.794,
      "step": 72
    },
    {
      "epoch": 1.56,
      "grad_norm": 0.08651448401545622,
      "learning_rate": 2.0319488858409553e-05,
      "loss": 1.1528,
      "step": 73
    },
    {
      "epoch": 1.58,
      "grad_norm": 0.08082722437043596,
      "learning_rate": 1.8303010698955804e-05,
      "loss": 1.1323,
      "step": 74
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.07673966178613437,
      "learning_rate": 1.638178757452894e-05,
      "loss": 1.1548,
      "step": 75
    },
    {
      "epoch": 1.62,
      "grad_norm": 0.07500837050855058,
      "learning_rate": 1.4558059545351143e-05,
      "loss": 1.1113,
      "step": 76
    },
    {
      "epoch": 1.64,
      "grad_norm": 0.07854170828608634,
      "learning_rate": 1.2833952996724863e-05,
      "loss": 1.1426,
      "step": 77
    },
    {
      "epoch": 1.67,
      "grad_norm": 0.07331164880963319,
      "learning_rate": 1.1211478159762478e-05,
      "loss": 1.1299,
      "step": 78
    },
    {
      "epoch": 1.69,
      "grad_norm": 0.07647451775596137,
      "learning_rate": 9.692526767546729e-06,
      "loss": 1.1191,
      "step": 79
    },
    {
      "epoch": 1.71,
      "grad_norm": 0.07236874800404586,
      "learning_rate": 8.278869849454718e-06,
      "loss": 1.1167,
      "step": 80
    },
    {
      "epoch": 1.73,
      "grad_norm": 0.06955020330114496,
      "learning_rate": 6.972155666216684e-06,
      "loss": 1.1191,
      "step": 81
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.07541349884268325,
      "learning_rate": 5.77390778811796e-06,
      "loss": 1.1294,
      "step": 82
    },
    {
      "epoch": 1.78,
      "grad_norm": 0.07686237827636627,
      "learning_rate": 4.685523318583918e-06,
      "loss": 1.146,
      "step": 83
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.0739857032690884,
      "learning_rate": 3.7082712652200867e-06,
      "loss": 1.1606,
      "step": 84
    },
    {
      "epoch": 1.8,
      "eval_loss": 1.243038535118103,
      "eval_runtime": 63.1702,
      "eval_samples_per_second": 316.368,
      "eval_steps_per_second": 19.788,
      "step": 84
    },
    {
      "epoch": 1.82,
      "grad_norm": 0.07147810640105105,
      "learning_rate": 2.843291060205855e-06,
      "loss": 1.1396,
      "step": 85
    },
    {
      "epoch": 1.84,
      "grad_norm": 0.07131641333572095,
      "learning_rate": 2.091591231767709e-06,
      "loss": 1.1255,
      "step": 86
    },
    {
      "epoch": 1.86,
      "grad_norm": 0.08365332706415897,
      "learning_rate": 1.4540482282803137e-06,
      "loss": 1.1362,
      "step": 87
    },
    {
      "epoch": 1.88,
      "grad_norm": 0.08023410527314905,
      "learning_rate": 9.314053963669245e-07,
      "loss": 1.1489,
      "step": 88
    },
    {
      "epoch": 1.91,
      "grad_norm": 0.07738378998352305,
      "learning_rate": 5.24272114190516e-07,
      "loss": 1.1167,
      "step": 89
    },
    {
      "epoch": 1.93,
      "grad_norm": 0.07310454468648853,
      "learning_rate": 2.3312308094607382e-07,
      "loss": 1.1201,
      "step": 90
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.07335973208363882,
      "learning_rate": 5.8297763382597626e-08,
      "loss": 1.1724,
      "step": 91
    },
    {
      "epoch": 1.97,
      "grad_norm": 0.07850704063156043,
      "learning_rate": 0.0,
      "loss": 1.124,
      "step": 92
    }
  ],
  "logging_steps": 1,
  "max_steps": 92,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 46,
  "total_flos": 2.193460016625746e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}