{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.1191663819610522,
  "eval_steps": 100,
  "global_step": 1024,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01093269559275709,
      "grad_norm": 165.28294372558594,
      "learning_rate": 0.0002,
      "loss": 624.4996,
      "step": 10
    },
    {
      "epoch": 0.02186539118551418,
      "grad_norm": Infinity,
      "learning_rate": 0.00019996112676182827,
      "loss": 546.1801,
      "step": 20
    },
    {
      "epoch": 0.03279808677827127,
      "grad_norm": 205.6680450439453,
      "learning_rate": 0.0001998267889368103,
      "loss": 510.2819,
      "step": 30
    },
    {
      "epoch": 0.04373078237102836,
      "grad_norm": 128.06765747070312,
      "learning_rate": 0.00019959663551763642,
      "loss": 497.8397,
      "step": 40
    },
    {
      "epoch": 0.05466347796378545,
      "grad_norm": 146.53973388671875,
      "learning_rate": 0.0001992708874098054,
      "loss": 495.9984,
      "step": 50
    },
    {
      "epoch": 0.06559617355654254,
      "grad_norm": 99.703857421875,
      "learning_rate": 0.0001988498572723623,
      "loss": 494.1846,
      "step": 60
    },
    {
      "epoch": 0.07652886914929963,
      "grad_norm": 151.4725341796875,
      "learning_rate": 0.00019833394921780245,
      "loss": 490.978,
      "step": 70
    },
    {
      "epoch": 0.08746156474205671,
      "grad_norm": 112.11941528320312,
      "learning_rate": 0.00019772365842419677,
      "loss": 480.1951,
      "step": 80
    },
    {
      "epoch": 0.0983942603348138,
      "grad_norm": 155.3424530029297,
      "learning_rate": 0.0001970195706599109,
      "loss": 471.4008,
      "step": 90
    },
    {
      "epoch": 0.1093269559275709,
      "grad_norm": 175.84173583984375,
      "learning_rate": 0.00019622236172137374,
      "loss": 468.4172,
      "step": 100
    },
    {
      "epoch": 0.1093269559275709,
      "eval_loss": 7.324180603027344,
      "eval_runtime": 58.3519,
      "eval_samples_per_second": 160.509,
      "eval_steps_per_second": 10.043,
      "step": 100
    },
    {
      "epoch": 0.12025965152032798,
      "grad_norm": 149.00534057617188,
      "learning_rate": 0.0001953327967844356,
      "loss": 465.3731,
      "step": 110
    },
    {
      "epoch": 0.13119234711308508,
      "grad_norm": 236.33877563476562,
      "learning_rate": 0.0001943517296699384,
      "loss": 464.0816,
      "step": 120
    },
    {
      "epoch": 0.14212504270584217,
      "grad_norm": 147.4077606201172,
      "learning_rate": 0.00019328010202420258,
      "loss": 463.7079,
      "step": 130
    },
    {
      "epoch": 0.15305773829859926,
      "grad_norm": 136.97933959960938,
      "learning_rate": 0.00019211894241521758,
      "loss": 460.7253,
      "step": 140
    },
    {
      "epoch": 0.16399043389135634,
      "grad_norm": 67.10714721679688,
      "learning_rate": 0.0001908693653454033,
      "loss": 454.1115,
      "step": 150
    },
    {
      "epoch": 0.17492312948411343,
      "grad_norm": 157.962158203125,
      "learning_rate": 0.00018953257018189024,
      "loss": 451.1601,
      "step": 160
    },
    {
      "epoch": 0.1858558250768705,
      "grad_norm": 124.69608306884766,
      "learning_rate": 0.00018810984000534458,
      "loss": 454.4596,
      "step": 170
    },
    {
      "epoch": 0.1967885206696276,
      "grad_norm": 102.23240661621094,
      "learning_rate": 0.00018660254037844388,
      "loss": 455.1331,
      "step": 180
    },
    {
      "epoch": 0.20772121626238468,
      "grad_norm": 118.30726623535156,
      "learning_rate": 0.00018501211803518468,
      "loss": 453.586,
      "step": 190
    },
    {
      "epoch": 0.2186539118551418,
      "grad_norm": 98.61111450195312,
      "learning_rate": 0.00018334009949228061,
      "loss": 451.1779,
      "step": 200
    },
    {
      "epoch": 0.2186539118551418,
      "eval_loss": 6.999808311462402,
      "eval_runtime": 58.2479,
      "eval_samples_per_second": 160.795,
      "eval_steps_per_second": 10.06,
      "step": 200
    },
    {
      "epoch": 0.22958660744789888,
      "grad_norm": 105.02420043945312,
      "learning_rate": 0.00018158808958398338,
      "loss": 449.5652,
      "step": 210
    },
    {
      "epoch": 0.24051930304065597,
      "grad_norm": 236.24205017089844,
      "learning_rate": 0.00017975776992173344,
      "loss": 449.9907,
      "step": 220
    },
    {
      "epoch": 0.25145199863341305,
      "grad_norm": 111.67731475830078,
      "learning_rate": 0.00017785089728011798,
      "loss": 448.5767,
      "step": 230
    },
    {
      "epoch": 0.26238469422617017,
      "grad_norm": 118.7121810913086,
      "learning_rate": 0.00017586930191068655,
      "loss": 448.7768,
      "step": 240
    },
    {
      "epoch": 0.2733173898189272,
      "grad_norm": 78.55050659179688,
      "learning_rate": 0.00017381488578524173,
      "loss": 447.5193,
      "step": 250
    },
    {
      "epoch": 0.28425008541168434,
      "grad_norm": 143.79925537109375,
      "learning_rate": 0.00017168962077029147,
      "loss": 443.3945,
      "step": 260
    },
    {
      "epoch": 0.2951827810044414,
      "grad_norm": 107.96155548095703,
      "learning_rate": 0.00016949554673441534,
      "loss": 440.8445,
      "step": 270
    },
    {
      "epoch": 0.3061154765971985,
      "grad_norm": 86.58111572265625,
      "learning_rate": 0.00016723476959036083,
      "loss": 439.1689,
      "step": 280
    },
    {
      "epoch": 0.31704817218995557,
      "grad_norm": 224.989013671875,
      "learning_rate": 0.0001649094592737497,
      "loss": 444.2089,
      "step": 290
    },
    {
      "epoch": 0.3279808677827127,
      "grad_norm": 114.99178314208984,
      "learning_rate": 0.00016252184766033342,
      "loss": 449.9366,
      "step": 300
    },
    {
      "epoch": 0.3279808677827127,
      "eval_loss": 7.026218414306641,
      "eval_runtime": 58.1336,
      "eval_samples_per_second": 161.112,
      "eval_steps_per_second": 10.08,
      "step": 300
    },
    {
      "epoch": 0.33891356337546974,
      "grad_norm": 66.66151428222656,
      "learning_rate": 0.0001600742264237979,
      "loss": 446.0257,
      "step": 310
    },
    {
      "epoch": 0.34984625896822685,
      "grad_norm": 83.69017791748047,
      "learning_rate": 0.00015756894483617267,
      "loss": 438.0339,
      "step": 320
    },
    {
      "epoch": 0.36077895456098397,
      "grad_norm": 71.6341552734375,
      "learning_rate": 0.0001550084075129563,
      "loss": 435.6428,
      "step": 330
    },
    {
      "epoch": 0.371711650153741,
      "grad_norm": 76.65645599365234,
      "learning_rate": 0.00015239507210512194,
      "loss": 437.0487,
      "step": 340
    },
    {
      "epoch": 0.38264434574649814,
      "grad_norm": 207.73638916015625,
      "learning_rate": 0.00014973144694021876,
      "loss": 437.2165,
      "step": 350
    },
    {
      "epoch": 0.3935770413392552,
      "grad_norm": 90.09326171875,
      "learning_rate": 0.00014702008861483266,
      "loss": 447.2774,
      "step": 360
    },
    {
      "epoch": 0.4045097369320123,
      "grad_norm": 60.52599334716797,
      "learning_rate": 0.00014426359954071796,
      "loss": 442.0876,
      "step": 370
    },
    {
      "epoch": 0.41544243252476937,
      "grad_norm": 55.786434173583984,
      "learning_rate": 0.00014146462544695426,
      "loss": 435.3542,
      "step": 380
    },
    {
      "epoch": 0.4263751281175265,
      "grad_norm": 185.42239379882812,
      "learning_rate": 0.00013862585284052714,
      "loss": 431.46,
      "step": 390
    },
    {
      "epoch": 0.4373078237102836,
      "grad_norm": 90.76365661621094,
      "learning_rate": 0.00013575000642776893,
      "loss": 435.4126,
      "step": 400
    },
    {
      "epoch": 0.4373078237102836,
      "eval_loss": 6.82781982421875,
      "eval_runtime": 58.072,
      "eval_samples_per_second": 161.283,
      "eval_steps_per_second": 10.091,
      "step": 400
    },
    {
      "epoch": 0.44824051930304065,
      "grad_norm": 70.12236022949219,
      "learning_rate": 0.0001328398464991355,
      "loss": 436.4229,
      "step": 410
    },
    {
      "epoch": 0.45917321489579777,
      "grad_norm": 55.41183090209961,
      "learning_rate": 0.00012989816627982848,
      "loss": 434.1986,
      "step": 420
    },
    {
      "epoch": 0.4701059104885548,
      "grad_norm": 247.78492736816406,
      "learning_rate": 0.00012692778924880603,
      "loss": 432.3758,
      "step": 430
    },
    {
      "epoch": 0.48103860608131194,
      "grad_norm": 130.10150146484375,
      "learning_rate": 0.0001239315664287558,
      "loss": 438.7516,
      "step": 440
    },
    {
      "epoch": 0.491971301674069,
      "grad_norm": 124.00569915771484,
      "learning_rate": 0.00012091237364963071,
      "loss": 443.4303,
      "step": 450
    },
    {
      "epoch": 0.5029039972668261,
      "grad_norm": 70.69654846191406,
      "learning_rate": 0.00011787310878837422,
      "loss": 439.3571,
      "step": 460
    },
    {
      "epoch": 0.5138366928595832,
      "grad_norm": 48.626583099365234,
      "learning_rate": 0.00011481668898748475,
      "loss": 433.0221,
      "step": 470
    },
    {
      "epoch": 0.5247693884523403,
      "grad_norm": 47.17863464355469,
      "learning_rate": 0.00011174604785508813,
      "loss": 428.8387,
      "step": 480
    },
    {
      "epoch": 0.5357020840450973,
      "grad_norm": 84.66542053222656,
      "learning_rate": 0.00010866413264920678,
      "loss": 426.4283,
      "step": 490
    },
    {
      "epoch": 0.5466347796378545,
      "grad_norm": 213.61886596679688,
      "learning_rate": 0.00010557390144892684,
      "loss": 427.7443,
      "step": 500
    },
    {
      "epoch": 0.5466347796378545,
      "eval_loss": 6.722092151641846,
      "eval_runtime": 58.1233,
      "eval_samples_per_second": 161.14,
      "eval_steps_per_second": 10.082,
      "step": 500
    },
    {
      "epoch": 0.5575674752306116,
      "grad_norm": 122.73770141601562,
      "learning_rate": 0.0001024783203151793,
      "loss": 433.5011,
      "step": 510
    },
    {
      "epoch": 0.5685001708233687,
      "grad_norm": 101.38385009765625,
      "learning_rate": 9.938036044386005e-05,
      "loss": 435.1738,
      "step": 520
    },
    {
      "epoch": 0.5794328664161257,
      "grad_norm": 114.54944610595703,
      "learning_rate": 9.628299531402117e-05,
      "loss": 434.8967,
      "step": 530
    },
    {
      "epoch": 0.5903655620088828,
      "grad_norm": 84.79987335205078,
      "learning_rate": 9.318919783387094e-05,
      "loss": 432.9765,
      "step": 540
    },
    {
      "epoch": 0.6012982576016399,
      "grad_norm": 61.6598014831543,
      "learning_rate": 9.010193748732155e-05,
      "loss": 430.1559,
      "step": 550
    },
    {
      "epoch": 0.612230953194397,
      "grad_norm": 99.44583129882812,
      "learning_rate": 8.702417748382385e-05,
      "loss": 429.6009,
      "step": 560
    },
    {
      "epoch": 0.6231636487871541,
      "grad_norm": 74.24359130859375,
      "learning_rate": 8.395887191422397e-05,
      "loss": 430.6433,
      "step": 570
    },
    {
      "epoch": 0.6340963443799111,
      "grad_norm": 62.34027099609375,
      "learning_rate": 8.090896291537273e-05,
      "loss": 429.9604,
      "step": 580
    },
    {
      "epoch": 0.6450290399726682,
      "grad_norm": 142.7094268798828,
      "learning_rate": 7.787737784620803e-05,
      "loss": 432.6778,
      "step": 590
    },
    {
      "epoch": 0.6559617355654254,
      "grad_norm": 86.1541748046875,
      "learning_rate": 7.486702647802213e-05,
      "loss": 433.9323,
      "step": 600
    },
    {
      "epoch": 0.6559617355654254,
      "eval_loss": 6.775545120239258,
      "eval_runtime": 58.0771,
      "eval_samples_per_second": 161.268,
      "eval_steps_per_second": 10.09,
      "step": 600
    },
    {
      "epoch": 0.6668944311581825,
      "grad_norm": 62.604488372802734,
      "learning_rate": 7.188079820160904e-05,
      "loss": 431.3976,
      "step": 610
    },
    {
      "epoch": 0.6778271267509395,
      "grad_norm": 86.99163818359375,
      "learning_rate": 6.892155925397436e-05,
      "loss": 429.0467,
      "step": 620
    },
    {
      "epoch": 0.6887598223436966,
      "grad_norm": 60.84334945678711,
      "learning_rate": 6.59921499672677e-05,
      "loss": 426.724,
      "step": 630
    },
    {
      "epoch": 0.6996925179364537,
      "grad_norm": 63.2689208984375,
      "learning_rate": 6.309538204257977e-05,
      "loss": 426.1021,
      "step": 640
    },
    {
      "epoch": 0.7106252135292108,
      "grad_norm": 100.661865234375,
      "learning_rate": 6.02340358512196e-05,
      "loss": 426.2488,
      "step": 650
    },
    {
      "epoch": 0.7215579091219679,
      "grad_norm": 98.77324676513672,
      "learning_rate": 5.7410857766062966e-05,
      "loss": 426.7232,
      "step": 660
    },
    {
      "epoch": 0.7324906047147249,
      "grad_norm": 83.56670379638672,
      "learning_rate": 5.4628557525532976e-05,
      "loss": 429.1631,
      "step": 670
    },
    {
      "epoch": 0.743423300307482,
      "grad_norm": 123.90058135986328,
      "learning_rate": 5.188980563274315e-05,
      "loss": 430.625,
      "step": 680
    },
    {
      "epoch": 0.7543559959002392,
      "grad_norm": 96.76585388183594,
      "learning_rate": 4.9197230792299195e-05,
      "loss": 430.7337,
      "step": 690
    },
    {
      "epoch": 0.7652886914929963,
      "grad_norm": 110.13320922851562,
      "learning_rate": 4.6553417387219886e-05,
      "loss": 429.4984,
      "step": 700
    },
    {
      "epoch": 0.7652886914929963,
      "eval_loss": 6.70954704284668,
      "eval_runtime": 58.0031,
      "eval_samples_per_second": 161.474,
      "eval_steps_per_second": 10.103,
      "step": 700
    },
    {
      "epoch": 0.7762213870857533,
      "grad_norm": 88.53307342529297,
      "learning_rate": 4.396090299839852e-05,
      "loss": 429.292,
      "step": 710
    },
    {
      "epoch": 0.7871540826785104,
      "grad_norm": 83.58421325683594,
      "learning_rate": 4.1422175968985955e-05,
      "loss": 428.1954,
      "step": 720
    },
    {
      "epoch": 0.7980867782712675,
      "grad_norm": 68.06840515136719,
      "learning_rate": 3.8939673016032953e-05,
      "loss": 427.5196,
      "step": 730
    },
    {
      "epoch": 0.8090194738640246,
      "grad_norm": 101.87394714355469,
      "learning_rate": 3.651577689168405e-05,
      "loss": 426.9831,
      "step": 740
    },
    {
      "epoch": 0.8199521694567817,
      "grad_norm": 91.1787338256836,
      "learning_rate": 3.415281409616844e-05,
      "loss": 426.4039,
      "step": 750
    },
    {
      "epoch": 0.8308848650495387,
      "grad_norm": 76.29208374023438,
      "learning_rate": 3.185305264478159e-05,
      "loss": 427.1363,
      "step": 760
    },
    {
      "epoch": 0.8418175606422958,
      "grad_norm": 106.453369140625,
      "learning_rate": 2.9618699891002843e-05,
      "loss": 428.5345,
      "step": 770
    },
    {
      "epoch": 0.852750256235053,
      "grad_norm": 77.79237365722656,
      "learning_rate": 2.745190040783646e-05,
      "loss": 428.4736,
      "step": 780
    },
    {
      "epoch": 0.8636829518278101,
      "grad_norm": 78.0290756225586,
      "learning_rate": 2.5354733929410977e-05,
      "loss": 427.3446,
      "step": 790
    },
    {
      "epoch": 0.8746156474205672,
      "grad_norm": 82.6236801147461,
      "learning_rate": 2.332921335481205e-05,
      "loss": 426.4013,
      "step": 800
    },
    {
      "epoch": 0.8746156474205672,
      "eval_loss": 6.655261516571045,
      "eval_runtime": 57.9158,
      "eval_samples_per_second": 161.718,
      "eval_steps_per_second": 10.118,
      "step": 800
    },
    {
      "epoch": 0.8855483430133242,
      "grad_norm": 88.00137329101562,
      "learning_rate": 2.137728281606475e-05,
      "loss": 425.3787,
      "step": 810
    },
    {
      "epoch": 0.8964810386060813,
      "grad_norm": 68.48820495605469,
      "learning_rate": 2e-05,
      "loss": 425.5655,
      "step": 820
    },
    {
      "epoch": 0.9074137341988384,
      "grad_norm": 99.16329956054688,
      "learning_rate": 2e-05,
      "loss": 425.2563,
      "step": 830
    },
    {
      "epoch": 0.9183464297915955,
      "grad_norm": 82.26581573486328,
      "learning_rate": 2e-05,
      "loss": 424.7282,
      "step": 840
    },
    {
      "epoch": 0.9292791253843525,
      "grad_norm": 76.19690704345703,
      "learning_rate": 2e-05,
      "loss": 424.4672,
      "step": 850
    },
    {
      "epoch": 0.9402118209771096,
      "grad_norm": 76.67041778564453,
      "learning_rate": 2e-05,
      "loss": 424.8196,
      "step": 860
    },
    {
      "epoch": 0.9511445165698668,
      "grad_norm": 91.26239013671875,
      "learning_rate": 2e-05,
      "loss": 424.8494,
      "step": 870
    },
    {
      "epoch": 0.9620772121626239,
      "grad_norm": 87.45671844482422,
      "learning_rate": 2e-05,
      "loss": 424.553,
      "step": 880
    },
    {
      "epoch": 0.973009907755381,
      "grad_norm": 83.90388488769531,
      "learning_rate": 2e-05,
      "loss": 424.6736,
      "step": 890
    },
    {
      "epoch": 0.983942603348138,
      "grad_norm": 92.74352264404297,
      "learning_rate": 2e-05,
      "loss": 425.6448,
      "step": 900
    },
    {
      "epoch": 0.983942603348138,
      "eval_loss": 6.660056114196777,
      "eval_runtime": 57.8375,
      "eval_samples_per_second": 161.937,
      "eval_steps_per_second": 10.132,
      "step": 900
    },
    {
      "epoch": 0.9948752989408951,
      "grad_norm": 82.22098541259766,
      "learning_rate": 2e-05,
      "loss": 426.0456,
      "step": 910
    },
    {
      "epoch": 1.0054663477963786,
      "grad_norm": 85.19371032714844,
      "learning_rate": 2e-05,
      "loss": 413.45,
      "step": 920
    },
    {
      "epoch": 1.0163990433891357,
      "grad_norm": 114.56877899169922,
      "learning_rate": 2e-05,
      "loss": 426.7532,
      "step": 930
    },
    {
      "epoch": 1.0273317389818928,
      "grad_norm": 101.91117858886719,
      "learning_rate": 2e-05,
      "loss": 426.5692,
      "step": 940
    },
    {
      "epoch": 1.03826443457465,
      "grad_norm": 98.91850280761719,
      "learning_rate": 2e-05,
      "loss": 427.7487,
      "step": 950
    },
    {
      "epoch": 1.049197130167407,
      "grad_norm": 124.97265625,
      "learning_rate": 2e-05,
      "loss": 427.6109,
      "step": 960
    },
    {
      "epoch": 1.060129825760164,
      "grad_norm": 118.64958190917969,
      "learning_rate": 2e-05,
      "loss": 427.6372,
      "step": 970
    },
    {
      "epoch": 1.071062521352921,
      "grad_norm": 109.47184753417969,
      "learning_rate": 2e-05,
      "loss": 428.8791,
      "step": 980
    },
    {
      "epoch": 1.0819952169456781,
      "grad_norm": 130.62338256835938,
      "learning_rate": 2e-05,
      "loss": 429.3186,
      "step": 990
    },
    {
      "epoch": 1.0929279125384352,
      "grad_norm": 127.20825958251953,
      "learning_rate": 2e-05,
      "loss": 429.1682,
      "step": 1000
    },
    {
      "epoch": 1.0929279125384352,
      "eval_loss": 6.715859413146973,
      "eval_runtime": 57.8946,
      "eval_samples_per_second": 161.777,
      "eval_steps_per_second": 10.122,
      "step": 1000
    },
    {
      "epoch": 1.1038606081311924,
      "grad_norm": 105.07234191894531,
      "learning_rate": 2e-05,
      "loss": 429.3496,
      "step": 1010
    },
    {
      "epoch": 1.1147933037239495,
      "grad_norm": 143.77236938476562,
      "learning_rate": 2e-05,
      "loss": 429.6381,
      "step": 1020
    }
  ],
  "logging_steps": 10,
  "max_steps": 1024,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 1024,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8.705517388910559e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}