diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6606 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500.0, + "global_step": 18789, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003193357815743254, + "grad_norm": 1.9077140880050831, + "learning_rate": 2.1287919105907397e-07, + "loss": 0.293, + "step": 20 + }, + { + "epoch": 0.006386715631486508, + "grad_norm": 1.2741126203759185, + "learning_rate": 4.2575838211814794e-07, + "loss": 0.2755, + "step": 40 + }, + { + "epoch": 0.009580073447229762, + "grad_norm": 0.9579957272491677, + "learning_rate": 6.38637573177222e-07, + "loss": 0.2663, + "step": 60 + }, + { + "epoch": 0.012773431262973017, + "grad_norm": 0.7572791753798676, + "learning_rate": 8.515167642362959e-07, + "loss": 0.2483, + "step": 80 + }, + { + "epoch": 0.01596678907871627, + "grad_norm": 0.7631906150781687, + "learning_rate": 1.0643959552953699e-06, + "loss": 0.2345, + "step": 100 + }, + { + "epoch": 0.019160146894459523, + "grad_norm": 0.6910231975995771, + "learning_rate": 1.277275146354444e-06, + "loss": 0.2325, + "step": 120 + }, + { + "epoch": 0.02235350471020278, + "grad_norm": 0.6148039205762914, + "learning_rate": 1.490154337413518e-06, + "loss": 0.2204, + "step": 140 + }, + { + "epoch": 0.025546862525946033, + "grad_norm": 0.5789874935423447, + "learning_rate": 1.7030335284725918e-06, + "loss": 0.2098, + "step": 160 + }, + { + "epoch": 0.028740220341689285, + "grad_norm": 0.7476463496285388, + "learning_rate": 1.915912719531666e-06, + "loss": 0.2032, + "step": 180 + }, + { + "epoch": 0.03193357815743254, + "grad_norm": 0.7428740533196578, + "learning_rate": 2.1287919105907398e-06, + "loss": 0.2012, + "step": 200 + }, + { + "epoch": 0.035126935973175795, + "grad_norm": 0.7037948631808884, + "learning_rate": 2.341671101649814e-06, + "loss": 0.2009, + "step": 220 + }, + { + "epoch": 0.038320293788919046, + "grad_norm": 0.7550244833274011, + "learning_rate": 2.554550292708888e-06, + "loss": 0.1909, + "step": 240 + }, + { + "epoch": 0.041513651604662305, + "grad_norm": 0.6563323462206586, + "learning_rate": 2.7674294837679623e-06, + "loss": 0.1921, + "step": 260 + }, + { + "epoch": 0.04470700942040556, + "grad_norm": 0.6164020880535914, + "learning_rate": 2.980308674827036e-06, + "loss": 0.18, + "step": 280 + }, + { + "epoch": 0.04790036723614881, + "grad_norm": 0.6690736448965662, + "learning_rate": 3.1931878658861097e-06, + "loss": 0.1775, + "step": 300 + }, + { + "epoch": 0.05109372505189207, + "grad_norm": 0.6290006787340572, + "learning_rate": 3.4060670569451835e-06, + "loss": 0.1806, + "step": 320 + }, + { + "epoch": 0.05428708286763532, + "grad_norm": 0.6635212199407985, + "learning_rate": 3.6189462480042583e-06, + "loss": 0.1779, + "step": 340 + }, + { + "epoch": 0.05748044068337857, + "grad_norm": 0.6197593777818997, + "learning_rate": 3.831825439063332e-06, + "loss": 0.1702, + "step": 360 + }, + { + "epoch": 0.06067379849912183, + "grad_norm": 0.7111109613175086, + "learning_rate": 4.044704630122406e-06, + "loss": 0.1726, + "step": 380 + }, + { + "epoch": 0.06386715631486509, + "grad_norm": 0.7014844198325911, + "learning_rate": 4.2575838211814795e-06, + "loss": 0.1677, + "step": 400 + }, + { + "epoch": 0.06706051413060833, + "grad_norm": 0.6960545421354288, + "learning_rate": 4.470463012240554e-06, + "loss": 0.1661, + "step": 420 + }, + { + "epoch": 0.07025387194635159, + "grad_norm": 0.6526527280390234, + "learning_rate": 4.683342203299628e-06, + "loss": 0.1599, + "step": 440 + }, + { + "epoch": 0.07344722976209485, + "grad_norm": 0.6231986280201518, + "learning_rate": 4.896221394358702e-06, + "loss": 0.1608, + "step": 460 + }, + { + "epoch": 0.07664058757783809, + "grad_norm": 0.6963837819044139, + "learning_rate": 5.109100585417776e-06, + "loss": 0.1622, + "step": 480 + }, + { + "epoch": 0.07983394539358135, + "grad_norm": 0.6109671730909707, + "learning_rate": 5.32197977647685e-06, + "loss": 0.1598, + "step": 500 + }, + { + "epoch": 0.08302730320932461, + "grad_norm": 0.5371960923403704, + "learning_rate": 5.534858967535925e-06, + "loss": 0.1594, + "step": 520 + }, + { + "epoch": 0.08622066102506785, + "grad_norm": 0.5367820828152228, + "learning_rate": 5.747738158594997e-06, + "loss": 0.1596, + "step": 540 + }, + { + "epoch": 0.08941401884081111, + "grad_norm": 0.5470627592374788, + "learning_rate": 5.960617349654072e-06, + "loss": 0.1572, + "step": 560 + }, + { + "epoch": 0.09260737665655437, + "grad_norm": 0.6499813395079859, + "learning_rate": 6.173496540713145e-06, + "loss": 0.1608, + "step": 580 + }, + { + "epoch": 0.09580073447229762, + "grad_norm": 0.5979118456987372, + "learning_rate": 6.386375731772219e-06, + "loss": 0.1556, + "step": 600 + }, + { + "epoch": 0.09899409228804087, + "grad_norm": 0.6192365185802242, + "learning_rate": 6.5992549228312945e-06, + "loss": 0.1558, + "step": 620 + }, + { + "epoch": 0.10218745010378413, + "grad_norm": 0.6113365486687661, + "learning_rate": 6.812134113890367e-06, + "loss": 0.1529, + "step": 640 + }, + { + "epoch": 0.10538080791952738, + "grad_norm": 0.5734345240030044, + "learning_rate": 7.025013304949441e-06, + "loss": 0.1569, + "step": 660 + }, + { + "epoch": 0.10857416573527064, + "grad_norm": 0.5661084962098065, + "learning_rate": 7.2378924960085166e-06, + "loss": 0.1516, + "step": 680 + }, + { + "epoch": 0.1117675235510139, + "grad_norm": 0.5846067384703525, + "learning_rate": 7.450771687067589e-06, + "loss": 0.15, + "step": 700 + }, + { + "epoch": 0.11496088136675714, + "grad_norm": 0.6374604468076045, + "learning_rate": 7.663650878126664e-06, + "loss": 0.1595, + "step": 720 + }, + { + "epoch": 0.1181542391825004, + "grad_norm": 0.6518179456002492, + "learning_rate": 7.876530069185738e-06, + "loss": 0.1534, + "step": 740 + }, + { + "epoch": 0.12134759699824366, + "grad_norm": 0.6418033051046158, + "learning_rate": 8.089409260244812e-06, + "loss": 0.1544, + "step": 760 + }, + { + "epoch": 0.1245409548139869, + "grad_norm": 0.5560485727454112, + "learning_rate": 8.302288451303886e-06, + "loss": 0.1519, + "step": 780 + }, + { + "epoch": 0.12773431262973017, + "grad_norm": 0.5218192022156695, + "learning_rate": 8.515167642362959e-06, + "loss": 0.1526, + "step": 800 + }, + { + "epoch": 0.1309276704454734, + "grad_norm": 0.5873485030073047, + "learning_rate": 8.728046833422033e-06, + "loss": 0.1477, + "step": 820 + }, + { + "epoch": 0.13412102826121666, + "grad_norm": 0.52521133386056, + "learning_rate": 8.940926024481108e-06, + "loss": 0.1462, + "step": 840 + }, + { + "epoch": 0.13731438607695992, + "grad_norm": 0.49868364100795765, + "learning_rate": 9.153805215540182e-06, + "loss": 0.1459, + "step": 860 + }, + { + "epoch": 0.14050774389270318, + "grad_norm": 0.46689167597503883, + "learning_rate": 9.366684406599256e-06, + "loss": 0.1522, + "step": 880 + }, + { + "epoch": 0.14370110170844644, + "grad_norm": 0.5355408722325465, + "learning_rate": 9.57956359765833e-06, + "loss": 0.1512, + "step": 900 + }, + { + "epoch": 0.1468944595241897, + "grad_norm": 0.46406202651388007, + "learning_rate": 9.792442788717403e-06, + "loss": 0.1503, + "step": 920 + }, + { + "epoch": 0.15008781733993293, + "grad_norm": 0.5480845186900385, + "learning_rate": 1.0005321979776476e-05, + "loss": 0.1474, + "step": 940 + }, + { + "epoch": 0.15328117515567619, + "grad_norm": 0.5541284715722858, + "learning_rate": 1.0218201170835552e-05, + "loss": 0.1469, + "step": 960 + }, + { + "epoch": 0.15647453297141944, + "grad_norm": 0.6186186072342443, + "learning_rate": 1.0431080361894626e-05, + "loss": 0.1495, + "step": 980 + }, + { + "epoch": 0.1596678907871627, + "grad_norm": 0.5921353515589192, + "learning_rate": 1.06439595529537e-05, + "loss": 0.1463, + "step": 1000 + }, + { + "epoch": 0.16286124860290596, + "grad_norm": 0.5433507613364311, + "learning_rate": 1.0856838744012775e-05, + "loss": 0.1472, + "step": 1020 + }, + { + "epoch": 0.16605460641864922, + "grad_norm": 0.5979565391588779, + "learning_rate": 1.106971793507185e-05, + "loss": 0.1506, + "step": 1040 + }, + { + "epoch": 0.16924796423439245, + "grad_norm": 0.48287899522708827, + "learning_rate": 1.128259712613092e-05, + "loss": 0.1486, + "step": 1060 + }, + { + "epoch": 0.1724413220501357, + "grad_norm": 0.5655815878985752, + "learning_rate": 1.1495476317189994e-05, + "loss": 0.1457, + "step": 1080 + }, + { + "epoch": 0.17563467986587897, + "grad_norm": 0.512570124033007, + "learning_rate": 1.170835550824907e-05, + "loss": 0.1461, + "step": 1100 + }, + { + "epoch": 0.17882803768162223, + "grad_norm": 0.46185823489920147, + "learning_rate": 1.1921234699308145e-05, + "loss": 0.144, + "step": 1120 + }, + { + "epoch": 0.18202139549736548, + "grad_norm": 0.44757029208547805, + "learning_rate": 1.2134113890367219e-05, + "loss": 0.1473, + "step": 1140 + }, + { + "epoch": 0.18521475331310874, + "grad_norm": 0.4396528603456714, + "learning_rate": 1.234699308142629e-05, + "loss": 0.142, + "step": 1160 + }, + { + "epoch": 0.188408111128852, + "grad_norm": 0.4916233270385887, + "learning_rate": 1.2559872272485364e-05, + "loss": 0.1459, + "step": 1180 + }, + { + "epoch": 0.19160146894459523, + "grad_norm": 0.49164646615860214, + "learning_rate": 1.2772751463544439e-05, + "loss": 0.1442, + "step": 1200 + }, + { + "epoch": 0.1947948267603385, + "grad_norm": 0.47947460832408273, + "learning_rate": 1.2985630654603515e-05, + "loss": 0.1491, + "step": 1220 + }, + { + "epoch": 0.19798818457608175, + "grad_norm": 0.5469424153214216, + "learning_rate": 1.3198509845662589e-05, + "loss": 0.1502, + "step": 1240 + }, + { + "epoch": 0.201181542391825, + "grad_norm": 0.4638562794822929, + "learning_rate": 1.3411389036721663e-05, + "loss": 0.1442, + "step": 1260 + }, + { + "epoch": 0.20437490020756827, + "grad_norm": 0.44344070089234844, + "learning_rate": 1.3624268227780734e-05, + "loss": 0.1488, + "step": 1280 + }, + { + "epoch": 0.20756825802331152, + "grad_norm": 0.47717361703152655, + "learning_rate": 1.3837147418839808e-05, + "loss": 0.1488, + "step": 1300 + }, + { + "epoch": 0.21076161583905476, + "grad_norm": 0.4703261039559369, + "learning_rate": 1.4050026609898883e-05, + "loss": 0.145, + "step": 1320 + }, + { + "epoch": 0.21395497365479801, + "grad_norm": 0.45451915177321617, + "learning_rate": 1.4262905800957957e-05, + "loss": 0.1514, + "step": 1340 + }, + { + "epoch": 0.21714833147054127, + "grad_norm": 0.4705441248128481, + "learning_rate": 1.4475784992017033e-05, + "loss": 0.1487, + "step": 1360 + }, + { + "epoch": 0.22034168928628453, + "grad_norm": 0.4369632636042999, + "learning_rate": 1.4688664183076104e-05, + "loss": 0.1459, + "step": 1380 + }, + { + "epoch": 0.2235350471020278, + "grad_norm": 0.42608257790275605, + "learning_rate": 1.4901543374135178e-05, + "loss": 0.1455, + "step": 1400 + }, + { + "epoch": 0.22672840491777105, + "grad_norm": 0.50356002082837, + "learning_rate": 1.5114422565194253e-05, + "loss": 0.1451, + "step": 1420 + }, + { + "epoch": 0.22992176273351428, + "grad_norm": 0.4561937833231143, + "learning_rate": 1.5327301756253327e-05, + "loss": 0.1477, + "step": 1440 + }, + { + "epoch": 0.23311512054925754, + "grad_norm": 0.40765594909388037, + "learning_rate": 1.55401809473124e-05, + "loss": 0.1425, + "step": 1460 + }, + { + "epoch": 0.2363084783650008, + "grad_norm": 0.487476872013174, + "learning_rate": 1.5753060138371476e-05, + "loss": 0.1412, + "step": 1480 + }, + { + "epoch": 0.23950183618074405, + "grad_norm": 0.4680001690322545, + "learning_rate": 1.596593932943055e-05, + "loss": 0.1443, + "step": 1500 + }, + { + "epoch": 0.2426951939964873, + "grad_norm": 0.41230879655966063, + "learning_rate": 1.6178818520489624e-05, + "loss": 0.1455, + "step": 1520 + }, + { + "epoch": 0.24588855181223057, + "grad_norm": 0.4315075010200903, + "learning_rate": 1.63916977115487e-05, + "loss": 0.1453, + "step": 1540 + }, + { + "epoch": 0.2490819096279738, + "grad_norm": 0.3880821656792041, + "learning_rate": 1.6604576902607773e-05, + "loss": 0.1367, + "step": 1560 + }, + { + "epoch": 0.2522752674437171, + "grad_norm": 0.4170991591966089, + "learning_rate": 1.6817456093666847e-05, + "loss": 0.1444, + "step": 1580 + }, + { + "epoch": 0.25546862525946035, + "grad_norm": 0.4352470317730404, + "learning_rate": 1.7030335284725918e-05, + "loss": 0.1462, + "step": 1600 + }, + { + "epoch": 0.25866198307520355, + "grad_norm": 0.41926976953754025, + "learning_rate": 1.7243214475784992e-05, + "loss": 0.1427, + "step": 1620 + }, + { + "epoch": 0.2618553408909468, + "grad_norm": 0.4067020140616968, + "learning_rate": 1.7456093666844067e-05, + "loss": 0.1435, + "step": 1640 + }, + { + "epoch": 0.26504869870669007, + "grad_norm": 0.4568169742722482, + "learning_rate": 1.766897285790314e-05, + "loss": 0.1425, + "step": 1660 + }, + { + "epoch": 0.2682420565224333, + "grad_norm": 0.4952678328950158, + "learning_rate": 1.7881852048962215e-05, + "loss": 0.1411, + "step": 1680 + }, + { + "epoch": 0.2714354143381766, + "grad_norm": 0.36574600840843885, + "learning_rate": 1.809473124002129e-05, + "loss": 0.1424, + "step": 1700 + }, + { + "epoch": 0.27462877215391984, + "grad_norm": 0.40710244186170225, + "learning_rate": 1.8307610431080364e-05, + "loss": 0.1435, + "step": 1720 + }, + { + "epoch": 0.2778221299696631, + "grad_norm": 0.41415797524036474, + "learning_rate": 1.852048962213944e-05, + "loss": 0.1443, + "step": 1740 + }, + { + "epoch": 0.28101548778540636, + "grad_norm": 0.38093938436737673, + "learning_rate": 1.8733368813198513e-05, + "loss": 0.1459, + "step": 1760 + }, + { + "epoch": 0.2842088456011496, + "grad_norm": 0.36699157301783514, + "learning_rate": 1.8946248004257587e-05, + "loss": 0.1503, + "step": 1780 + }, + { + "epoch": 0.2874022034168929, + "grad_norm": 0.4426133669933364, + "learning_rate": 1.915912719531666e-05, + "loss": 0.1458, + "step": 1800 + }, + { + "epoch": 0.29059556123263613, + "grad_norm": 0.37577866094305634, + "learning_rate": 1.9372006386375732e-05, + "loss": 0.1437, + "step": 1820 + }, + { + "epoch": 0.2937889190483794, + "grad_norm": 0.3853315977661372, + "learning_rate": 1.9584885577434807e-05, + "loss": 0.1424, + "step": 1840 + }, + { + "epoch": 0.29698227686412265, + "grad_norm": 0.39658703817733554, + "learning_rate": 1.979776476849388e-05, + "loss": 0.143, + "step": 1860 + }, + { + "epoch": 0.30017563467986585, + "grad_norm": 0.34168487028906286, + "learning_rate": 1.9999999827423154e-05, + "loss": 0.1472, + "step": 1880 + }, + { + "epoch": 0.3033689924956091, + "grad_norm": 0.428099056379712, + "learning_rate": 1.9999923893706236e-05, + "loss": 0.1424, + "step": 1900 + }, + { + "epoch": 0.30656235031135237, + "grad_norm": 0.47981664372403626, + "learning_rate": 1.9999709899719893e-05, + "loss": 0.1414, + "step": 1920 + }, + { + "epoch": 0.30975570812709563, + "grad_norm": 0.4495236935209742, + "learning_rate": 1.9999357848418547e-05, + "loss": 0.1432, + "step": 1940 + }, + { + "epoch": 0.3129490659428389, + "grad_norm": 0.4335827442743115, + "learning_rate": 1.999886774466267e-05, + "loss": 0.1449, + "step": 1960 + }, + { + "epoch": 0.31614242375858215, + "grad_norm": 0.3740214770732922, + "learning_rate": 1.9998239595218693e-05, + "loss": 0.1455, + "step": 1980 + }, + { + "epoch": 0.3193357815743254, + "grad_norm": 0.35431822792110484, + "learning_rate": 1.999747340875894e-05, + "loss": 0.14, + "step": 2000 + }, + { + "epoch": 0.32252913939006866, + "grad_norm": 0.37271856793106084, + "learning_rate": 1.9996569195861474e-05, + "loss": 0.1433, + "step": 2020 + }, + { + "epoch": 0.3257224972058119, + "grad_norm": 0.36904721824612496, + "learning_rate": 1.999552696900998e-05, + "loss": 0.1474, + "step": 2040 + }, + { + "epoch": 0.3289158550215552, + "grad_norm": 0.4329302625174645, + "learning_rate": 1.9994346742593577e-05, + "loss": 0.1409, + "step": 2060 + }, + { + "epoch": 0.33210921283729844, + "grad_norm": 0.4659341494260738, + "learning_rate": 1.999302853290663e-05, + "loss": 0.1453, + "step": 2080 + }, + { + "epoch": 0.3353025706530417, + "grad_norm": 0.40127911103988617, + "learning_rate": 1.9991572358148522e-05, + "loss": 0.1396, + "step": 2100 + }, + { + "epoch": 0.3384959284687849, + "grad_norm": 0.3087442313177786, + "learning_rate": 1.9989978238423383e-05, + "loss": 0.1474, + "step": 2120 + }, + { + "epoch": 0.34168928628452816, + "grad_norm": 0.37193584969289195, + "learning_rate": 1.9988246195739846e-05, + "loss": 0.1422, + "step": 2140 + }, + { + "epoch": 0.3448826441002714, + "grad_norm": 0.3814913494874711, + "learning_rate": 1.998637625401072e-05, + "loss": 0.1422, + "step": 2160 + }, + { + "epoch": 0.3480760019160147, + "grad_norm": 0.34898690836821306, + "learning_rate": 1.9984368439052668e-05, + "loss": 0.1396, + "step": 2180 + }, + { + "epoch": 0.35126935973175794, + "grad_norm": 0.3951747534549505, + "learning_rate": 1.9982222778585845e-05, + "loss": 0.1458, + "step": 2200 + }, + { + "epoch": 0.3544627175475012, + "grad_norm": 0.34562618895160807, + "learning_rate": 1.9979939302233524e-05, + "loss": 0.1402, + "step": 2220 + }, + { + "epoch": 0.35765607536324445, + "grad_norm": 0.388573729018997, + "learning_rate": 1.9977518041521683e-05, + "loss": 0.1402, + "step": 2240 + }, + { + "epoch": 0.3608494331789877, + "grad_norm": 0.37200937634013864, + "learning_rate": 1.9974959029878568e-05, + "loss": 0.1438, + "step": 2260 + }, + { + "epoch": 0.36404279099473097, + "grad_norm": 0.3965761915716373, + "learning_rate": 1.9972262302634228e-05, + "loss": 0.1401, + "step": 2280 + }, + { + "epoch": 0.3672361488104742, + "grad_norm": 0.3173011648096044, + "learning_rate": 1.996942789702004e-05, + "loss": 0.1392, + "step": 2300 + }, + { + "epoch": 0.3704295066262175, + "grad_norm": 0.3017733588197737, + "learning_rate": 1.996645585216818e-05, + "loss": 0.1424, + "step": 2320 + }, + { + "epoch": 0.37362286444196074, + "grad_norm": 0.3363017850364413, + "learning_rate": 1.9963346209111084e-05, + "loss": 0.1396, + "step": 2340 + }, + { + "epoch": 0.376816222257704, + "grad_norm": 0.36015352029461045, + "learning_rate": 1.9960099010780906e-05, + "loss": 0.1364, + "step": 2360 + }, + { + "epoch": 0.3800095800734472, + "grad_norm": 0.3944315476618534, + "learning_rate": 1.995671430200889e-05, + "loss": 0.1367, + "step": 2380 + }, + { + "epoch": 0.38320293788919046, + "grad_norm": 0.3511161782236592, + "learning_rate": 1.9953192129524774e-05, + "loss": 0.134, + "step": 2400 + }, + { + "epoch": 0.3863962957049337, + "grad_norm": 0.28417767585244963, + "learning_rate": 1.994953254195613e-05, + "loss": 0.1345, + "step": 2420 + }, + { + "epoch": 0.389589653520677, + "grad_norm": 0.33674300015583525, + "learning_rate": 1.9945735589827714e-05, + "loss": 0.1414, + "step": 2440 + }, + { + "epoch": 0.39278301133642024, + "grad_norm": 0.35738285989377994, + "learning_rate": 1.9941801325560748e-05, + "loss": 0.1379, + "step": 2460 + }, + { + "epoch": 0.3959763691521635, + "grad_norm": 0.3281943522856012, + "learning_rate": 1.9937729803472198e-05, + "loss": 0.1377, + "step": 2480 + }, + { + "epoch": 0.39916972696790676, + "grad_norm": 0.45986080956623454, + "learning_rate": 1.9933521079774043e-05, + "loss": 0.1375, + "step": 2500 + }, + { + "epoch": 0.40236308478365, + "grad_norm": 0.3345998745429948, + "learning_rate": 1.9929175212572473e-05, + "loss": 0.1376, + "step": 2520 + }, + { + "epoch": 0.4055564425993933, + "grad_norm": 0.3589364581224636, + "learning_rate": 1.9924692261867107e-05, + "loss": 0.136, + "step": 2540 + }, + { + "epoch": 0.40874980041513653, + "grad_norm": 0.3214892613419847, + "learning_rate": 1.9920072289550152e-05, + "loss": 0.1375, + "step": 2560 + }, + { + "epoch": 0.4119431582308798, + "grad_norm": 0.293317969518762, + "learning_rate": 1.9915315359405556e-05, + "loss": 0.1396, + "step": 2580 + }, + { + "epoch": 0.41513651604662305, + "grad_norm": 0.30656393646036983, + "learning_rate": 1.9910421537108124e-05, + "loss": 0.1417, + "step": 2600 + }, + { + "epoch": 0.41832987386236625, + "grad_norm": 0.3145662089610958, + "learning_rate": 1.990539089022262e-05, + "loss": 0.1361, + "step": 2620 + }, + { + "epoch": 0.4215232316781095, + "grad_norm": 0.3321283029025178, + "learning_rate": 1.9900223488202807e-05, + "loss": 0.1374, + "step": 2640 + }, + { + "epoch": 0.42471658949385277, + "grad_norm": 0.3121994972039942, + "learning_rate": 1.9894919402390527e-05, + "loss": 0.1369, + "step": 2660 + }, + { + "epoch": 0.42790994730959603, + "grad_norm": 0.3207245571484729, + "learning_rate": 1.9889478706014687e-05, + "loss": 0.1365, + "step": 2680 + }, + { + "epoch": 0.4311033051253393, + "grad_norm": 0.3214891191577016, + "learning_rate": 1.9883901474190258e-05, + "loss": 0.134, + "step": 2700 + }, + { + "epoch": 0.43429666294108255, + "grad_norm": 0.2948703624055813, + "learning_rate": 1.9878187783917246e-05, + "loss": 0.1358, + "step": 2720 + }, + { + "epoch": 0.4374900207568258, + "grad_norm": 0.2962758480800377, + "learning_rate": 1.9872337714079604e-05, + "loss": 0.1353, + "step": 2740 + }, + { + "epoch": 0.44068337857256906, + "grad_norm": 0.28750701401056433, + "learning_rate": 1.9866351345444172e-05, + "loss": 0.1397, + "step": 2760 + }, + { + "epoch": 0.4438767363883123, + "grad_norm": 0.33961478398235684, + "learning_rate": 1.9860228760659547e-05, + "loss": 0.1395, + "step": 2780 + }, + { + "epoch": 0.4470700942040556, + "grad_norm": 0.3327701106038422, + "learning_rate": 1.9853970044254942e-05, + "loss": 0.1362, + "step": 2800 + }, + { + "epoch": 0.45026345201979884, + "grad_norm": 0.34046480079798697, + "learning_rate": 1.9847575282639022e-05, + "loss": 0.1357, + "step": 2820 + }, + { + "epoch": 0.4534568098355421, + "grad_norm": 0.2591827835319709, + "learning_rate": 1.984104456409871e-05, + "loss": 0.1319, + "step": 2840 + }, + { + "epoch": 0.4566501676512853, + "grad_norm": 0.31099418106495114, + "learning_rate": 1.983437797879797e-05, + "loss": 0.134, + "step": 2860 + }, + { + "epoch": 0.45984352546702856, + "grad_norm": 0.34942376213984855, + "learning_rate": 1.9827575618776556e-05, + "loss": 0.1353, + "step": 2880 + }, + { + "epoch": 0.4630368832827718, + "grad_norm": 0.29857742338407706, + "learning_rate": 1.9820637577948746e-05, + "loss": 0.1336, + "step": 2900 + }, + { + "epoch": 0.4662302410985151, + "grad_norm": 0.2701149477986023, + "learning_rate": 1.9813563952102056e-05, + "loss": 0.1338, + "step": 2920 + }, + { + "epoch": 0.46942359891425833, + "grad_norm": 0.35582085328111446, + "learning_rate": 1.980635483889589e-05, + "loss": 0.1325, + "step": 2940 + }, + { + "epoch": 0.4726169567300016, + "grad_norm": 0.36536478089468427, + "learning_rate": 1.979901033786022e-05, + "loss": 0.138, + "step": 2960 + }, + { + "epoch": 0.47581031454574485, + "grad_norm": 0.34482414871566835, + "learning_rate": 1.9791530550394197e-05, + "loss": 0.14, + "step": 2980 + }, + { + "epoch": 0.4790036723614881, + "grad_norm": 0.313925122152452, + "learning_rate": 1.9783915579764755e-05, + "loss": 0.1349, + "step": 3000 + }, + { + "epoch": 0.48219703017723137, + "grad_norm": 0.33065001108381514, + "learning_rate": 1.9776165531105182e-05, + "loss": 0.1334, + "step": 3020 + }, + { + "epoch": 0.4853903879929746, + "grad_norm": 0.33106961743791363, + "learning_rate": 1.9768280511413676e-05, + "loss": 0.1346, + "step": 3040 + }, + { + "epoch": 0.4885837458087179, + "grad_norm": 0.3038455922499442, + "learning_rate": 1.9760260629551856e-05, + "loss": 0.13, + "step": 3060 + }, + { + "epoch": 0.49177710362446114, + "grad_norm": 0.32774568750571736, + "learning_rate": 1.975210599624327e-05, + "loss": 0.1317, + "step": 3080 + }, + { + "epoch": 0.4949704614402044, + "grad_norm": 0.27913297393743014, + "learning_rate": 1.9743816724071864e-05, + "loss": 0.1299, + "step": 3100 + }, + { + "epoch": 0.4981638192559476, + "grad_norm": 0.25535801906865635, + "learning_rate": 1.9735392927480425e-05, + "loss": 0.1341, + "step": 3120 + }, + { + "epoch": 0.5013571770716909, + "grad_norm": 0.3450201878469047, + "learning_rate": 1.9726834722768998e-05, + "loss": 0.1307, + "step": 3140 + }, + { + "epoch": 0.5045505348874342, + "grad_norm": 0.3355377854047922, + "learning_rate": 1.9718142228093286e-05, + "loss": 0.1373, + "step": 3160 + }, + { + "epoch": 0.5077438927031774, + "grad_norm": 0.29501763605746917, + "learning_rate": 1.9709315563463022e-05, + "loss": 0.1329, + "step": 3180 + }, + { + "epoch": 0.5109372505189207, + "grad_norm": 0.29498443847446687, + "learning_rate": 1.9700354850740305e-05, + "loss": 0.1302, + "step": 3200 + }, + { + "epoch": 0.514130608334664, + "grad_norm": 0.3374549804904556, + "learning_rate": 1.969126021363791e-05, + "loss": 0.1332, + "step": 3220 + }, + { + "epoch": 0.5173239661504071, + "grad_norm": 0.2937151476643792, + "learning_rate": 1.9682031777717602e-05, + "loss": 0.1289, + "step": 3240 + }, + { + "epoch": 0.5205173239661504, + "grad_norm": 0.34027338318157424, + "learning_rate": 1.9672669670388387e-05, + "loss": 0.1335, + "step": 3260 + }, + { + "epoch": 0.5237106817818936, + "grad_norm": 0.2958186800446919, + "learning_rate": 1.966317402090475e-05, + "loss": 0.1321, + "step": 3280 + }, + { + "epoch": 0.5269040395976369, + "grad_norm": 0.2937191900726174, + "learning_rate": 1.9653544960364886e-05, + "loss": 0.132, + "step": 3300 + }, + { + "epoch": 0.5300973974133801, + "grad_norm": 0.3133245335540435, + "learning_rate": 1.9643782621708875e-05, + "loss": 0.1311, + "step": 3320 + }, + { + "epoch": 0.5332907552291234, + "grad_norm": 0.29304130620982005, + "learning_rate": 1.963388713971685e-05, + "loss": 0.1355, + "step": 3340 + }, + { + "epoch": 0.5364841130448667, + "grad_norm": 0.31292116477262744, + "learning_rate": 1.962385865100715e-05, + "loss": 0.1351, + "step": 3360 + }, + { + "epoch": 0.5396774708606099, + "grad_norm": 0.26493353801679925, + "learning_rate": 1.9613697294034403e-05, + "loss": 0.1315, + "step": 3380 + }, + { + "epoch": 0.5428708286763532, + "grad_norm": 0.2630906644626646, + "learning_rate": 1.9603403209087655e-05, + "loss": 0.1312, + "step": 3400 + }, + { + "epoch": 0.5460641864920964, + "grad_norm": 0.26085033107366945, + "learning_rate": 1.9592976538288392e-05, + "loss": 0.1296, + "step": 3420 + }, + { + "epoch": 0.5492575443078397, + "grad_norm": 0.2940107915040809, + "learning_rate": 1.9582417425588615e-05, + "loss": 0.1305, + "step": 3440 + }, + { + "epoch": 0.5524509021235829, + "grad_norm": 0.2648782222390229, + "learning_rate": 1.9571726016768825e-05, + "loss": 0.1298, + "step": 3460 + }, + { + "epoch": 0.5556442599393262, + "grad_norm": 0.25767016449009617, + "learning_rate": 1.9560902459436027e-05, + "loss": 0.1287, + "step": 3480 + }, + { + "epoch": 0.5588376177550695, + "grad_norm": 0.304832191804209, + "learning_rate": 1.9549946903021676e-05, + "loss": 0.1335, + "step": 3500 + }, + { + "epoch": 0.5620309755708127, + "grad_norm": 0.2814622371172937, + "learning_rate": 1.953885949877963e-05, + "loss": 0.1287, + "step": 3520 + }, + { + "epoch": 0.565224333386556, + "grad_norm": 0.27565323140470793, + "learning_rate": 1.9527640399784066e-05, + "loss": 0.132, + "step": 3540 + }, + { + "epoch": 0.5684176912022992, + "grad_norm": 0.2874659718873625, + "learning_rate": 1.9516289760927337e-05, + "loss": 0.1306, + "step": 3560 + }, + { + "epoch": 0.5716110490180425, + "grad_norm": 0.24637256127265056, + "learning_rate": 1.9504807738917864e-05, + "loss": 0.1294, + "step": 3580 + }, + { + "epoch": 0.5748044068337858, + "grad_norm": 0.2683166797652062, + "learning_rate": 1.949319449227796e-05, + "loss": 0.1265, + "step": 3600 + }, + { + "epoch": 0.577997764649529, + "grad_norm": 0.2991655914571407, + "learning_rate": 1.9481450181341636e-05, + "loss": 0.1307, + "step": 3620 + }, + { + "epoch": 0.5811911224652723, + "grad_norm": 0.2629061135815468, + "learning_rate": 1.9469574968252405e-05, + "loss": 0.131, + "step": 3640 + }, + { + "epoch": 0.5843844802810155, + "grad_norm": 0.30352941453895776, + "learning_rate": 1.9457569016961025e-05, + "loss": 0.1315, + "step": 3660 + }, + { + "epoch": 0.5875778380967588, + "grad_norm": 0.32189790257315865, + "learning_rate": 1.9445432493223243e-05, + "loss": 0.1301, + "step": 3680 + }, + { + "epoch": 0.590771195912502, + "grad_norm": 0.2262924484205468, + "learning_rate": 1.943316556459751e-05, + "loss": 0.1265, + "step": 3700 + }, + { + "epoch": 0.5939645537282453, + "grad_norm": 0.2711892071402863, + "learning_rate": 1.9420768400442657e-05, + "loss": 0.1271, + "step": 3720 + }, + { + "epoch": 0.5971579115439885, + "grad_norm": 0.256185445894437, + "learning_rate": 1.9408241171915576e-05, + "loss": 0.1277, + "step": 3740 + }, + { + "epoch": 0.6003512693597317, + "grad_norm": 0.25593240031460607, + "learning_rate": 1.9395584051968833e-05, + "loss": 0.1287, + "step": 3760 + }, + { + "epoch": 0.603544627175475, + "grad_norm": 0.2979762688925845, + "learning_rate": 1.9382797215348303e-05, + "loss": 0.1287, + "step": 3780 + }, + { + "epoch": 0.6067379849912182, + "grad_norm": 0.2761523818504427, + "learning_rate": 1.936988083859073e-05, + "loss": 0.1289, + "step": 3800 + }, + { + "epoch": 0.6099313428069615, + "grad_norm": 0.31322754272354847, + "learning_rate": 1.935683510002133e-05, + "loss": 0.1289, + "step": 3820 + }, + { + "epoch": 0.6131247006227047, + "grad_norm": 0.32118979161692, + "learning_rate": 1.934366017975128e-05, + "loss": 0.1291, + "step": 3840 + }, + { + "epoch": 0.616318058438448, + "grad_norm": 0.3965221736956701, + "learning_rate": 1.9330356259675277e-05, + "loss": 0.1291, + "step": 3860 + }, + { + "epoch": 0.6195114162541913, + "grad_norm": 0.23124317796079472, + "learning_rate": 1.9316923523468988e-05, + "loss": 0.127, + "step": 3880 + }, + { + "epoch": 0.6227047740699345, + "grad_norm": 0.26107711518189003, + "learning_rate": 1.9303362156586554e-05, + "loss": 0.1267, + "step": 3900 + }, + { + "epoch": 0.6258981318856778, + "grad_norm": 0.23776916842759366, + "learning_rate": 1.9289672346257988e-05, + "loss": 0.1246, + "step": 3920 + }, + { + "epoch": 0.629091489701421, + "grad_norm": 0.26149208748799935, + "learning_rate": 1.9275854281486626e-05, + "loss": 0.1251, + "step": 3940 + }, + { + "epoch": 0.6322848475171643, + "grad_norm": 0.2488232391306922, + "learning_rate": 1.9261908153046485e-05, + "loss": 0.1268, + "step": 3960 + }, + { + "epoch": 0.6354782053329076, + "grad_norm": 0.2541408784103856, + "learning_rate": 1.924783415347966e-05, + "loss": 0.1271, + "step": 3980 + }, + { + "epoch": 0.6386715631486508, + "grad_norm": 0.2731460017360242, + "learning_rate": 1.9233632477093655e-05, + "loss": 0.1255, + "step": 4000 + }, + { + "epoch": 0.6418649209643941, + "grad_norm": 0.22543278383772555, + "learning_rate": 1.9219303319958675e-05, + "loss": 0.1252, + "step": 4020 + }, + { + "epoch": 0.6450582787801373, + "grad_norm": 0.3043429192344086, + "learning_rate": 1.9204846879904966e-05, + "loss": 0.1261, + "step": 4040 + }, + { + "epoch": 0.6482516365958806, + "grad_norm": 0.2787988968661325, + "learning_rate": 1.9190263356520044e-05, + "loss": 0.1285, + "step": 4060 + }, + { + "epoch": 0.6514449944116238, + "grad_norm": 0.28334459179072036, + "learning_rate": 1.9175552951145953e-05, + "loss": 0.1312, + "step": 4080 + }, + { + "epoch": 0.6546383522273671, + "grad_norm": 0.2699681672265312, + "learning_rate": 1.91607158668765e-05, + "loss": 0.128, + "step": 4100 + }, + { + "epoch": 0.6578317100431104, + "grad_norm": 0.2653884535783852, + "learning_rate": 1.9145752308554422e-05, + "loss": 0.1236, + "step": 4120 + }, + { + "epoch": 0.6610250678588536, + "grad_norm": 0.24370062543889062, + "learning_rate": 1.913066248276859e-05, + "loss": 0.1267, + "step": 4140 + }, + { + "epoch": 0.6642184256745969, + "grad_norm": 0.268136522123535, + "learning_rate": 1.911544659785112e-05, + "loss": 0.1251, + "step": 4160 + }, + { + "epoch": 0.6674117834903401, + "grad_norm": 0.2804716904650792, + "learning_rate": 1.9100104863874535e-05, + "loss": 0.1282, + "step": 4180 + }, + { + "epoch": 0.6706051413060834, + "grad_norm": 0.25256532175596885, + "learning_rate": 1.9084637492648834e-05, + "loss": 0.1291, + "step": 4200 + }, + { + "epoch": 0.6737984991218267, + "grad_norm": 0.20654144385500267, + "learning_rate": 1.9069044697718596e-05, + "loss": 0.1275, + "step": 4220 + }, + { + "epoch": 0.6769918569375698, + "grad_norm": 0.3170119063036349, + "learning_rate": 1.9053326694359996e-05, + "loss": 0.1252, + "step": 4240 + }, + { + "epoch": 0.6801852147533131, + "grad_norm": 0.2518310103396095, + "learning_rate": 1.9037483699577866e-05, + "loss": 0.1252, + "step": 4260 + }, + { + "epoch": 0.6833785725690563, + "grad_norm": 0.24576567016775977, + "learning_rate": 1.9021515932102687e-05, + "loss": 0.1262, + "step": 4280 + }, + { + "epoch": 0.6865719303847996, + "grad_norm": 0.2272326194356311, + "learning_rate": 1.9005423612387564e-05, + "loss": 0.1277, + "step": 4300 + }, + { + "epoch": 0.6897652882005428, + "grad_norm": 0.2241851322819629, + "learning_rate": 1.8989206962605183e-05, + "loss": 0.1254, + "step": 4320 + }, + { + "epoch": 0.6929586460162861, + "grad_norm": 0.28963794959769024, + "learning_rate": 1.8972866206644756e-05, + "loss": 0.1269, + "step": 4340 + }, + { + "epoch": 0.6961520038320294, + "grad_norm": 0.27244182001640865, + "learning_rate": 1.8956401570108918e-05, + "loss": 0.1268, + "step": 4360 + }, + { + "epoch": 0.6993453616477726, + "grad_norm": 0.23505587827589292, + "learning_rate": 1.893981328031061e-05, + "loss": 0.128, + "step": 4380 + }, + { + "epoch": 0.7025387194635159, + "grad_norm": 0.2636460746314892, + "learning_rate": 1.8923101566269956e-05, + "loss": 0.1268, + "step": 4400 + }, + { + "epoch": 0.7057320772792591, + "grad_norm": 0.287020211559549, + "learning_rate": 1.890626665871108e-05, + "loss": 0.1251, + "step": 4420 + }, + { + "epoch": 0.7089254350950024, + "grad_norm": 0.3666813610337495, + "learning_rate": 1.8889308790058944e-05, + "loss": 0.122, + "step": 4440 + }, + { + "epoch": 0.7121187929107456, + "grad_norm": 0.24200632888509, + "learning_rate": 1.887222819443612e-05, + "loss": 0.1234, + "step": 4460 + }, + { + "epoch": 0.7153121507264889, + "grad_norm": 0.3142721600257018, + "learning_rate": 1.8855025107659565e-05, + "loss": 0.1247, + "step": 4480 + }, + { + "epoch": 0.7185055085422322, + "grad_norm": 0.2542404530052441, + "learning_rate": 1.8837699767237363e-05, + "loss": 0.1267, + "step": 4500 + }, + { + "epoch": 0.7216988663579754, + "grad_norm": 0.2513575844512111, + "learning_rate": 1.882025241236546e-05, + "loss": 0.1254, + "step": 4520 + }, + { + "epoch": 0.7248922241737187, + "grad_norm": 0.24131168314941073, + "learning_rate": 1.880268328392433e-05, + "loss": 0.1251, + "step": 4540 + }, + { + "epoch": 0.7280855819894619, + "grad_norm": 0.22534176261187136, + "learning_rate": 1.878499262447569e-05, + "loss": 0.1241, + "step": 4560 + }, + { + "epoch": 0.7312789398052052, + "grad_norm": 0.2812964686320165, + "learning_rate": 1.8767180678259113e-05, + "loss": 0.1257, + "step": 4580 + }, + { + "epoch": 0.7344722976209485, + "grad_norm": 0.23889076217882216, + "learning_rate": 1.874924769118868e-05, + "loss": 0.1273, + "step": 4600 + }, + { + "epoch": 0.7376656554366917, + "grad_norm": 0.27177520658222915, + "learning_rate": 1.873119391084958e-05, + "loss": 0.125, + "step": 4620 + }, + { + "epoch": 0.740859013252435, + "grad_norm": 0.21614884950104765, + "learning_rate": 1.8713019586494687e-05, + "loss": 0.1244, + "step": 4640 + }, + { + "epoch": 0.7440523710681782, + "grad_norm": 0.26972504495310423, + "learning_rate": 1.869472496904112e-05, + "loss": 0.1278, + "step": 4660 + }, + { + "epoch": 0.7472457288839215, + "grad_norm": 0.26832330471480753, + "learning_rate": 1.867631031106679e-05, + "loss": 0.1217, + "step": 4680 + }, + { + "epoch": 0.7504390866996647, + "grad_norm": 0.2204440405656476, + "learning_rate": 1.8657775866806885e-05, + "loss": 0.1226, + "step": 4700 + }, + { + "epoch": 0.753632444515408, + "grad_norm": 0.25422716012201274, + "learning_rate": 1.86391218921504e-05, + "loss": 0.1264, + "step": 4720 + }, + { + "epoch": 0.7568258023311512, + "grad_norm": 0.2334817914407686, + "learning_rate": 1.8620348644636572e-05, + "loss": 0.123, + "step": 4740 + }, + { + "epoch": 0.7600191601468944, + "grad_norm": 0.24736892038169794, + "learning_rate": 1.8601456383451325e-05, + "loss": 0.1245, + "step": 4760 + }, + { + "epoch": 0.7632125179626377, + "grad_norm": 0.23376798290995154, + "learning_rate": 1.8582445369423716e-05, + "loss": 0.1259, + "step": 4780 + }, + { + "epoch": 0.7664058757783809, + "grad_norm": 0.24649571282123517, + "learning_rate": 1.8563315865022318e-05, + "loss": 0.125, + "step": 4800 + }, + { + "epoch": 0.7695992335941242, + "grad_norm": 0.24228210919101548, + "learning_rate": 1.8544068134351585e-05, + "loss": 0.1225, + "step": 4820 + }, + { + "epoch": 0.7727925914098674, + "grad_norm": 0.25429442512742784, + "learning_rate": 1.852470244314824e-05, + "loss": 0.1261, + "step": 4840 + }, + { + "epoch": 0.7759859492256107, + "grad_norm": 0.2309045224127907, + "learning_rate": 1.850521905877756e-05, + "loss": 0.1249, + "step": 4860 + }, + { + "epoch": 0.779179307041354, + "grad_norm": 0.25672801367790765, + "learning_rate": 1.848561825022973e-05, + "loss": 0.1234, + "step": 4880 + }, + { + "epoch": 0.7823726648570972, + "grad_norm": 0.2473205806486083, + "learning_rate": 1.8465900288116098e-05, + "loss": 0.1284, + "step": 4900 + }, + { + "epoch": 0.7855660226728405, + "grad_norm": 0.3035165882865362, + "learning_rate": 1.844606544466545e-05, + "loss": 0.1237, + "step": 4920 + }, + { + "epoch": 0.7887593804885837, + "grad_norm": 0.26837139940976074, + "learning_rate": 1.8426113993720255e-05, + "loss": 0.1252, + "step": 4940 + }, + { + "epoch": 0.791952738304327, + "grad_norm": 0.26373147498792854, + "learning_rate": 1.840604621073288e-05, + "loss": 0.1227, + "step": 4960 + }, + { + "epoch": 0.7951460961200703, + "grad_norm": 0.2581673321881109, + "learning_rate": 1.8385862372761784e-05, + "loss": 0.1273, + "step": 4980 + }, + { + "epoch": 0.7983394539358135, + "grad_norm": 0.26439250344256154, + "learning_rate": 1.83655627584677e-05, + "loss": 0.1218, + "step": 5000 + }, + { + "epoch": 0.8015328117515568, + "grad_norm": 0.2816537144327537, + "learning_rate": 1.8345147648109784e-05, + "loss": 0.1263, + "step": 5020 + }, + { + "epoch": 0.8047261695673, + "grad_norm": 0.2647977758183829, + "learning_rate": 1.8324617323541738e-05, + "loss": 0.1238, + "step": 5040 + }, + { + "epoch": 0.8079195273830433, + "grad_norm": 0.2593258946289472, + "learning_rate": 1.830397206820794e-05, + "loss": 0.1246, + "step": 5060 + }, + { + "epoch": 0.8111128851987865, + "grad_norm": 0.22990124735756534, + "learning_rate": 1.8283212167139513e-05, + "loss": 0.1226, + "step": 5080 + }, + { + "epoch": 0.8143062430145298, + "grad_norm": 0.27455958743278586, + "learning_rate": 1.8262337906950385e-05, + "loss": 0.1261, + "step": 5100 + }, + { + "epoch": 0.8174996008302731, + "grad_norm": 0.2608809929482469, + "learning_rate": 1.8241349575833352e-05, + "loss": 0.1226, + "step": 5120 + }, + { + "epoch": 0.8206929586460163, + "grad_norm": 0.2640419564306298, + "learning_rate": 1.822024746355608e-05, + "loss": 0.1381, + "step": 5140 + }, + { + "epoch": 0.8238863164617596, + "grad_norm": 0.29262015087553245, + "learning_rate": 1.8199031861457123e-05, + "loss": 0.1214, + "step": 5160 + }, + { + "epoch": 0.8270796742775028, + "grad_norm": 0.2319619995331439, + "learning_rate": 1.8177703062441882e-05, + "loss": 0.1232, + "step": 5180 + }, + { + "epoch": 0.8302730320932461, + "grad_norm": 0.26293647732336844, + "learning_rate": 1.815626136097857e-05, + "loss": 0.1233, + "step": 5200 + }, + { + "epoch": 0.8334663899089892, + "grad_norm": 0.24081197327765444, + "learning_rate": 1.8134707053094146e-05, + "loss": 0.1202, + "step": 5220 + }, + { + "epoch": 0.8366597477247325, + "grad_norm": 0.2736597574126886, + "learning_rate": 1.8113040436370236e-05, + "loss": 0.1189, + "step": 5240 + }, + { + "epoch": 0.8398531055404758, + "grad_norm": 0.22867160064093073, + "learning_rate": 1.809126180993901e-05, + "loss": 0.1227, + "step": 5260 + }, + { + "epoch": 0.843046463356219, + "grad_norm": 0.20241019354224027, + "learning_rate": 1.8069371474479055e-05, + "loss": 0.1207, + "step": 5280 + }, + { + "epoch": 0.8462398211719623, + "grad_norm": 0.23512641329119113, + "learning_rate": 1.8047369732211236e-05, + "loss": 0.1227, + "step": 5300 + }, + { + "epoch": 0.8494331789877055, + "grad_norm": 0.21831678736014193, + "learning_rate": 1.8025256886894512e-05, + "loss": 0.1263, + "step": 5320 + }, + { + "epoch": 0.8526265368034488, + "grad_norm": 0.22942586598137038, + "learning_rate": 1.800303324382174e-05, + "loss": 0.1226, + "step": 5340 + }, + { + "epoch": 0.8558198946191921, + "grad_norm": 0.22565630953315605, + "learning_rate": 1.7980699109815476e-05, + "loss": 0.1227, + "step": 5360 + }, + { + "epoch": 0.8590132524349353, + "grad_norm": 0.2110233708822902, + "learning_rate": 1.795825479322372e-05, + "loss": 0.123, + "step": 5380 + }, + { + "epoch": 0.8622066102506786, + "grad_norm": 0.2588140422630483, + "learning_rate": 1.793570060391567e-05, + "loss": 0.1233, + "step": 5400 + }, + { + "epoch": 0.8653999680664218, + "grad_norm": 0.20643049269214508, + "learning_rate": 1.791303685327744e-05, + "loss": 0.1216, + "step": 5420 + }, + { + "epoch": 0.8685933258821651, + "grad_norm": 0.2450716780518527, + "learning_rate": 1.7890263854207766e-05, + "loss": 0.1187, + "step": 5440 + }, + { + "epoch": 0.8717866836979083, + "grad_norm": 0.2626908104568787, + "learning_rate": 1.7867381921113672e-05, + "loss": 0.1318, + "step": 5460 + }, + { + "epoch": 0.8749800415136516, + "grad_norm": 0.21046084433838286, + "learning_rate": 1.784439136990616e-05, + "loss": 0.1216, + "step": 5480 + }, + { + "epoch": 0.8781733993293949, + "grad_norm": 0.22390590052286838, + "learning_rate": 1.7821292517995802e-05, + "loss": 0.1222, + "step": 5500 + }, + { + "epoch": 0.8813667571451381, + "grad_norm": 0.21545360667161884, + "learning_rate": 1.7798085684288408e-05, + "loss": 0.1245, + "step": 5520 + }, + { + "epoch": 0.8845601149608814, + "grad_norm": 0.23969169247272867, + "learning_rate": 1.777477118918058e-05, + "loss": 0.1199, + "step": 5540 + }, + { + "epoch": 0.8877534727766246, + "grad_norm": 0.25616719983123853, + "learning_rate": 1.7751349354555315e-05, + "loss": 0.12, + "step": 5560 + }, + { + "epoch": 0.8909468305923679, + "grad_norm": 0.2327465548031593, + "learning_rate": 1.7727820503777563e-05, + "loss": 0.1188, + "step": 5580 + }, + { + "epoch": 0.8941401884081112, + "grad_norm": 0.2704312448776363, + "learning_rate": 1.770418496168973e-05, + "loss": 0.1266, + "step": 5600 + }, + { + "epoch": 0.8973335462238544, + "grad_norm": 0.280731488755357, + "learning_rate": 1.7680443054607247e-05, + "loss": 0.1186, + "step": 5620 + }, + { + "epoch": 0.9005269040395977, + "grad_norm": 0.2190704544630761, + "learning_rate": 1.7656595110314003e-05, + "loss": 0.1227, + "step": 5640 + }, + { + "epoch": 0.9037202618553409, + "grad_norm": 0.2676299073758256, + "learning_rate": 1.7632641458057874e-05, + "loss": 0.1166, + "step": 5660 + }, + { + "epoch": 0.9069136196710842, + "grad_norm": 0.2699663729747412, + "learning_rate": 1.7608582428546142e-05, + "loss": 0.1245, + "step": 5680 + }, + { + "epoch": 0.9101069774868275, + "grad_norm": 0.38105163760645616, + "learning_rate": 1.7584418353940943e-05, + "loss": 0.1218, + "step": 5700 + }, + { + "epoch": 0.9133003353025706, + "grad_norm": 0.23014658636555574, + "learning_rate": 1.756014956785468e-05, + "loss": 0.1181, + "step": 5720 + }, + { + "epoch": 0.9164936931183139, + "grad_norm": 0.24389786019248447, + "learning_rate": 1.7535776405345428e-05, + "loss": 0.1196, + "step": 5740 + }, + { + "epoch": 0.9196870509340571, + "grad_norm": 0.26113050468693977, + "learning_rate": 1.7511299202912275e-05, + "loss": 0.1202, + "step": 5760 + }, + { + "epoch": 0.9228804087498004, + "grad_norm": 0.2078740201372768, + "learning_rate": 1.7486718298490713e-05, + "loss": 0.124, + "step": 5780 + }, + { + "epoch": 0.9260737665655436, + "grad_norm": 0.3157327866928938, + "learning_rate": 1.7462034031447954e-05, + "loss": 0.1252, + "step": 5800 + }, + { + "epoch": 0.9292671243812869, + "grad_norm": 0.21114581099853116, + "learning_rate": 1.7437246742578246e-05, + "loss": 0.1204, + "step": 5820 + }, + { + "epoch": 0.9324604821970301, + "grad_norm": 0.2200062852329027, + "learning_rate": 1.7412356774098175e-05, + "loss": 0.1249, + "step": 5840 + }, + { + "epoch": 0.9356538400127734, + "grad_norm": 0.2739829354403811, + "learning_rate": 1.7387364469641928e-05, + "loss": 0.1207, + "step": 5860 + }, + { + "epoch": 0.9388471978285167, + "grad_norm": 0.22036300962797467, + "learning_rate": 1.736227017425656e-05, + "loss": 0.1182, + "step": 5880 + }, + { + "epoch": 0.9420405556442599, + "grad_norm": 0.2010246775840929, + "learning_rate": 1.7337074234397228e-05, + "loss": 0.1199, + "step": 5900 + }, + { + "epoch": 0.9452339134600032, + "grad_norm": 0.22961494443205888, + "learning_rate": 1.7311776997922404e-05, + "loss": 0.1207, + "step": 5920 + }, + { + "epoch": 0.9484272712757464, + "grad_norm": 0.26165957694875003, + "learning_rate": 1.7286378814089072e-05, + "loss": 0.1188, + "step": 5940 + }, + { + "epoch": 0.9516206290914897, + "grad_norm": 0.22131834255107544, + "learning_rate": 1.726088003354791e-05, + "loss": 0.1205, + "step": 5960 + }, + { + "epoch": 0.954813986907233, + "grad_norm": 0.2549539175287136, + "learning_rate": 1.7235281008338452e-05, + "loss": 0.1213, + "step": 5980 + }, + { + "epoch": 0.9580073447229762, + "grad_norm": 0.2427772520556814, + "learning_rate": 1.720958209188422e-05, + "loss": 0.1211, + "step": 6000 + }, + { + "epoch": 0.9612007025387195, + "grad_norm": 0.2442539895798861, + "learning_rate": 1.7183783638987845e-05, + "loss": 0.1193, + "step": 6020 + }, + { + "epoch": 0.9643940603544627, + "grad_norm": 0.23954523978335746, + "learning_rate": 1.7157886005826173e-05, + "loss": 0.1196, + "step": 6040 + }, + { + "epoch": 0.967587418170206, + "grad_norm": 0.20571373812832114, + "learning_rate": 1.7131889549945348e-05, + "loss": 0.1149, + "step": 6060 + }, + { + "epoch": 0.9707807759859493, + "grad_norm": 0.22749917178842363, + "learning_rate": 1.710579463025587e-05, + "loss": 0.1176, + "step": 6080 + }, + { + "epoch": 0.9739741338016925, + "grad_norm": 0.23012462875837292, + "learning_rate": 1.7079601607027643e-05, + "loss": 0.1186, + "step": 6100 + }, + { + "epoch": 0.9771674916174358, + "grad_norm": 0.20338632953694447, + "learning_rate": 1.7053310841885012e-05, + "loss": 0.1187, + "step": 6120 + }, + { + "epoch": 0.980360849433179, + "grad_norm": 0.23280208486194112, + "learning_rate": 1.7026922697801746e-05, + "loss": 0.1196, + "step": 6140 + }, + { + "epoch": 0.9835542072489223, + "grad_norm": 0.20786109950948006, + "learning_rate": 1.7000437539096046e-05, + "loss": 0.1202, + "step": 6160 + }, + { + "epoch": 0.9867475650646655, + "grad_norm": 0.21375986615043702, + "learning_rate": 1.6973855731425507e-05, + "loss": 0.1159, + "step": 6180 + }, + { + "epoch": 0.9899409228804088, + "grad_norm": 0.20748661803980806, + "learning_rate": 1.694717764178208e-05, + "loss": 0.1153, + "step": 6200 + }, + { + "epoch": 0.993134280696152, + "grad_norm": 0.22516009929996467, + "learning_rate": 1.692040363848699e-05, + "loss": 0.1204, + "step": 6220 + }, + { + "epoch": 0.9963276385118952, + "grad_norm": 0.2595564019615457, + "learning_rate": 1.6893534091185658e-05, + "loss": 0.1197, + "step": 6240 + }, + { + "epoch": 0.9995209963276385, + "grad_norm": 0.18297342882482412, + "learning_rate": 1.686656937084261e-05, + "loss": 0.1151, + "step": 6260 + }, + { + "epoch": 1.0027143541433818, + "grad_norm": 0.22852815920466457, + "learning_rate": 1.6839509849736326e-05, + "loss": 0.0949, + "step": 6280 + }, + { + "epoch": 1.005907711959125, + "grad_norm": 0.19728357385077158, + "learning_rate": 1.6812355901454132e-05, + "loss": 0.0872, + "step": 6300 + }, + { + "epoch": 1.0091010697748684, + "grad_norm": 0.2623149708691154, + "learning_rate": 1.678510790088702e-05, + "loss": 0.0887, + "step": 6320 + }, + { + "epoch": 1.0122944275906116, + "grad_norm": 0.18893451371595926, + "learning_rate": 1.6757766224224483e-05, + "loss": 0.0919, + "step": 6340 + }, + { + "epoch": 1.0154877854063549, + "grad_norm": 0.21837196710349846, + "learning_rate": 1.673033124894932e-05, + "loss": 0.0871, + "step": 6360 + }, + { + "epoch": 1.0186811432220981, + "grad_norm": 0.19258941847945746, + "learning_rate": 1.670280335383242e-05, + "loss": 0.0885, + "step": 6380 + }, + { + "epoch": 1.0218745010378414, + "grad_norm": 0.19005062378076065, + "learning_rate": 1.667518291892754e-05, + "loss": 0.0893, + "step": 6400 + }, + { + "epoch": 1.0250678588535846, + "grad_norm": 0.20663392660314553, + "learning_rate": 1.6647470325566045e-05, + "loss": 0.0891, + "step": 6420 + }, + { + "epoch": 1.028261216669328, + "grad_norm": 0.22234403999553295, + "learning_rate": 1.6619665956351664e-05, + "loss": 0.0881, + "step": 6440 + }, + { + "epoch": 1.0314545744850712, + "grad_norm": 0.2218548835051233, + "learning_rate": 1.6591770195155185e-05, + "loss": 0.0891, + "step": 6460 + }, + { + "epoch": 1.0346479323008142, + "grad_norm": 0.19448202424429442, + "learning_rate": 1.6563783427109173e-05, + "loss": 0.0882, + "step": 6480 + }, + { + "epoch": 1.0378412901165575, + "grad_norm": 0.2042849289860482, + "learning_rate": 1.6535706038602637e-05, + "loss": 0.0878, + "step": 6500 + }, + { + "epoch": 1.0410346479323007, + "grad_norm": 0.2512755796704539, + "learning_rate": 1.6507538417275716e-05, + "loss": 0.0875, + "step": 6520 + }, + { + "epoch": 1.044228005748044, + "grad_norm": 0.2131890646498463, + "learning_rate": 1.6479280952014304e-05, + "loss": 0.0898, + "step": 6540 + }, + { + "epoch": 1.0474213635637872, + "grad_norm": 0.21427122055121073, + "learning_rate": 1.6450934032944698e-05, + "loss": 0.088, + "step": 6560 + }, + { + "epoch": 1.0506147213795305, + "grad_norm": 0.2110500487102777, + "learning_rate": 1.64224980514282e-05, + "loss": 0.0877, + "step": 6580 + }, + { + "epoch": 1.0538080791952738, + "grad_norm": 0.21674633072630997, + "learning_rate": 1.6393973400055737e-05, + "loss": 0.0919, + "step": 6600 + }, + { + "epoch": 1.057001437011017, + "grad_norm": 0.20250013575431305, + "learning_rate": 1.63653604726424e-05, + "loss": 0.0878, + "step": 6620 + }, + { + "epoch": 1.0601947948267603, + "grad_norm": 0.22853386096908568, + "learning_rate": 1.6336659664222048e-05, + "loss": 0.0865, + "step": 6640 + }, + { + "epoch": 1.0633881526425035, + "grad_norm": 0.23371366704528887, + "learning_rate": 1.630787137104183e-05, + "loss": 0.0917, + "step": 6660 + }, + { + "epoch": 1.0665815104582468, + "grad_norm": 0.2520515744099512, + "learning_rate": 1.6278995990556725e-05, + "loss": 0.0885, + "step": 6680 + }, + { + "epoch": 1.06977486827399, + "grad_norm": 0.226518466734716, + "learning_rate": 1.6250033921424038e-05, + "loss": 0.089, + "step": 6700 + }, + { + "epoch": 1.0729682260897333, + "grad_norm": 0.19588721298026593, + "learning_rate": 1.6220985563497933e-05, + "loss": 0.0893, + "step": 6720 + }, + { + "epoch": 1.0761615839054766, + "grad_norm": 0.20545809450126928, + "learning_rate": 1.6191851317823864e-05, + "loss": 0.0878, + "step": 6740 + }, + { + "epoch": 1.0793549417212198, + "grad_norm": 0.19233602078710613, + "learning_rate": 1.6162631586633076e-05, + "loss": 0.0866, + "step": 6760 + }, + { + "epoch": 1.082548299536963, + "grad_norm": 0.16678814329219444, + "learning_rate": 1.6133326773337033e-05, + "loss": 0.0871, + "step": 6780 + }, + { + "epoch": 1.0857416573527063, + "grad_norm": 0.1872528998042832, + "learning_rate": 1.610393728252186e-05, + "loss": 0.0855, + "step": 6800 + }, + { + "epoch": 1.0889350151684496, + "grad_norm": 0.2125566089494784, + "learning_rate": 1.6074463519942747e-05, + "loss": 0.0868, + "step": 6820 + }, + { + "epoch": 1.0921283729841929, + "grad_norm": 0.2174911829451179, + "learning_rate": 1.604490589251835e-05, + "loss": 0.0883, + "step": 6840 + }, + { + "epoch": 1.095321730799936, + "grad_norm": 0.18461972367391402, + "learning_rate": 1.6015264808325172e-05, + "loss": 0.0866, + "step": 6860 + }, + { + "epoch": 1.0985150886156794, + "grad_norm": 0.21622527426814506, + "learning_rate": 1.5985540676591938e-05, + "loss": 0.0863, + "step": 6880 + }, + { + "epoch": 1.1017084464314226, + "grad_norm": 0.22055823564651658, + "learning_rate": 1.5955733907693938e-05, + "loss": 0.0864, + "step": 6900 + }, + { + "epoch": 1.1049018042471659, + "grad_norm": 0.21748955927958816, + "learning_rate": 1.592584491314735e-05, + "loss": 0.0914, + "step": 6920 + }, + { + "epoch": 1.1080951620629091, + "grad_norm": 0.19288286925997916, + "learning_rate": 1.589587410560359e-05, + "loss": 0.0886, + "step": 6940 + }, + { + "epoch": 1.1112885198786524, + "grad_norm": 0.22073550271753697, + "learning_rate": 1.586582189884357e-05, + "loss": 0.0874, + "step": 6960 + }, + { + "epoch": 1.1144818776943957, + "grad_norm": 0.19094293529375386, + "learning_rate": 1.5835688707772035e-05, + "loss": 0.0855, + "step": 6980 + }, + { + "epoch": 1.117675235510139, + "grad_norm": 0.21947645518408387, + "learning_rate": 1.5805474948411792e-05, + "loss": 0.0891, + "step": 7000 + }, + { + "epoch": 1.1208685933258822, + "grad_norm": 0.19228306320542188, + "learning_rate": 1.5775181037897995e-05, + "loss": 0.0864, + "step": 7020 + }, + { + "epoch": 1.1240619511416254, + "grad_norm": 0.2416878479220072, + "learning_rate": 1.5744807394472372e-05, + "loss": 0.0892, + "step": 7040 + }, + { + "epoch": 1.1272553089573687, + "grad_norm": 0.2763423491442259, + "learning_rate": 1.5714354437477454e-05, + "loss": 0.0903, + "step": 7060 + }, + { + "epoch": 1.130448666773112, + "grad_norm": 9.009817853561485, + "learning_rate": 1.568382258735078e-05, + "loss": 0.0896, + "step": 7080 + }, + { + "epoch": 1.1336420245888552, + "grad_norm": 0.21069452452749907, + "learning_rate": 1.5653212265619114e-05, + "loss": 0.0908, + "step": 7100 + }, + { + "epoch": 1.1368353824045985, + "grad_norm": 0.20407807891775565, + "learning_rate": 1.5622523894892587e-05, + "loss": 0.0908, + "step": 7120 + }, + { + "epoch": 1.1400287402203417, + "grad_norm": 0.2619102068507488, + "learning_rate": 1.5591757898858907e-05, + "loss": 0.0872, + "step": 7140 + }, + { + "epoch": 1.143222098036085, + "grad_norm": 0.20634106575751654, + "learning_rate": 1.556091470227747e-05, + "loss": 0.0875, + "step": 7160 + }, + { + "epoch": 1.1464154558518282, + "grad_norm": 0.23775033570197862, + "learning_rate": 1.5529994730973522e-05, + "loss": 0.0868, + "step": 7180 + }, + { + "epoch": 1.1496088136675715, + "grad_norm": 0.20245603598906314, + "learning_rate": 1.549899841183227e-05, + "loss": 0.0868, + "step": 7200 + }, + { + "epoch": 1.1528021714833148, + "grad_norm": 0.19815804657454472, + "learning_rate": 1.546792617279299e-05, + "loss": 0.0899, + "step": 7220 + }, + { + "epoch": 1.155995529299058, + "grad_norm": 0.18751806743751373, + "learning_rate": 1.5436778442843107e-05, + "loss": 0.0884, + "step": 7240 + }, + { + "epoch": 1.1591888871148013, + "grad_norm": 0.22312780655020503, + "learning_rate": 1.5405555652012302e-05, + "loss": 0.0895, + "step": 7260 + }, + { + "epoch": 1.1623822449305445, + "grad_norm": 0.1924743563793643, + "learning_rate": 1.5374258231366546e-05, + "loss": 0.0881, + "step": 7280 + }, + { + "epoch": 1.1655756027462878, + "grad_norm": 0.20844406290416265, + "learning_rate": 1.5342886613002155e-05, + "loss": 0.0867, + "step": 7300 + }, + { + "epoch": 1.168768960562031, + "grad_norm": 0.1761650680293785, + "learning_rate": 1.531144123003984e-05, + "loss": 0.087, + "step": 7320 + }, + { + "epoch": 1.1719623183777743, + "grad_norm": 0.1914806702266616, + "learning_rate": 1.5279922516618702e-05, + "loss": 0.0866, + "step": 7340 + }, + { + "epoch": 1.1751556761935176, + "grad_norm": 0.2112719185689836, + "learning_rate": 1.5248330907890272e-05, + "loss": 0.0867, + "step": 7360 + }, + { + "epoch": 1.1783490340092608, + "grad_norm": 0.20744289591360074, + "learning_rate": 1.5216666840012455e-05, + "loss": 0.0848, + "step": 7380 + }, + { + "epoch": 1.1815423918250039, + "grad_norm": 0.21602516707177483, + "learning_rate": 1.5184930750143565e-05, + "loss": 0.0889, + "step": 7400 + }, + { + "epoch": 1.1847357496407471, + "grad_norm": 0.1942180064010259, + "learning_rate": 1.515312307643624e-05, + "loss": 0.0871, + "step": 7420 + }, + { + "epoch": 1.1879291074564904, + "grad_norm": 0.1809045891368503, + "learning_rate": 1.5121244258031427e-05, + "loss": 0.0887, + "step": 7440 + }, + { + "epoch": 1.1911224652722336, + "grad_norm": 0.21509016663666897, + "learning_rate": 1.50892947350523e-05, + "loss": 0.0875, + "step": 7460 + }, + { + "epoch": 1.194315823087977, + "grad_norm": 0.22222425875493532, + "learning_rate": 1.5057274948598192e-05, + "loss": 0.0904, + "step": 7480 + }, + { + "epoch": 1.1975091809037202, + "grad_norm": 0.17436626344650585, + "learning_rate": 1.5025185340738499e-05, + "loss": 0.0869, + "step": 7500 + }, + { + "epoch": 1.2007025387194634, + "grad_norm": 0.2315956494531892, + "learning_rate": 1.4993026354506588e-05, + "loss": 0.0893, + "step": 7520 + }, + { + "epoch": 1.2038958965352067, + "grad_norm": 0.19438867498932094, + "learning_rate": 1.4960798433893664e-05, + "loss": 0.0898, + "step": 7540 + }, + { + "epoch": 1.20708925435095, + "grad_norm": 0.21507570120321423, + "learning_rate": 1.492850202384266e-05, + "loss": 0.0888, + "step": 7560 + }, + { + "epoch": 1.2102826121666932, + "grad_norm": 0.1756005064132717, + "learning_rate": 1.4896137570242068e-05, + "loss": 0.0886, + "step": 7580 + }, + { + "epoch": 1.2134759699824365, + "grad_norm": 0.21082827374254784, + "learning_rate": 1.486370551991981e-05, + "loss": 0.0877, + "step": 7600 + }, + { + "epoch": 1.2166693277981797, + "grad_norm": 0.25062287626591706, + "learning_rate": 1.483120632063706e-05, + "loss": 0.0889, + "step": 7620 + }, + { + "epoch": 1.219862685613923, + "grad_norm": 0.18123970615998264, + "learning_rate": 1.4798640421082047e-05, + "loss": 0.0886, + "step": 7640 + }, + { + "epoch": 1.2230560434296662, + "grad_norm": 0.21468260494577018, + "learning_rate": 1.4766008270863883e-05, + "loss": 0.0906, + "step": 7660 + }, + { + "epoch": 1.2262494012454095, + "grad_norm": 0.18876901647341507, + "learning_rate": 1.4733310320506343e-05, + "loss": 0.0882, + "step": 7680 + }, + { + "epoch": 1.2294427590611527, + "grad_norm": 0.19790235853542382, + "learning_rate": 1.4700547021441642e-05, + "loss": 0.0877, + "step": 7700 + }, + { + "epoch": 1.232636116876896, + "grad_norm": 0.18688689214473558, + "learning_rate": 1.4667718826004214e-05, + "loss": 0.0882, + "step": 7720 + }, + { + "epoch": 1.2358294746926393, + "grad_norm": 0.1951758945258833, + "learning_rate": 1.463482618742446e-05, + "loss": 0.0869, + "step": 7740 + }, + { + "epoch": 1.2390228325083825, + "grad_norm": 0.19995389074426362, + "learning_rate": 1.4601869559822488e-05, + "loss": 0.0872, + "step": 7760 + }, + { + "epoch": 1.2422161903241258, + "grad_norm": 0.2218492641305999, + "learning_rate": 1.4568849398201855e-05, + "loss": 0.0883, + "step": 7780 + }, + { + "epoch": 1.245409548139869, + "grad_norm": 0.18443852015389814, + "learning_rate": 1.4535766158443265e-05, + "loss": 0.087, + "step": 7800 + }, + { + "epoch": 1.2486029059556123, + "grad_norm": 0.19503753956864983, + "learning_rate": 1.45026202972983e-05, + "loss": 0.0885, + "step": 7820 + }, + { + "epoch": 1.2517962637713556, + "grad_norm": 0.19853902671151866, + "learning_rate": 1.446941227238309e-05, + "loss": 0.0861, + "step": 7840 + }, + { + "epoch": 1.2549896215870988, + "grad_norm": 0.21865153532249126, + "learning_rate": 1.4436142542172009e-05, + "loss": 0.0886, + "step": 7860 + }, + { + "epoch": 1.258182979402842, + "grad_norm": 0.20818634190936489, + "learning_rate": 1.4402811565991353e-05, + "loss": 0.0889, + "step": 7880 + }, + { + "epoch": 1.2613763372185853, + "grad_norm": 0.23080624800369903, + "learning_rate": 1.436941980401297e-05, + "loss": 0.0858, + "step": 7900 + }, + { + "epoch": 1.2645696950343286, + "grad_norm": 0.19862256058128666, + "learning_rate": 1.4335967717247941e-05, + "loss": 0.0865, + "step": 7920 + }, + { + "epoch": 1.2677630528500718, + "grad_norm": 0.18954472715597112, + "learning_rate": 1.4302455767540189e-05, + "loss": 0.0886, + "step": 7940 + }, + { + "epoch": 1.270956410665815, + "grad_norm": 0.18922957380652522, + "learning_rate": 1.4268884417560119e-05, + "loss": 0.0881, + "step": 7960 + }, + { + "epoch": 1.2741497684815584, + "grad_norm": 0.23661467243107595, + "learning_rate": 1.4235254130798213e-05, + "loss": 0.0884, + "step": 7980 + }, + { + "epoch": 1.2773431262973016, + "grad_norm": 0.21028360452170922, + "learning_rate": 1.4201565371558657e-05, + "loss": 0.0858, + "step": 8000 + }, + { + "epoch": 1.2805364841130449, + "grad_norm": 0.1857031394163611, + "learning_rate": 1.4167818604952906e-05, + "loss": 0.0865, + "step": 8020 + }, + { + "epoch": 1.2837298419287881, + "grad_norm": 0.227255800263239, + "learning_rate": 1.4134014296893275e-05, + "loss": 0.0884, + "step": 8040 + }, + { + "epoch": 1.2869231997445314, + "grad_norm": 0.26940362233973403, + "learning_rate": 1.4100152914086504e-05, + "loss": 0.0845, + "step": 8060 + }, + { + "epoch": 1.2901165575602747, + "grad_norm": 0.22762705633128913, + "learning_rate": 1.4066234924027318e-05, + "loss": 0.0863, + "step": 8080 + }, + { + "epoch": 1.293309915376018, + "grad_norm": 0.24522046661200322, + "learning_rate": 1.4032260794991956e-05, + "loss": 0.0854, + "step": 8100 + }, + { + "epoch": 1.2965032731917612, + "grad_norm": 0.17298541823238414, + "learning_rate": 1.3998230996031736e-05, + "loss": 0.0884, + "step": 8120 + }, + { + "epoch": 1.2996966310075044, + "grad_norm": 0.21973030306429478, + "learning_rate": 1.3964145996966555e-05, + "loss": 0.0879, + "step": 8140 + }, + { + "epoch": 1.3028899888232477, + "grad_norm": 0.18077115808310013, + "learning_rate": 1.3930006268378407e-05, + "loss": 0.089, + "step": 8160 + }, + { + "epoch": 1.306083346638991, + "grad_norm": 0.18437205616695954, + "learning_rate": 1.3895812281604895e-05, + "loss": 0.0887, + "step": 8180 + }, + { + "epoch": 1.3092767044547342, + "grad_norm": 0.22324698589088907, + "learning_rate": 1.386156450873271e-05, + "loss": 0.1099, + "step": 8200 + }, + { + "epoch": 1.3124700622704775, + "grad_norm": 0.1866174959700542, + "learning_rate": 1.382726342259113e-05, + "loss": 0.0899, + "step": 8220 + }, + { + "epoch": 1.3156634200862207, + "grad_norm": 0.22011208651394024, + "learning_rate": 1.3792909496745475e-05, + "loss": 0.0869, + "step": 8240 + }, + { + "epoch": 1.318856777901964, + "grad_norm": 0.21878645198323823, + "learning_rate": 1.3758503205490583e-05, + "loss": 0.0859, + "step": 8260 + }, + { + "epoch": 1.3220501357177072, + "grad_norm": 0.1869477105143079, + "learning_rate": 1.3724045023844253e-05, + "loss": 0.0898, + "step": 8280 + }, + { + "epoch": 1.3252434935334505, + "grad_norm": 0.21199782150015953, + "learning_rate": 1.3689535427540687e-05, + "loss": 0.0861, + "step": 8300 + }, + { + "epoch": 1.3284368513491938, + "grad_norm": 0.18518093738165986, + "learning_rate": 1.3654974893023934e-05, + "loss": 0.0908, + "step": 8320 + }, + { + "epoch": 1.331630209164937, + "grad_norm": 0.18688147397601756, + "learning_rate": 1.3620363897441289e-05, + "loss": 0.0868, + "step": 8340 + }, + { + "epoch": 1.3348235669806803, + "grad_norm": 0.2067483479178462, + "learning_rate": 1.358570291863673e-05, + "loss": 0.0884, + "step": 8360 + }, + { + "epoch": 1.3380169247964235, + "grad_norm": 0.21329007217550264, + "learning_rate": 1.3550992435144304e-05, + "loss": 0.086, + "step": 8380 + }, + { + "epoch": 1.3412102826121668, + "grad_norm": 0.18073209909106028, + "learning_rate": 1.3516232926181529e-05, + "loss": 0.0868, + "step": 8400 + }, + { + "epoch": 1.34440364042791, + "grad_norm": 0.23014446893395585, + "learning_rate": 1.3481424871642778e-05, + "loss": 0.088, + "step": 8420 + }, + { + "epoch": 1.3475969982436533, + "grad_norm": 0.3028280433486724, + "learning_rate": 1.3446568752092643e-05, + "loss": 0.0848, + "step": 8440 + }, + { + "epoch": 1.3507903560593966, + "grad_norm": 0.20888924306544646, + "learning_rate": 1.3411665048759313e-05, + "loss": 0.0885, + "step": 8460 + }, + { + "epoch": 1.3539837138751398, + "grad_norm": 0.22324045695426223, + "learning_rate": 1.3376714243527925e-05, + "loss": 0.0901, + "step": 8480 + }, + { + "epoch": 1.357177071690883, + "grad_norm": 0.19474459814659545, + "learning_rate": 1.3341716818933912e-05, + "loss": 0.088, + "step": 8500 + }, + { + "epoch": 1.3603704295066263, + "grad_norm": 0.22602725655780065, + "learning_rate": 1.3306673258156334e-05, + "loss": 0.0867, + "step": 8520 + }, + { + "epoch": 1.3635637873223696, + "grad_norm": 0.23360209320607728, + "learning_rate": 1.3271584045011217e-05, + "loss": 0.0886, + "step": 8540 + }, + { + "epoch": 1.3667571451381129, + "grad_norm": 0.1873427703628018, + "learning_rate": 1.3236449663944875e-05, + "loss": 0.0866, + "step": 8560 + }, + { + "epoch": 1.369950502953856, + "grad_norm": 0.1985433902478951, + "learning_rate": 1.3201270600027208e-05, + "loss": 0.0876, + "step": 8580 + }, + { + "epoch": 1.3731438607695992, + "grad_norm": 0.18896595210872472, + "learning_rate": 1.3166047338945019e-05, + "loss": 0.0861, + "step": 8600 + }, + { + "epoch": 1.3763372185853424, + "grad_norm": 0.22957720239257226, + "learning_rate": 1.3130780366995297e-05, + "loss": 0.0853, + "step": 8620 + }, + { + "epoch": 1.3795305764010857, + "grad_norm": 0.1933824287848287, + "learning_rate": 1.3095470171078512e-05, + "loss": 0.0867, + "step": 8640 + }, + { + "epoch": 1.382723934216829, + "grad_norm": 0.22324019535172776, + "learning_rate": 1.3060117238691894e-05, + "loss": 0.085, + "step": 8660 + }, + { + "epoch": 1.3859172920325722, + "grad_norm": 0.2316030267030887, + "learning_rate": 1.3024722057922696e-05, + "loss": 0.0841, + "step": 8680 + }, + { + "epoch": 1.3891106498483154, + "grad_norm": 0.1973247421696361, + "learning_rate": 1.2989285117441452e-05, + "loss": 0.0878, + "step": 8700 + }, + { + "epoch": 1.3923040076640587, + "grad_norm": 0.2080002656842217, + "learning_rate": 1.2953806906495244e-05, + "loss": 0.0883, + "step": 8720 + }, + { + "epoch": 1.395497365479802, + "grad_norm": 0.18517780070734782, + "learning_rate": 1.2918287914900933e-05, + "loss": 0.0852, + "step": 8740 + }, + { + "epoch": 1.3986907232955452, + "grad_norm": 0.19697224463698385, + "learning_rate": 1.2882728633038406e-05, + "loss": 0.0855, + "step": 8760 + }, + { + "epoch": 1.4018840811112885, + "grad_norm": 0.19736259450538857, + "learning_rate": 1.2847129551843807e-05, + "loss": 0.0876, + "step": 8780 + }, + { + "epoch": 1.4050774389270317, + "grad_norm": 0.18942542996017805, + "learning_rate": 1.2811491162802744e-05, + "loss": 0.0884, + "step": 8800 + }, + { + "epoch": 1.408270796742775, + "grad_norm": 0.19254196108878727, + "learning_rate": 1.277581395794353e-05, + "loss": 0.088, + "step": 8820 + }, + { + "epoch": 1.4114641545585183, + "grad_norm": 0.24282865106690285, + "learning_rate": 1.2740098429830357e-05, + "loss": 0.0891, + "step": 8840 + }, + { + "epoch": 1.4146575123742615, + "grad_norm": 0.23984915406072307, + "learning_rate": 1.2704345071556525e-05, + "loss": 0.0886, + "step": 8860 + }, + { + "epoch": 1.4178508701900048, + "grad_norm": 0.2184606228075661, + "learning_rate": 1.2668554376737619e-05, + "loss": 0.087, + "step": 8880 + }, + { + "epoch": 1.421044228005748, + "grad_norm": 0.19798737334853378, + "learning_rate": 1.2632726839504693e-05, + "loss": 0.0875, + "step": 8900 + }, + { + "epoch": 1.4242375858214913, + "grad_norm": 0.23442081669151446, + "learning_rate": 1.2596862954497458e-05, + "loss": 0.0849, + "step": 8920 + }, + { + "epoch": 1.4274309436372346, + "grad_norm": 0.21286909537115775, + "learning_rate": 1.2560963216857447e-05, + "loss": 0.0845, + "step": 8940 + }, + { + "epoch": 1.4306243014529778, + "grad_norm": 0.19037684375350825, + "learning_rate": 1.2525028122221172e-05, + "loss": 0.0857, + "step": 8960 + }, + { + "epoch": 1.433817659268721, + "grad_norm": 0.18725372186680364, + "learning_rate": 1.24890581667133e-05, + "loss": 0.0875, + "step": 8980 + }, + { + "epoch": 1.4370110170844643, + "grad_norm": 0.20844623553872596, + "learning_rate": 1.2453053846939783e-05, + "loss": 0.0898, + "step": 9000 + }, + { + "epoch": 1.4402043749002076, + "grad_norm": 0.21140506066201004, + "learning_rate": 1.2417015659981007e-05, + "loss": 0.0883, + "step": 9020 + }, + { + "epoch": 1.4433977327159508, + "grad_norm": 0.2064339774677841, + "learning_rate": 1.2380944103384946e-05, + "loss": 0.0849, + "step": 9040 + }, + { + "epoch": 1.446591090531694, + "grad_norm": 0.17652746458033255, + "learning_rate": 1.2344839675160271e-05, + "loss": 0.0867, + "step": 9060 + }, + { + "epoch": 1.4497844483474374, + "grad_norm": 0.19101046403484023, + "learning_rate": 1.2308702873769486e-05, + "loss": 0.0865, + "step": 9080 + }, + { + "epoch": 1.4529778061631806, + "grad_norm": 0.19778410360898788, + "learning_rate": 1.227253419812204e-05, + "loss": 0.0876, + "step": 9100 + }, + { + "epoch": 1.4561711639789239, + "grad_norm": 0.1884773288145621, + "learning_rate": 1.2236334147567442e-05, + "loss": 0.0873, + "step": 9120 + }, + { + "epoch": 1.4593645217946671, + "grad_norm": 0.22741564087867433, + "learning_rate": 1.2200103221888365e-05, + "loss": 0.0842, + "step": 9140 + }, + { + "epoch": 1.4625578796104104, + "grad_norm": 0.19382271044214394, + "learning_rate": 1.2163841921293761e-05, + "loss": 0.0846, + "step": 9160 + }, + { + "epoch": 1.4657512374261537, + "grad_norm": 0.2225438873976966, + "learning_rate": 1.2127550746411932e-05, + "loss": 0.086, + "step": 9180 + }, + { + "epoch": 1.468944595241897, + "grad_norm": 0.20309796630710175, + "learning_rate": 1.2091230198283626e-05, + "loss": 0.0872, + "step": 9200 + }, + { + "epoch": 1.4721379530576402, + "grad_norm": 0.21309103603253518, + "learning_rate": 1.2054880778355122e-05, + "loss": 0.0856, + "step": 9220 + }, + { + "epoch": 1.4753313108733834, + "grad_norm": 0.20007800804028458, + "learning_rate": 1.201850298847132e-05, + "loss": 0.0843, + "step": 9240 + }, + { + "epoch": 1.4785246686891267, + "grad_norm": 0.22102981325152446, + "learning_rate": 1.198209733086878e-05, + "loss": 0.0865, + "step": 9260 + }, + { + "epoch": 1.48171802650487, + "grad_norm": 0.2509432577302147, + "learning_rate": 1.194566430816882e-05, + "loss": 0.0872, + "step": 9280 + }, + { + "epoch": 1.4849113843206132, + "grad_norm": 0.21078643240774367, + "learning_rate": 1.1909204423370564e-05, + "loss": 0.0856, + "step": 9300 + }, + { + "epoch": 1.4881047421363562, + "grad_norm": 0.22252302888210984, + "learning_rate": 1.1872718179843994e-05, + "loss": 0.0838, + "step": 9320 + }, + { + "epoch": 1.4912980999520995, + "grad_norm": 0.18987560853570382, + "learning_rate": 1.1836206081323003e-05, + "loss": 0.085, + "step": 9340 + }, + { + "epoch": 1.4944914577678428, + "grad_norm": 0.19549774907184778, + "learning_rate": 1.1799668631898445e-05, + "loss": 0.0877, + "step": 9360 + }, + { + "epoch": 1.497684815583586, + "grad_norm": 0.19228104758868642, + "learning_rate": 1.176310633601117e-05, + "loss": 0.0956, + "step": 9380 + }, + { + "epoch": 1.5008781733993293, + "grad_norm": 0.20819820045783494, + "learning_rate": 1.1726519698445056e-05, + "loss": 0.0867, + "step": 9400 + }, + { + "epoch": 1.5040715312150725, + "grad_norm": 0.20733767509582143, + "learning_rate": 1.1689909224320062e-05, + "loss": 0.0863, + "step": 9420 + }, + { + "epoch": 1.5072648890308158, + "grad_norm": 0.20925265086202188, + "learning_rate": 1.165327541908522e-05, + "loss": 0.0861, + "step": 9440 + }, + { + "epoch": 1.510458246846559, + "grad_norm": 0.18493554321077676, + "learning_rate": 1.1616618788511684e-05, + "loss": 0.0849, + "step": 9460 + }, + { + "epoch": 1.5136516046623023, + "grad_norm": 0.18797864341732143, + "learning_rate": 1.1579939838685731e-05, + "loss": 0.085, + "step": 9480 + }, + { + "epoch": 1.5168449624780456, + "grad_norm": 0.2242101441050116, + "learning_rate": 1.154323907600179e-05, + "loss": 0.0867, + "step": 9500 + }, + { + "epoch": 1.5200383202937888, + "grad_norm": 0.17084103768025352, + "learning_rate": 1.1506517007155432e-05, + "loss": 0.0838, + "step": 9520 + }, + { + "epoch": 1.523231678109532, + "grad_norm": 0.18934207218377755, + "learning_rate": 1.1469774139136389e-05, + "loss": 0.0857, + "step": 9540 + }, + { + "epoch": 1.5264250359252753, + "grad_norm": 0.2265706734312821, + "learning_rate": 1.1433010979221545e-05, + "loss": 0.0866, + "step": 9560 + }, + { + "epoch": 1.5296183937410186, + "grad_norm": 0.22302910930406783, + "learning_rate": 1.1396228034967942e-05, + "loss": 0.0841, + "step": 9580 + }, + { + "epoch": 1.5328117515567619, + "grad_norm": 0.20180278303765992, + "learning_rate": 1.1359425814205767e-05, + "loss": 0.0863, + "step": 9600 + }, + { + "epoch": 1.5360051093725051, + "grad_norm": 0.22800639526769467, + "learning_rate": 1.132260482503133e-05, + "loss": 0.0873, + "step": 9620 + }, + { + "epoch": 1.5391984671882484, + "grad_norm": 0.21277101714684102, + "learning_rate": 1.1285765575800076e-05, + "loss": 0.0874, + "step": 9640 + }, + { + "epoch": 1.5423918250039916, + "grad_norm": 0.18816604414097163, + "learning_rate": 1.1248908575119539e-05, + "loss": 0.0862, + "step": 9660 + }, + { + "epoch": 1.545585182819735, + "grad_norm": 0.20138026843291984, + "learning_rate": 1.1212034331842338e-05, + "loss": 0.0856, + "step": 9680 + }, + { + "epoch": 1.5487785406354782, + "grad_norm": 0.18862474943057217, + "learning_rate": 1.1175143355059144e-05, + "loss": 0.085, + "step": 9700 + }, + { + "epoch": 1.5519718984512214, + "grad_norm": 0.18561382698856643, + "learning_rate": 1.1138236154091656e-05, + "loss": 0.0852, + "step": 9720 + }, + { + "epoch": 1.5551652562669647, + "grad_norm": 0.18884644793283215, + "learning_rate": 1.1101313238485552e-05, + "loss": 0.0839, + "step": 9740 + }, + { + "epoch": 1.558358614082708, + "grad_norm": 0.17345642894126198, + "learning_rate": 1.1064375118003487e-05, + "loss": 0.0844, + "step": 9760 + }, + { + "epoch": 1.5615519718984512, + "grad_norm": 0.1991026940192444, + "learning_rate": 1.1027422302618032e-05, + "loss": 0.0846, + "step": 9780 + }, + { + "epoch": 1.5647453297141944, + "grad_norm": 0.22201127220587602, + "learning_rate": 1.099045530250463e-05, + "loss": 0.0823, + "step": 9800 + }, + { + "epoch": 1.5679386875299377, + "grad_norm": 0.23679974637337212, + "learning_rate": 1.0953474628034562e-05, + "loss": 0.087, + "step": 9820 + }, + { + "epoch": 1.571132045345681, + "grad_norm": 0.18945453405405135, + "learning_rate": 1.0916480789767907e-05, + "loss": 0.0861, + "step": 9840 + }, + { + "epoch": 1.5743254031614242, + "grad_norm": 0.18943349755537386, + "learning_rate": 1.0879474298446479e-05, + "loss": 0.0831, + "step": 9860 + }, + { + "epoch": 1.5775187609771675, + "grad_norm": 0.20905996320818215, + "learning_rate": 1.0842455664986782e-05, + "loss": 0.0858, + "step": 9880 + }, + { + "epoch": 1.5807121187929107, + "grad_norm": 0.1863924849638652, + "learning_rate": 1.0805425400472956e-05, + "loss": 0.0856, + "step": 9900 + }, + { + "epoch": 1.583905476608654, + "grad_norm": 0.2091352813903984, + "learning_rate": 1.076838401614972e-05, + "loss": 0.0857, + "step": 9920 + }, + { + "epoch": 1.5870988344243973, + "grad_norm": 0.24371561015345014, + "learning_rate": 1.0731332023415319e-05, + "loss": 0.089, + "step": 9940 + }, + { + "epoch": 1.5902921922401405, + "grad_norm": 0.2128926213672918, + "learning_rate": 1.0694269933814456e-05, + "loss": 0.084, + "step": 9960 + }, + { + "epoch": 1.5934855500558838, + "grad_norm": 0.21916373322291655, + "learning_rate": 1.0657198259031232e-05, + "loss": 0.0826, + "step": 9980 + }, + { + "epoch": 1.596678907871627, + "grad_norm": 0.1824350216961259, + "learning_rate": 1.0620117510882083e-05, + "loss": 0.0864, + "step": 10000 + }, + { + "epoch": 1.5998722656873703, + "grad_norm": 0.17881824213547054, + "learning_rate": 1.058302820130871e-05, + "loss": 0.0839, + "step": 10020 + }, + { + "epoch": 1.6030656235031135, + "grad_norm": 0.19729212364378013, + "learning_rate": 1.0545930842371022e-05, + "loss": 0.0854, + "step": 10040 + }, + { + "epoch": 1.6062589813188568, + "grad_norm": 0.2087951067289451, + "learning_rate": 1.0508825946240053e-05, + "loss": 0.085, + "step": 10060 + }, + { + "epoch": 1.6094523391346, + "grad_norm": 0.19718155636666373, + "learning_rate": 1.0471714025190897e-05, + "loss": 0.0856, + "step": 10080 + }, + { + "epoch": 1.6126456969503433, + "grad_norm": 0.20228614287118912, + "learning_rate": 1.0434595591595635e-05, + "loss": 0.0853, + "step": 10100 + }, + { + "epoch": 1.6158390547660866, + "grad_norm": 0.18693736149298812, + "learning_rate": 1.0397471157916263e-05, + "loss": 0.0849, + "step": 10120 + }, + { + "epoch": 1.6190324125818298, + "grad_norm": 0.191886465159157, + "learning_rate": 1.0360341236697611e-05, + "loss": 0.0838, + "step": 10140 + }, + { + "epoch": 1.622225770397573, + "grad_norm": 0.20503489329364863, + "learning_rate": 1.0323206340560275e-05, + "loss": 0.0856, + "step": 10160 + }, + { + "epoch": 1.6254191282133164, + "grad_norm": 0.24260576208421464, + "learning_rate": 1.028606698219353e-05, + "loss": 0.0865, + "step": 10180 + }, + { + "epoch": 1.6286124860290596, + "grad_norm": 0.22639324906871056, + "learning_rate": 1.0248923674348268e-05, + "loss": 0.0859, + "step": 10200 + }, + { + "epoch": 1.6318058438448029, + "grad_norm": 0.176153514574258, + "learning_rate": 1.0211776929829893e-05, + "loss": 0.0867, + "step": 10220 + }, + { + "epoch": 1.6349992016605461, + "grad_norm": 0.1877599319113198, + "learning_rate": 1.0174627261491268e-05, + "loss": 0.0829, + "step": 10240 + }, + { + "epoch": 1.6381925594762894, + "grad_norm": 0.19998890698860952, + "learning_rate": 1.0137475182225617e-05, + "loss": 0.0841, + "step": 10260 + }, + { + "epoch": 1.6413859172920326, + "grad_norm": 0.21610758072730218, + "learning_rate": 1.0100321204959449e-05, + "loss": 0.0841, + "step": 10280 + }, + { + "epoch": 1.644579275107776, + "grad_norm": 0.18193308572064754, + "learning_rate": 1.0063165842645484e-05, + "loss": 0.0849, + "step": 10300 + }, + { + "epoch": 1.6477726329235192, + "grad_norm": 0.20248453381225345, + "learning_rate": 1.0026009608255555e-05, + "loss": 0.0845, + "step": 10320 + }, + { + "epoch": 1.6509659907392624, + "grad_norm": 0.2320711692298343, + "learning_rate": 9.988853014773542e-06, + "loss": 0.0852, + "step": 10340 + }, + { + "epoch": 1.6541593485550057, + "grad_norm": 0.19584194025576318, + "learning_rate": 9.951696575188278e-06, + "loss": 0.085, + "step": 10360 + }, + { + "epoch": 1.657352706370749, + "grad_norm": 0.21553283351451755, + "learning_rate": 9.914540802486474e-06, + "loss": 0.0856, + "step": 10380 + }, + { + "epoch": 1.6605460641864922, + "grad_norm": 0.18489531692862257, + "learning_rate": 9.877386209645633e-06, + "loss": 0.0858, + "step": 10400 + }, + { + "epoch": 1.6637394220022355, + "grad_norm": 0.20763123012361048, + "learning_rate": 9.84023330962697e-06, + "loss": 0.0852, + "step": 10420 + }, + { + "epoch": 1.6669327798179787, + "grad_norm": 0.19596129486655178, + "learning_rate": 9.803082615368323e-06, + "loss": 0.0835, + "step": 10440 + }, + { + "epoch": 1.670126137633722, + "grad_norm": 0.18532596629268455, + "learning_rate": 9.765934639777087e-06, + "loss": 0.0841, + "step": 10460 + }, + { + "epoch": 1.6733194954494652, + "grad_norm": 0.17232132477688097, + "learning_rate": 9.728789895723109e-06, + "loss": 0.0835, + "step": 10480 + }, + { + "epoch": 1.6765128532652085, + "grad_norm": 0.19279722790694026, + "learning_rate": 9.691648896031642e-06, + "loss": 0.0877, + "step": 10500 + }, + { + "epoch": 1.6797062110809517, + "grad_norm": 0.20508136042678382, + "learning_rate": 9.65451215347622e-06, + "loss": 0.0849, + "step": 10520 + }, + { + "epoch": 1.682899568896695, + "grad_norm": 0.2030859465238483, + "learning_rate": 9.61738018077162e-06, + "loss": 0.0828, + "step": 10540 + }, + { + "epoch": 1.6860929267124383, + "grad_norm": 0.21547506058080174, + "learning_rate": 9.580253490566753e-06, + "loss": 0.0837, + "step": 10560 + }, + { + "epoch": 1.6892862845281815, + "grad_norm": 0.22700049094169877, + "learning_rate": 9.543132595437612e-06, + "loss": 0.0849, + "step": 10580 + }, + { + "epoch": 1.6924796423439248, + "grad_norm": 0.19256501341459278, + "learning_rate": 9.506018007880169e-06, + "loss": 0.0845, + "step": 10600 + }, + { + "epoch": 1.695673000159668, + "grad_norm": 0.20795374910309583, + "learning_rate": 9.468910240303324e-06, + "loss": 0.0819, + "step": 10620 + }, + { + "epoch": 1.6988663579754113, + "grad_norm": 0.19009887752439592, + "learning_rate": 9.431809805021815e-06, + "loss": 0.0816, + "step": 10640 + }, + { + "epoch": 1.7020597157911546, + "grad_norm": 0.19799389204125842, + "learning_rate": 9.394717214249147e-06, + "loss": 0.0851, + "step": 10660 + }, + { + "epoch": 1.7052530736068978, + "grad_norm": 0.2271148145004972, + "learning_rate": 9.357632980090528e-06, + "loss": 0.0852, + "step": 10680 + }, + { + "epoch": 1.7084464314226409, + "grad_norm": 0.2344519009086231, + "learning_rate": 9.320557614535787e-06, + "loss": 0.0831, + "step": 10700 + }, + { + "epoch": 1.7116397892383841, + "grad_norm": 0.23925944266837562, + "learning_rate": 9.283491629452315e-06, + "loss": 0.0853, + "step": 10720 + }, + { + "epoch": 1.7148331470541274, + "grad_norm": 0.20081263527645807, + "learning_rate": 9.246435536577999e-06, + "loss": 0.085, + "step": 10740 + }, + { + "epoch": 1.7180265048698706, + "grad_norm": 0.20700627503253236, + "learning_rate": 9.20938984751415e-06, + "loss": 0.0851, + "step": 10760 + }, + { + "epoch": 1.7212198626856139, + "grad_norm": 0.201866541369534, + "learning_rate": 9.172355073718439e-06, + "loss": 0.0842, + "step": 10780 + }, + { + "epoch": 1.7244132205013571, + "grad_norm": 0.20280888722283474, + "learning_rate": 9.135331726497843e-06, + "loss": 0.0822, + "step": 10800 + }, + { + "epoch": 1.7276065783171004, + "grad_norm": 0.19344393506408358, + "learning_rate": 9.09832031700158e-06, + "loss": 0.0828, + "step": 10820 + }, + { + "epoch": 1.7307999361328437, + "grad_norm": 0.16624847494447237, + "learning_rate": 9.06132135621406e-06, + "loss": 0.0829, + "step": 10840 + }, + { + "epoch": 1.733993293948587, + "grad_norm": 0.21724180904413368, + "learning_rate": 9.024335354947812e-06, + "loss": 0.0838, + "step": 10860 + }, + { + "epoch": 1.7371866517643302, + "grad_norm": 0.23846515088949718, + "learning_rate": 8.987362823836461e-06, + "loss": 0.0852, + "step": 10880 + }, + { + "epoch": 1.7403800095800734, + "grad_norm": 0.20925184991512286, + "learning_rate": 8.950404273327646e-06, + "loss": 0.0834, + "step": 10900 + }, + { + "epoch": 1.7435733673958167, + "grad_norm": 0.1716514985288543, + "learning_rate": 8.913460213675998e-06, + "loss": 0.0836, + "step": 10920 + }, + { + "epoch": 1.74676672521156, + "grad_norm": 0.20494704853492865, + "learning_rate": 8.876531154936084e-06, + "loss": 0.0817, + "step": 10940 + }, + { + "epoch": 1.7499600830273032, + "grad_norm": 0.24261768980108364, + "learning_rate": 8.839617606955355e-06, + "loss": 0.0842, + "step": 10960 + }, + { + "epoch": 1.7531534408430465, + "grad_norm": 0.2090462161591236, + "learning_rate": 8.802720079367136e-06, + "loss": 0.0828, + "step": 10980 + }, + { + "epoch": 1.7563467986587897, + "grad_norm": 0.18696550737565137, + "learning_rate": 8.765839081583564e-06, + "loss": 0.082, + "step": 11000 + }, + { + "epoch": 1.759540156474533, + "grad_norm": 0.19121809093030445, + "learning_rate": 8.72897512278856e-06, + "loss": 0.0848, + "step": 11020 + }, + { + "epoch": 1.7627335142902762, + "grad_norm": 0.2102957076954447, + "learning_rate": 8.692128711930805e-06, + "loss": 0.084, + "step": 11040 + }, + { + "epoch": 1.7659268721060195, + "grad_norm": 0.20622666368626175, + "learning_rate": 8.655300357716716e-06, + "loss": 0.0845, + "step": 11060 + }, + { + "epoch": 1.7691202299217628, + "grad_norm": 0.2091346262679367, + "learning_rate": 8.618490568603409e-06, + "loss": 0.0821, + "step": 11080 + }, + { + "epoch": 1.772313587737506, + "grad_norm": 0.18255707260911555, + "learning_rate": 8.581699852791696e-06, + "loss": 0.0824, + "step": 11100 + }, + { + "epoch": 1.7755069455532493, + "grad_norm": 0.2201418888200012, + "learning_rate": 8.54492871821905e-06, + "loss": 0.0836, + "step": 11120 + }, + { + "epoch": 1.7787003033689925, + "grad_norm": 0.1875915274082898, + "learning_rate": 8.508177672552617e-06, + "loss": 0.0842, + "step": 11140 + }, + { + "epoch": 1.7818936611847358, + "grad_norm": 0.19600313792607987, + "learning_rate": 8.471447223182179e-06, + "loss": 0.0836, + "step": 11160 + }, + { + "epoch": 1.785087019000479, + "grad_norm": 0.19719362419525954, + "learning_rate": 8.434737877213172e-06, + "loss": 0.0856, + "step": 11180 + }, + { + "epoch": 1.788280376816222, + "grad_norm": 0.17156639629201742, + "learning_rate": 8.398050141459674e-06, + "loss": 0.0819, + "step": 11200 + }, + { + "epoch": 1.7914737346319654, + "grad_norm": 0.2039792577946715, + "learning_rate": 8.361384522437402e-06, + "loss": 0.0827, + "step": 11220 + }, + { + "epoch": 1.7946670924477086, + "grad_norm": 0.19179744785660258, + "learning_rate": 8.324741526356738e-06, + "loss": 0.0826, + "step": 11240 + }, + { + "epoch": 1.7978604502634519, + "grad_norm": 0.18215189474588084, + "learning_rate": 8.288121659115727e-06, + "loss": 0.0819, + "step": 11260 + }, + { + "epoch": 1.8010538080791951, + "grad_norm": 0.1644377928850563, + "learning_rate": 8.251525426293084e-06, + "loss": 0.0827, + "step": 11280 + }, + { + "epoch": 1.8042471658949384, + "grad_norm": 0.21222246533392128, + "learning_rate": 8.21495333314123e-06, + "loss": 0.0843, + "step": 11300 + }, + { + "epoch": 1.8074405237106816, + "grad_norm": 0.25181863269369087, + "learning_rate": 8.178405884579317e-06, + "loss": 0.0842, + "step": 11320 + }, + { + "epoch": 1.810633881526425, + "grad_norm": 0.2109399815982731, + "learning_rate": 8.141883585186241e-06, + "loss": 0.0829, + "step": 11340 + }, + { + "epoch": 1.8138272393421682, + "grad_norm": 0.18073042845539122, + "learning_rate": 8.10538693919369e-06, + "loss": 0.0834, + "step": 11360 + }, + { + "epoch": 1.8170205971579114, + "grad_norm": 0.20526943895282074, + "learning_rate": 8.068916450479174e-06, + "loss": 0.081, + "step": 11380 + }, + { + "epoch": 1.8202139549736547, + "grad_norm": 0.19361555670993416, + "learning_rate": 8.03247262255908e-06, + "loss": 0.0836, + "step": 11400 + }, + { + "epoch": 1.823407312789398, + "grad_norm": 0.24389934893406925, + "learning_rate": 7.996055958581703e-06, + "loss": 0.0828, + "step": 11420 + }, + { + "epoch": 1.8266006706051412, + "grad_norm": 0.1877153126969613, + "learning_rate": 7.959666961320314e-06, + "loss": 0.0823, + "step": 11440 + }, + { + "epoch": 1.8297940284208845, + "grad_norm": 0.19815842442257633, + "learning_rate": 7.923306133166218e-06, + "loss": 0.0827, + "step": 11460 + }, + { + "epoch": 1.8329873862366277, + "grad_norm": 0.21678547999171613, + "learning_rate": 7.886973976121797e-06, + "loss": 0.0821, + "step": 11480 + }, + { + "epoch": 1.836180744052371, + "grad_norm": 0.21618607294885436, + "learning_rate": 7.850670991793621e-06, + "loss": 0.0847, + "step": 11500 + }, + { + "epoch": 1.8393741018681142, + "grad_norm": 0.1704593983368394, + "learning_rate": 7.81439768138548e-06, + "loss": 0.082, + "step": 11520 + }, + { + "epoch": 1.8425674596838575, + "grad_norm": 0.18606341720829214, + "learning_rate": 7.778154545691481e-06, + "loss": 0.0812, + "step": 11540 + }, + { + "epoch": 1.8457608174996007, + "grad_norm": 0.21208825422427718, + "learning_rate": 7.741942085089146e-06, + "loss": 0.083, + "step": 11560 + }, + { + "epoch": 1.848954175315344, + "grad_norm": 0.18782574055868467, + "learning_rate": 7.705760799532485e-06, + "loss": 0.0828, + "step": 11580 + }, + { + "epoch": 1.8521475331310873, + "grad_norm": 0.19574167645932028, + "learning_rate": 7.669611188545103e-06, + "loss": 0.083, + "step": 11600 + }, + { + "epoch": 1.8553408909468305, + "grad_norm": 0.2065298678199762, + "learning_rate": 7.6334937512133e-06, + "loss": 0.0825, + "step": 11620 + }, + { + "epoch": 1.8585342487625738, + "grad_norm": 0.1977503317300438, + "learning_rate": 7.597408986179184e-06, + "loss": 0.0806, + "step": 11640 + }, + { + "epoch": 1.861727606578317, + "grad_norm": 0.20586182397186595, + "learning_rate": 7.561357391633789e-06, + "loss": 0.0824, + "step": 11660 + }, + { + "epoch": 1.8649209643940603, + "grad_norm": 0.21998998145214102, + "learning_rate": 7.525339465310183e-06, + "loss": 0.0838, + "step": 11680 + }, + { + "epoch": 1.8681143222098036, + "grad_norm": 0.24487809053970366, + "learning_rate": 7.4893557044766145e-06, + "loss": 0.0821, + "step": 11700 + }, + { + "epoch": 1.8713076800255468, + "grad_norm": 0.18687218223534408, + "learning_rate": 7.453406605929637e-06, + "loss": 0.0806, + "step": 11720 + }, + { + "epoch": 1.87450103784129, + "grad_norm": 0.17318503959159254, + "learning_rate": 7.417492665987247e-06, + "loss": 0.0819, + "step": 11740 + }, + { + "epoch": 1.8776943956570333, + "grad_norm": 0.18945197729794094, + "learning_rate": 7.3816143804820454e-06, + "loss": 0.0835, + "step": 11760 + }, + { + "epoch": 1.8808877534727766, + "grad_norm": 0.20142501192350587, + "learning_rate": 7.345772244754377e-06, + "loss": 0.0844, + "step": 11780 + }, + { + "epoch": 1.8840811112885198, + "grad_norm": 0.20568732816869706, + "learning_rate": 7.309966753645496e-06, + "loss": 0.0801, + "step": 11800 + }, + { + "epoch": 1.887274469104263, + "grad_norm": 0.20182816399217324, + "learning_rate": 7.274198401490744e-06, + "loss": 0.0846, + "step": 11820 + }, + { + "epoch": 1.8904678269200064, + "grad_norm": 0.20018924573509358, + "learning_rate": 7.2384676821127135e-06, + "loss": 0.0798, + "step": 11840 + }, + { + "epoch": 1.8936611847357496, + "grad_norm": 0.28199792560782483, + "learning_rate": 7.202775088814429e-06, + "loss": 0.0815, + "step": 11860 + }, + { + "epoch": 1.8968545425514929, + "grad_norm": 0.22764478972933266, + "learning_rate": 7.1671211143725485e-06, + "loss": 0.0815, + "step": 11880 + }, + { + "epoch": 1.9000479003672361, + "grad_norm": 0.1981593984765646, + "learning_rate": 7.131506251030547e-06, + "loss": 0.0809, + "step": 11900 + }, + { + "epoch": 1.9032412581829794, + "grad_norm": 0.20992169378762218, + "learning_rate": 7.095930990491933e-06, + "loss": 0.0809, + "step": 11920 + }, + { + "epoch": 1.9064346159987227, + "grad_norm": 0.19005910859773092, + "learning_rate": 7.060395823913447e-06, + "loss": 0.0842, + "step": 11940 + }, + { + "epoch": 1.909627973814466, + "grad_norm": 0.19205175219083725, + "learning_rate": 7.024901241898292e-06, + "loss": 0.0819, + "step": 11960 + }, + { + "epoch": 1.9128213316302092, + "grad_norm": 0.20008872943717196, + "learning_rate": 6.9894477344893505e-06, + "loss": 0.0819, + "step": 11980 + }, + { + "epoch": 1.9160146894459524, + "grad_norm": 0.1773872749793287, + "learning_rate": 6.9540357911624336e-06, + "loss": 0.0823, + "step": 12000 + }, + { + "epoch": 1.9192080472616957, + "grad_norm": 0.19417086960624413, + "learning_rate": 6.918665900819497e-06, + "loss": 0.0791, + "step": 12020 + }, + { + "epoch": 1.922401405077439, + "grad_norm": 0.1814650138072353, + "learning_rate": 6.883338551781923e-06, + "loss": 0.0811, + "step": 12040 + }, + { + "epoch": 1.9255947628931822, + "grad_norm": 0.1702657944804681, + "learning_rate": 6.8480542317837505e-06, + "loss": 0.0803, + "step": 12060 + }, + { + "epoch": 1.9287881207089255, + "grad_norm": 0.18416550882743182, + "learning_rate": 6.812813427964963e-06, + "loss": 0.081, + "step": 12080 + }, + { + "epoch": 1.9319814785246687, + "grad_norm": 0.21054620503327667, + "learning_rate": 6.77761662686475e-06, + "loss": 0.0837, + "step": 12100 + }, + { + "epoch": 1.935174836340412, + "grad_norm": 0.1788773690242681, + "learning_rate": 6.742464314414791e-06, + "loss": 0.0809, + "step": 12120 + }, + { + "epoch": 1.9383681941561552, + "grad_norm": 0.19629223674022553, + "learning_rate": 6.707356975932559e-06, + "loss": 0.0821, + "step": 12140 + }, + { + "epoch": 1.9415615519718985, + "grad_norm": 0.17739114236704748, + "learning_rate": 6.672295096114597e-06, + "loss": 0.0816, + "step": 12160 + }, + { + "epoch": 1.9447549097876418, + "grad_norm": 0.20468934483234205, + "learning_rate": 6.637279159029851e-06, + "loss": 0.0827, + "step": 12180 + }, + { + "epoch": 1.947948267603385, + "grad_norm": 0.16608032221866548, + "learning_rate": 6.602309648112968e-06, + "loss": 0.0792, + "step": 12200 + }, + { + "epoch": 1.9511416254191283, + "grad_norm": 0.1759677545684069, + "learning_rate": 6.567387046157632e-06, + "loss": 0.0785, + "step": 12220 + }, + { + "epoch": 1.9543349832348715, + "grad_norm": 0.18405948214393053, + "learning_rate": 6.532511835309896e-06, + "loss": 0.0822, + "step": 12240 + }, + { + "epoch": 1.9575283410506148, + "grad_norm": 0.2012173937759783, + "learning_rate": 6.497684497061531e-06, + "loss": 0.0818, + "step": 12260 + }, + { + "epoch": 1.960721698866358, + "grad_norm": 0.2057906504416338, + "learning_rate": 6.462905512243359e-06, + "loss": 0.0806, + "step": 12280 + }, + { + "epoch": 1.9639150566821013, + "grad_norm": 0.20687177701805626, + "learning_rate": 6.428175361018643e-06, + "loss": 0.0794, + "step": 12300 + }, + { + "epoch": 1.9671084144978446, + "grad_norm": 0.2064196549144857, + "learning_rate": 6.393494522876428e-06, + "loss": 0.0816, + "step": 12320 + }, + { + "epoch": 1.9703017723135878, + "grad_norm": 0.2133102540844893, + "learning_rate": 6.358863476624948e-06, + "loss": 0.0821, + "step": 12340 + }, + { + "epoch": 1.973495130129331, + "grad_norm": 0.18497415279048168, + "learning_rate": 6.324282700385e-06, + "loss": 0.0824, + "step": 12360 + }, + { + "epoch": 1.9766884879450743, + "grad_norm": 0.19520821054839646, + "learning_rate": 6.289752671583344e-06, + "loss": 0.0792, + "step": 12380 + }, + { + "epoch": 1.9798818457608176, + "grad_norm": 0.18726221094986775, + "learning_rate": 6.255273866946119e-06, + "loss": 0.0799, + "step": 12400 + }, + { + "epoch": 1.9830752035765609, + "grad_norm": 0.19525199269461027, + "learning_rate": 6.22084676249225e-06, + "loss": 0.0796, + "step": 12420 + }, + { + "epoch": 1.9862685613923041, + "grad_norm": 0.16345775381577554, + "learning_rate": 6.186471833526888e-06, + "loss": 0.082, + "step": 12440 + }, + { + "epoch": 1.9894619192080474, + "grad_norm": 0.1972221294843483, + "learning_rate": 6.15214955463484e-06, + "loss": 0.0787, + "step": 12460 + }, + { + "epoch": 1.9926552770237906, + "grad_norm": 0.1935374722805669, + "learning_rate": 6.117880399674016e-06, + "loss": 0.0827, + "step": 12480 + }, + { + "epoch": 1.995848634839534, + "grad_norm": 0.18315518408993714, + "learning_rate": 6.083664841768901e-06, + "loss": 0.0816, + "step": 12500 + }, + { + "epoch": 1.9990419926552772, + "grad_norm": 0.16860052008855017, + "learning_rate": 6.049503353304e-06, + "loss": 0.0844, + "step": 12520 + }, + { + "epoch": 2.0022353504710204, + "grad_norm": 0.18498027675472176, + "learning_rate": 6.015396405917333e-06, + "loss": 0.061, + "step": 12540 + }, + { + "epoch": 2.0054287082867637, + "grad_norm": 0.20247862079416473, + "learning_rate": 5.98134447049392e-06, + "loss": 0.0494, + "step": 12560 + }, + { + "epoch": 2.008622066102507, + "grad_norm": 0.17717972255777836, + "learning_rate": 5.947348017159272e-06, + "loss": 0.0496, + "step": 12580 + }, + { + "epoch": 2.01181542391825, + "grad_norm": 0.17560899509079128, + "learning_rate": 5.913407515272918e-06, + "loss": 0.0484, + "step": 12600 + }, + { + "epoch": 2.0150087817339934, + "grad_norm": 0.2107019559801837, + "learning_rate": 5.879523433421903e-06, + "loss": 0.0455, + "step": 12620 + }, + { + "epoch": 2.0182021395497367, + "grad_norm": 0.17228228604398835, + "learning_rate": 5.845696239414336e-06, + "loss": 0.0481, + "step": 12640 + }, + { + "epoch": 2.02139549736548, + "grad_norm": 0.16576058508327604, + "learning_rate": 5.8119264002729244e-06, + "loss": 0.0484, + "step": 12660 + }, + { + "epoch": 2.0245888551812232, + "grad_norm": 0.17885300287909717, + "learning_rate": 5.778214382228524e-06, + "loss": 0.047, + "step": 12680 + }, + { + "epoch": 2.0277822129969665, + "grad_norm": 0.20671449403256986, + "learning_rate": 5.744560650713704e-06, + "loss": 0.0471, + "step": 12700 + }, + { + "epoch": 2.0309755708127097, + "grad_norm": 0.20083359478447635, + "learning_rate": 5.710965670356332e-06, + "loss": 0.0479, + "step": 12720 + }, + { + "epoch": 2.034168928628453, + "grad_norm": 0.18961936533749266, + "learning_rate": 5.6774299049731325e-06, + "loss": 0.0478, + "step": 12740 + }, + { + "epoch": 2.0373622864441963, + "grad_norm": 0.21979140727547378, + "learning_rate": 5.643953817563318e-06, + "loss": 0.0453, + "step": 12760 + }, + { + "epoch": 2.0405556442599395, + "grad_norm": 0.16165099720000836, + "learning_rate": 5.610537870302164e-06, + "loss": 0.0476, + "step": 12780 + }, + { + "epoch": 2.0437490020756828, + "grad_norm": 0.18343428699528758, + "learning_rate": 5.577182524534657e-06, + "loss": 0.0478, + "step": 12800 + }, + { + "epoch": 2.046942359891426, + "grad_norm": 0.17215552651589366, + "learning_rate": 5.5438882407691e-06, + "loss": 0.0472, + "step": 12820 + }, + { + "epoch": 2.0501357177071693, + "grad_norm": 0.1624976046442029, + "learning_rate": 5.510655478670769e-06, + "loss": 0.0478, + "step": 12840 + }, + { + "epoch": 2.0533290755229126, + "grad_norm": 0.22026015940397797, + "learning_rate": 5.4774846970555615e-06, + "loss": 0.0461, + "step": 12860 + }, + { + "epoch": 2.056522433338656, + "grad_norm": 0.17519613837123435, + "learning_rate": 5.444376353883678e-06, + "loss": 0.0462, + "step": 12880 + }, + { + "epoch": 2.059715791154399, + "grad_norm": 0.18277575133361915, + "learning_rate": 5.411330906253269e-06, + "loss": 0.0455, + "step": 12900 + }, + { + "epoch": 2.0629091489701423, + "grad_norm": 0.18787731365044255, + "learning_rate": 5.378348810394143e-06, + "loss": 0.0462, + "step": 12920 + }, + { + "epoch": 2.066102506785885, + "grad_norm": 0.18201430894959444, + "learning_rate": 5.3454305216614766e-06, + "loss": 0.0473, + "step": 12940 + }, + { + "epoch": 2.0692958646016284, + "grad_norm": 0.1904233887751224, + "learning_rate": 5.312576494529507e-06, + "loss": 0.0494, + "step": 12960 + }, + { + "epoch": 2.0724892224173717, + "grad_norm": 0.18985642952053444, + "learning_rate": 5.279787182585271e-06, + "loss": 0.0462, + "step": 12980 + }, + { + "epoch": 2.075682580233115, + "grad_norm": 0.1582812242047444, + "learning_rate": 5.247063038522329e-06, + "loss": 0.0469, + "step": 13000 + }, + { + "epoch": 2.078875938048858, + "grad_norm": 0.19286531510895663, + "learning_rate": 5.21440451413455e-06, + "loss": 0.0465, + "step": 13020 + }, + { + "epoch": 2.0820692958646014, + "grad_norm": 0.22047888942684946, + "learning_rate": 5.181812060309825e-06, + "loss": 0.0463, + "step": 13040 + }, + { + "epoch": 2.0852626536803447, + "grad_norm": 0.22499631209380672, + "learning_rate": 5.149286127023874e-06, + "loss": 0.0467, + "step": 13060 + }, + { + "epoch": 2.088456011496088, + "grad_norm": 0.18796568419290619, + "learning_rate": 5.1168271633340235e-06, + "loss": 0.0471, + "step": 13080 + }, + { + "epoch": 2.091649369311831, + "grad_norm": 0.1796719273681106, + "learning_rate": 5.084435617373018e-06, + "loss": 0.048, + "step": 13100 + }, + { + "epoch": 2.0948427271275745, + "grad_norm": 0.1916078526748605, + "learning_rate": 5.052111936342812e-06, + "loss": 0.0467, + "step": 13120 + }, + { + "epoch": 2.0980360849433177, + "grad_norm": 0.19878847514842057, + "learning_rate": 5.019856566508412e-06, + "loss": 0.0478, + "step": 13140 + }, + { + "epoch": 2.101229442759061, + "grad_norm": 0.2088933392167675, + "learning_rate": 4.9876699531917186e-06, + "loss": 0.0473, + "step": 13160 + }, + { + "epoch": 2.1044228005748042, + "grad_norm": 0.20402583213332395, + "learning_rate": 4.95555254076536e-06, + "loss": 0.0457, + "step": 13180 + }, + { + "epoch": 2.1076161583905475, + "grad_norm": 0.16605435030952836, + "learning_rate": 4.923504772646573e-06, + "loss": 0.0473, + "step": 13200 + }, + { + "epoch": 2.1108095162062908, + "grad_norm": 0.17651776985556464, + "learning_rate": 4.891527091291071e-06, + "loss": 0.0477, + "step": 13220 + }, + { + "epoch": 2.114002874022034, + "grad_norm": 0.1763790661182835, + "learning_rate": 4.859619938186947e-06, + "loss": 0.0456, + "step": 13240 + }, + { + "epoch": 2.1171962318377773, + "grad_norm": 0.18886660022445972, + "learning_rate": 4.827783753848575e-06, + "loss": 0.0455, + "step": 13260 + }, + { + "epoch": 2.1203895896535205, + "grad_norm": 0.2059211240085781, + "learning_rate": 4.796018977810514e-06, + "loss": 0.0457, + "step": 13280 + }, + { + "epoch": 2.123582947469264, + "grad_norm": 0.19168043665328116, + "learning_rate": 4.76432604862145e-06, + "loss": 0.046, + "step": 13300 + }, + { + "epoch": 2.126776305285007, + "grad_norm": 0.17778767466228898, + "learning_rate": 4.732705403838159e-06, + "loss": 0.0465, + "step": 13320 + }, + { + "epoch": 2.1299696631007503, + "grad_norm": 0.170308319213917, + "learning_rate": 4.701157480019429e-06, + "loss": 0.0474, + "step": 13340 + }, + { + "epoch": 2.1331630209164936, + "grad_norm": 0.1711104888651996, + "learning_rate": 4.669682712720065e-06, + "loss": 0.0462, + "step": 13360 + }, + { + "epoch": 2.136356378732237, + "grad_norm": 0.1825464435577293, + "learning_rate": 4.638281536484854e-06, + "loss": 0.0485, + "step": 13380 + }, + { + "epoch": 2.13954973654798, + "grad_norm": 0.1835185156049789, + "learning_rate": 4.606954384842587e-06, + "loss": 0.0455, + "step": 13400 + }, + { + "epoch": 2.1427430943637233, + "grad_norm": 0.19538449656271248, + "learning_rate": 4.575701690300051e-06, + "loss": 0.0457, + "step": 13420 + }, + { + "epoch": 2.1459364521794666, + "grad_norm": 0.20119853731280407, + "learning_rate": 4.544523884336073e-06, + "loss": 0.0462, + "step": 13440 + }, + { + "epoch": 2.14912980999521, + "grad_norm": 0.19230165287264112, + "learning_rate": 4.513421397395563e-06, + "loss": 0.0449, + "step": 13460 + }, + { + "epoch": 2.152323167810953, + "grad_norm": 0.19371541515972485, + "learning_rate": 4.482394658883557e-06, + "loss": 0.0465, + "step": 13480 + }, + { + "epoch": 2.1555165256266964, + "grad_norm": 0.2749584429863373, + "learning_rate": 4.451444097159301e-06, + "loss": 0.0465, + "step": 13500 + }, + { + "epoch": 2.1587098834424396, + "grad_norm": 0.181430213502962, + "learning_rate": 4.4205701395303424e-06, + "loss": 0.0469, + "step": 13520 + }, + { + "epoch": 2.161903241258183, + "grad_norm": 0.21832000463916046, + "learning_rate": 4.38977321224661e-06, + "loss": 0.0472, + "step": 13540 + }, + { + "epoch": 2.165096599073926, + "grad_norm": 0.36594927042777403, + "learning_rate": 4.3590537404945535e-06, + "loss": 0.0471, + "step": 13560 + }, + { + "epoch": 2.1682899568896694, + "grad_norm": 0.19062769875876745, + "learning_rate": 4.3284121483912525e-06, + "loss": 0.0464, + "step": 13580 + }, + { + "epoch": 2.1714833147054127, + "grad_norm": 0.18521477830070004, + "learning_rate": 4.297848858978569e-06, + "loss": 0.0461, + "step": 13600 + }, + { + "epoch": 2.174676672521156, + "grad_norm": 0.2064934921930085, + "learning_rate": 4.2673642942173184e-06, + "loss": 0.0451, + "step": 13620 + }, + { + "epoch": 2.177870030336899, + "grad_norm": 0.19089143723142035, + "learning_rate": 4.236958874981423e-06, + "loss": 0.0448, + "step": 13640 + }, + { + "epoch": 2.1810633881526424, + "grad_norm": 0.17162658742427372, + "learning_rate": 4.206633021052115e-06, + "loss": 0.0453, + "step": 13660 + }, + { + "epoch": 2.1842567459683857, + "grad_norm": 0.18039037729927956, + "learning_rate": 4.176387151112134e-06, + "loss": 0.0455, + "step": 13680 + }, + { + "epoch": 2.187450103784129, + "grad_norm": 0.16510411035975564, + "learning_rate": 4.1462216827399585e-06, + "loss": 0.0446, + "step": 13700 + }, + { + "epoch": 2.190643461599872, + "grad_norm": 0.2215703230886645, + "learning_rate": 4.116137032404026e-06, + "loss": 0.0453, + "step": 13720 + }, + { + "epoch": 2.1938368194156155, + "grad_norm": 0.18140462418275824, + "learning_rate": 4.0861336154569855e-06, + "loss": 0.0446, + "step": 13740 + }, + { + "epoch": 2.1970301772313587, + "grad_norm": 0.164963005058681, + "learning_rate": 4.056211846129977e-06, + "loss": 0.0451, + "step": 13760 + }, + { + "epoch": 2.200223535047102, + "grad_norm": 0.22161978868062865, + "learning_rate": 4.0263721375269e-06, + "loss": 0.0439, + "step": 13780 + }, + { + "epoch": 2.2034168928628453, + "grad_norm": 0.18997163122166422, + "learning_rate": 3.99661490161871e-06, + "loss": 0.0452, + "step": 13800 + }, + { + "epoch": 2.2066102506785885, + "grad_norm": 0.19721572060634018, + "learning_rate": 3.966940549237728e-06, + "loss": 0.046, + "step": 13820 + }, + { + "epoch": 2.2098036084943318, + "grad_norm": 0.1613696871656721, + "learning_rate": 3.937349490071989e-06, + "loss": 0.0451, + "step": 13840 + }, + { + "epoch": 2.212996966310075, + "grad_norm": 0.23649764683113925, + "learning_rate": 3.9078421326595575e-06, + "loss": 0.0473, + "step": 13860 + }, + { + "epoch": 2.2161903241258183, + "grad_norm": 0.15900455957581072, + "learning_rate": 3.8784188843829075e-06, + "loss": 0.0467, + "step": 13880 + }, + { + "epoch": 2.2193836819415615, + "grad_norm": 0.16623211370488078, + "learning_rate": 3.849080151463284e-06, + "loss": 0.0447, + "step": 13900 + }, + { + "epoch": 2.222577039757305, + "grad_norm": 0.23855246445899472, + "learning_rate": 3.819826338955115e-06, + "loss": 0.045, + "step": 13920 + }, + { + "epoch": 2.225770397573048, + "grad_norm": 0.16852273819977373, + "learning_rate": 3.7906578507403925e-06, + "loss": 0.044, + "step": 13940 + }, + { + "epoch": 2.2289637553887913, + "grad_norm": 0.19176422233347587, + "learning_rate": 3.761575089523114e-06, + "loss": 0.0451, + "step": 13960 + }, + { + "epoch": 2.2321571132045346, + "grad_norm": 0.19217003400101632, + "learning_rate": 3.7325784568237267e-06, + "loss": 0.0456, + "step": 13980 + }, + { + "epoch": 2.235350471020278, + "grad_norm": 0.2142815186061357, + "learning_rate": 3.7036683529735616e-06, + "loss": 0.0438, + "step": 14000 + }, + { + "epoch": 2.238543828836021, + "grad_norm": 0.16980952681099654, + "learning_rate": 3.6748451771093386e-06, + "loss": 0.0456, + "step": 14020 + }, + { + "epoch": 2.2417371866517644, + "grad_norm": 0.20792979968816608, + "learning_rate": 3.6461093271676216e-06, + "loss": 0.045, + "step": 14040 + }, + { + "epoch": 2.2449305444675076, + "grad_norm": 0.19749481308114683, + "learning_rate": 3.6174611998793486e-06, + "loss": 0.0455, + "step": 14060 + }, + { + "epoch": 2.248123902283251, + "grad_norm": 0.208757882997406, + "learning_rate": 3.5889011907643523e-06, + "loss": 0.0468, + "step": 14080 + }, + { + "epoch": 2.251317260098994, + "grad_norm": 0.18603971145921822, + "learning_rate": 3.5604296941258854e-06, + "loss": 0.0456, + "step": 14100 + }, + { + "epoch": 2.2545106179147374, + "grad_norm": 0.24232186850665094, + "learning_rate": 3.532047103045185e-06, + "loss": 0.0442, + "step": 14120 + }, + { + "epoch": 2.2577039757304806, + "grad_norm": 0.24810029826855062, + "learning_rate": 3.503753809376059e-06, + "loss": 0.0463, + "step": 14140 + }, + { + "epoch": 2.260897333546224, + "grad_norm": 0.23406287255675895, + "learning_rate": 3.475550203739452e-06, + "loss": 0.0451, + "step": 14160 + }, + { + "epoch": 2.264090691361967, + "grad_norm": 0.17282967387502232, + "learning_rate": 3.4474366755180644e-06, + "loss": 0.0453, + "step": 14180 + }, + { + "epoch": 2.2672840491777104, + "grad_norm": 0.21126534883401732, + "learning_rate": 3.419413612850976e-06, + "loss": 0.0461, + "step": 14200 + }, + { + "epoch": 2.2704774069934537, + "grad_norm": 0.16104640464566056, + "learning_rate": 3.391481402628297e-06, + "loss": 0.0476, + "step": 14220 + }, + { + "epoch": 2.273670764809197, + "grad_norm": 0.21435527733602905, + "learning_rate": 3.363640430485804e-06, + "loss": 0.0446, + "step": 14240 + }, + { + "epoch": 2.27686412262494, + "grad_norm": 0.18548507359762656, + "learning_rate": 3.3358910807996325e-06, + "loss": 0.0451, + "step": 14260 + }, + { + "epoch": 2.2800574804406835, + "grad_norm": 0.19423383437023095, + "learning_rate": 3.3082337366809704e-06, + "loss": 0.0448, + "step": 14280 + }, + { + "epoch": 2.2832508382564267, + "grad_norm": 0.17237074664312235, + "learning_rate": 3.2806687799707647e-06, + "loss": 0.0459, + "step": 14300 + }, + { + "epoch": 2.28644419607217, + "grad_norm": 0.22791506612179063, + "learning_rate": 3.253196591234443e-06, + "loss": 0.0449, + "step": 14320 + }, + { + "epoch": 2.2896375538879132, + "grad_norm": 0.18890323777751128, + "learning_rate": 3.2258175497566678e-06, + "loss": 0.0449, + "step": 14340 + }, + { + "epoch": 2.2928309117036565, + "grad_norm": 0.22098418299523961, + "learning_rate": 3.198532033536107e-06, + "loss": 0.0437, + "step": 14360 + }, + { + "epoch": 2.2960242695193998, + "grad_norm": 0.22834203263219127, + "learning_rate": 3.1713404192801945e-06, + "loss": 0.0462, + "step": 14380 + }, + { + "epoch": 2.299217627335143, + "grad_norm": 0.19033969048906568, + "learning_rate": 3.144243082399947e-06, + "loss": 0.0454, + "step": 14400 + }, + { + "epoch": 2.3024109851508863, + "grad_norm": 0.1772642418355086, + "learning_rate": 3.1172403970047725e-06, + "loss": 0.0441, + "step": 14420 + }, + { + "epoch": 2.3056043429666295, + "grad_norm": 0.2048657544909403, + "learning_rate": 3.0903327358973168e-06, + "loss": 0.0446, + "step": 14440 + }, + { + "epoch": 2.308797700782373, + "grad_norm": 0.18540450076918674, + "learning_rate": 3.0635204705682976e-06, + "loss": 0.0451, + "step": 14460 + }, + { + "epoch": 2.311991058598116, + "grad_norm": 0.18445665460036134, + "learning_rate": 3.0368039711913867e-06, + "loss": 0.0459, + "step": 14480 + }, + { + "epoch": 2.3151844164138593, + "grad_norm": 0.22336940402363192, + "learning_rate": 3.0101836066181033e-06, + "loss": 0.0455, + "step": 14500 + }, + { + "epoch": 2.3183777742296026, + "grad_norm": 0.16285692399794796, + "learning_rate": 2.983659744372721e-06, + "loss": 0.045, + "step": 14520 + }, + { + "epoch": 2.321571132045346, + "grad_norm": 0.19697000745739243, + "learning_rate": 2.9572327506471775e-06, + "loss": 0.0454, + "step": 14540 + }, + { + "epoch": 2.324764489861089, + "grad_norm": 0.1950278510185452, + "learning_rate": 2.9309029902960395e-06, + "loss": 0.0452, + "step": 14560 + }, + { + "epoch": 2.3279578476768323, + "grad_norm": 0.1926073736357789, + "learning_rate": 2.9046708268314494e-06, + "loss": 0.0455, + "step": 14580 + }, + { + "epoch": 2.3311512054925756, + "grad_norm": 0.5787988360468825, + "learning_rate": 2.8785366224181265e-06, + "loss": 0.047, + "step": 14600 + }, + { + "epoch": 2.334344563308319, + "grad_norm": 0.19178497872154512, + "learning_rate": 2.8525007378683433e-06, + "loss": 0.0441, + "step": 14620 + }, + { + "epoch": 2.337537921124062, + "grad_norm": 0.20463851817417028, + "learning_rate": 2.8265635326369557e-06, + "loss": 0.0443, + "step": 14640 + }, + { + "epoch": 2.3407312789398054, + "grad_norm": 0.18832526122080892, + "learning_rate": 2.8007253648164502e-06, + "loss": 0.0447, + "step": 14660 + }, + { + "epoch": 2.3439246367555486, + "grad_norm": 0.25535504048141416, + "learning_rate": 2.7749865911319786e-06, + "loss": 0.0462, + "step": 14680 + }, + { + "epoch": 2.347117994571292, + "grad_norm": 0.2783926831983617, + "learning_rate": 2.74934756693645e-06, + "loss": 0.0461, + "step": 14700 + }, + { + "epoch": 2.350311352387035, + "grad_norm": 0.1799001156488928, + "learning_rate": 2.7238086462056125e-06, + "loss": 0.0451, + "step": 14720 + }, + { + "epoch": 2.3535047102027784, + "grad_norm": 0.22749744937087824, + "learning_rate": 2.6983701815331844e-06, + "loss": 0.0449, + "step": 14740 + }, + { + "epoch": 2.3566980680185217, + "grad_norm": 0.192235427214562, + "learning_rate": 2.6730325241259605e-06, + "loss": 0.0447, + "step": 14760 + }, + { + "epoch": 2.359891425834265, + "grad_norm": 0.1779393552771597, + "learning_rate": 2.647796023798991e-06, + "loss": 0.0455, + "step": 14780 + }, + { + "epoch": 2.3630847836500077, + "grad_norm": 0.17636063193070986, + "learning_rate": 2.6226610289707235e-06, + "loss": 0.0453, + "step": 14800 + }, + { + "epoch": 2.3662781414657514, + "grad_norm": 0.17751151289004394, + "learning_rate": 2.5976278866582226e-06, + "loss": 0.0439, + "step": 14820 + }, + { + "epoch": 2.3694714992814943, + "grad_norm": 0.1612714192997329, + "learning_rate": 2.5726969424723514e-06, + "loss": 0.0451, + "step": 14840 + }, + { + "epoch": 2.372664857097238, + "grad_norm": 0.19257379967637422, + "learning_rate": 2.5478685406130143e-06, + "loss": 0.0535, + "step": 14860 + }, + { + "epoch": 2.3758582149129808, + "grad_norm": 0.18593345377491236, + "learning_rate": 2.5231430238644106e-06, + "loss": 0.045, + "step": 14880 + }, + { + "epoch": 2.3790515727287245, + "grad_norm": 0.19051880160399431, + "learning_rate": 2.4985207335902863e-06, + "loss": 0.0451, + "step": 14900 + }, + { + "epoch": 2.3822449305444673, + "grad_norm": 0.18531119849649635, + "learning_rate": 2.4740020097292318e-06, + "loss": 0.0426, + "step": 14920 + }, + { + "epoch": 2.385438288360211, + "grad_norm": 0.23011458580940014, + "learning_rate": 2.4495871907899816e-06, + "loss": 0.0456, + "step": 14940 + }, + { + "epoch": 2.388631646175954, + "grad_norm": 0.22814782369226178, + "learning_rate": 2.425276613846755e-06, + "loss": 0.0458, + "step": 14960 + }, + { + "epoch": 2.3918250039916975, + "grad_norm": 0.18964633782059312, + "learning_rate": 2.401070614534585e-06, + "loss": 0.0445, + "step": 14980 + }, + { + "epoch": 2.3950183618074403, + "grad_norm": 0.18585844070460122, + "learning_rate": 2.3769695270446903e-06, + "loss": 0.0433, + "step": 15000 + }, + { + "epoch": 2.398211719623184, + "grad_norm": 0.2173023589979796, + "learning_rate": 2.352973684119868e-06, + "loss": 0.0452, + "step": 15020 + }, + { + "epoch": 2.401405077438927, + "grad_norm": 0.1888223260670983, + "learning_rate": 2.329083417049899e-06, + "loss": 0.0453, + "step": 15040 + }, + { + "epoch": 2.40459843525467, + "grad_norm": 0.2000345304946633, + "learning_rate": 2.3052990556669587e-06, + "loss": 0.0443, + "step": 15060 + }, + { + "epoch": 2.4077917930704134, + "grad_norm": 0.350402818921811, + "learning_rate": 2.2816209283410815e-06, + "loss": 0.0446, + "step": 15080 + }, + { + "epoch": 2.4109851508861566, + "grad_norm": 0.17540258992531277, + "learning_rate": 2.258049361975616e-06, + "loss": 0.0448, + "step": 15100 + }, + { + "epoch": 2.4141785087019, + "grad_norm": 0.2240022668610996, + "learning_rate": 2.234584682002726e-06, + "loss": 0.0436, + "step": 15120 + }, + { + "epoch": 2.417371866517643, + "grad_norm": 0.19377910419185784, + "learning_rate": 2.211227212378877e-06, + "loss": 0.0449, + "step": 15140 + }, + { + "epoch": 2.4205652243333864, + "grad_norm": 0.18307979574559963, + "learning_rate": 2.1879772755803763e-06, + "loss": 0.0437, + "step": 15160 + }, + { + "epoch": 2.4237585821491296, + "grad_norm": 0.18479960232316164, + "learning_rate": 2.1648351925989253e-06, + "loss": 0.0469, + "step": 15180 + }, + { + "epoch": 2.426951939964873, + "grad_norm": 0.19121025995799099, + "learning_rate": 2.1418012829371735e-06, + "loss": 0.0438, + "step": 15200 + }, + { + "epoch": 2.430145297780616, + "grad_norm": 0.19858616833926596, + "learning_rate": 2.1188758646043206e-06, + "loss": 0.044, + "step": 15220 + }, + { + "epoch": 2.4333386555963594, + "grad_norm": 0.18772227683807235, + "learning_rate": 2.0960592541117143e-06, + "loss": 0.0452, + "step": 15240 + }, + { + "epoch": 2.4365320134121027, + "grad_norm": 0.1743929147084694, + "learning_rate": 2.0733517664684944e-06, + "loss": 0.0438, + "step": 15260 + }, + { + "epoch": 2.439725371227846, + "grad_norm": 0.18605377215327853, + "learning_rate": 2.050753715177236e-06, + "loss": 0.0464, + "step": 15280 + }, + { + "epoch": 2.442918729043589, + "grad_norm": 0.19099944392969617, + "learning_rate": 2.0282654122296154e-06, + "loss": 0.0434, + "step": 15300 + }, + { + "epoch": 2.4461120868593325, + "grad_norm": 0.19579885958359836, + "learning_rate": 2.0058871681021087e-06, + "loss": 0.0433, + "step": 15320 + }, + { + "epoch": 2.4493054446750757, + "grad_norm": 0.2037719797424841, + "learning_rate": 1.983619291751716e-06, + "loss": 0.0445, + "step": 15340 + }, + { + "epoch": 2.452498802490819, + "grad_norm": 0.2288507482341902, + "learning_rate": 1.961462090611673e-06, + "loss": 0.0445, + "step": 15360 + }, + { + "epoch": 2.4556921603065622, + "grad_norm": 0.18192991033918157, + "learning_rate": 1.9394158705872244e-06, + "loss": 0.0453, + "step": 15380 + }, + { + "epoch": 2.4588855181223055, + "grad_norm": 0.2180936188857526, + "learning_rate": 1.9174809360513935e-06, + "loss": 0.045, + "step": 15400 + }, + { + "epoch": 2.4620788759380487, + "grad_norm": 0.1894861914106852, + "learning_rate": 1.8956575898407847e-06, + "loss": 0.0464, + "step": 15420 + }, + { + "epoch": 2.465272233753792, + "grad_norm": 0.2021847245639915, + "learning_rate": 1.8739461332513953e-06, + "loss": 0.0459, + "step": 15440 + }, + { + "epoch": 2.4684655915695353, + "grad_norm": 0.1992201840351267, + "learning_rate": 1.85234686603446e-06, + "loss": 0.044, + "step": 15460 + }, + { + "epoch": 2.4716589493852785, + "grad_norm": 0.18202769181733872, + "learning_rate": 1.8308600863923164e-06, + "loss": 0.0464, + "step": 15480 + }, + { + "epoch": 2.474852307201022, + "grad_norm": 0.17956705043459079, + "learning_rate": 1.8094860909742795e-06, + "loss": 0.0457, + "step": 15500 + }, + { + "epoch": 2.478045665016765, + "grad_norm": 0.1780847660838803, + "learning_rate": 1.78822517487255e-06, + "loss": 0.044, + "step": 15520 + }, + { + "epoch": 2.4812390228325083, + "grad_norm": 0.19200813107543122, + "learning_rate": 1.7670776316181427e-06, + "loss": 0.0432, + "step": 15540 + }, + { + "epoch": 2.4844323806482516, + "grad_norm": 0.2516917996505797, + "learning_rate": 1.746043753176836e-06, + "loss": 0.0448, + "step": 15560 + }, + { + "epoch": 2.487625738463995, + "grad_norm": 0.17194174394098138, + "learning_rate": 1.7251238299451301e-06, + "loss": 0.0449, + "step": 15580 + }, + { + "epoch": 2.490819096279738, + "grad_norm": 0.17011442140145003, + "learning_rate": 1.7043181507462448e-06, + "loss": 0.0457, + "step": 15600 + }, + { + "epoch": 2.4940124540954813, + "grad_norm": 0.17376564573157416, + "learning_rate": 1.6836270028261326e-06, + "loss": 0.0446, + "step": 15620 + }, + { + "epoch": 2.4972058119112246, + "grad_norm": 0.2600424543600025, + "learning_rate": 1.66305067184952e-06, + "loss": 0.0435, + "step": 15640 + }, + { + "epoch": 2.500399169726968, + "grad_norm": 0.1728773334170149, + "learning_rate": 1.6425894418959433e-06, + "loss": 0.0444, + "step": 15660 + }, + { + "epoch": 2.503592527542711, + "grad_norm": 0.2117397902480935, + "learning_rate": 1.6222435954558435e-06, + "loss": 0.0424, + "step": 15680 + }, + { + "epoch": 2.5067858853584544, + "grad_norm": 0.20379918000728395, + "learning_rate": 1.6020134134266674e-06, + "loss": 0.0449, + "step": 15700 + }, + { + "epoch": 2.5099792431741976, + "grad_norm": 0.3110350981628874, + "learning_rate": 1.5818991751089762e-06, + "loss": 0.0434, + "step": 15720 + }, + { + "epoch": 2.513172600989941, + "grad_norm": 0.18429144606858047, + "learning_rate": 1.5619011582025988e-06, + "loss": 0.0439, + "step": 15740 + }, + { + "epoch": 2.516365958805684, + "grad_norm": 0.1756584956115843, + "learning_rate": 1.5420196388027963e-06, + "loss": 0.0423, + "step": 15760 + }, + { + "epoch": 2.5195593166214274, + "grad_norm": 0.18747969624165203, + "learning_rate": 1.5222548913964508e-06, + "loss": 0.0432, + "step": 15780 + }, + { + "epoch": 2.5227526744371707, + "grad_norm": 0.17351521964113906, + "learning_rate": 1.5026071888582771e-06, + "loss": 0.0428, + "step": 15800 + }, + { + "epoch": 2.525946032252914, + "grad_norm": 0.1763855716931325, + "learning_rate": 1.4830768024470487e-06, + "loss": 0.0437, + "step": 15820 + }, + { + "epoch": 2.529139390068657, + "grad_norm": 0.19172367578038851, + "learning_rate": 1.4636640018018556e-06, + "loss": 0.0436, + "step": 15840 + }, + { + "epoch": 2.5323327478844004, + "grad_norm": 0.18955098367053075, + "learning_rate": 1.4443690549383904e-06, + "loss": 0.0422, + "step": 15860 + }, + { + "epoch": 2.5355261057001437, + "grad_norm": 0.2062297852474484, + "learning_rate": 1.4251922282452356e-06, + "loss": 0.0423, + "step": 15880 + }, + { + "epoch": 2.538719463515887, + "grad_norm": 0.184016665131291, + "learning_rate": 1.4061337864801916e-06, + "loss": 0.0441, + "step": 15900 + }, + { + "epoch": 2.54191282133163, + "grad_norm": 0.21880976113017805, + "learning_rate": 1.3871939927666189e-06, + "loss": 0.046, + "step": 15920 + }, + { + "epoch": 2.5451061791473735, + "grad_norm": 0.17335074095350983, + "learning_rate": 1.3683731085898144e-06, + "loss": 0.0441, + "step": 15940 + }, + { + "epoch": 2.5482995369631167, + "grad_norm": 0.19234479041549446, + "learning_rate": 1.349671393793388e-06, + "loss": 0.0427, + "step": 15960 + }, + { + "epoch": 2.55149289477886, + "grad_norm": 0.18631232012636342, + "learning_rate": 1.3310891065756814e-06, + "loss": 0.0435, + "step": 15980 + }, + { + "epoch": 2.5546862525946032, + "grad_norm": 0.19243767802224285, + "learning_rate": 1.3126265034862084e-06, + "loss": 0.0441, + "step": 16000 + }, + { + "epoch": 2.5578796104103465, + "grad_norm": 0.22553668043830372, + "learning_rate": 1.2942838394221002e-06, + "loss": 0.0438, + "step": 16020 + }, + { + "epoch": 2.5610729682260898, + "grad_norm": 0.2414806098978672, + "learning_rate": 1.2760613676246037e-06, + "loss": 0.0455, + "step": 16040 + }, + { + "epoch": 2.564266326041833, + "grad_norm": 0.17562297042382372, + "learning_rate": 1.2579593396755652e-06, + "loss": 0.0437, + "step": 16060 + }, + { + "epoch": 2.5674596838575763, + "grad_norm": 0.1714929007989254, + "learning_rate": 1.2399780054939758e-06, + "loss": 0.0435, + "step": 16080 + }, + { + "epoch": 2.5706530416733195, + "grad_norm": 0.18944429187488632, + "learning_rate": 1.2221176133325097e-06, + "loss": 0.0432, + "step": 16100 + }, + { + "epoch": 2.573846399489063, + "grad_norm": 0.18830587754770226, + "learning_rate": 1.2043784097740951e-06, + "loss": 0.044, + "step": 16120 + }, + { + "epoch": 2.577039757304806, + "grad_norm": 0.20515213794452525, + "learning_rate": 1.1867606397285191e-06, + "loss": 0.0444, + "step": 16140 + }, + { + "epoch": 2.5802331151205493, + "grad_norm": 0.2068320912840683, + "learning_rate": 1.1692645464290441e-06, + "loss": 0.0443, + "step": 16160 + }, + { + "epoch": 2.5834264729362926, + "grad_norm": 0.2065451583149461, + "learning_rate": 1.151890371429042e-06, + "loss": 0.0447, + "step": 16180 + }, + { + "epoch": 2.586619830752036, + "grad_norm": 0.20955876801496184, + "learning_rate": 1.1346383545986629e-06, + "loss": 0.043, + "step": 16200 + }, + { + "epoch": 2.589813188567779, + "grad_norm": 0.18475336946843543, + "learning_rate": 1.117508734121535e-06, + "loss": 0.0439, + "step": 16220 + }, + { + "epoch": 2.5930065463835223, + "grad_norm": 0.19250755490602636, + "learning_rate": 1.1005017464914568e-06, + "loss": 0.0431, + "step": 16240 + }, + { + "epoch": 2.5961999041992656, + "grad_norm": 0.2138444193531275, + "learning_rate": 1.0836176265091448e-06, + "loss": 0.0447, + "step": 16260 + }, + { + "epoch": 2.599393262015009, + "grad_norm": 0.19283181561318452, + "learning_rate": 1.0668566072789876e-06, + "loss": 0.0434, + "step": 16280 + }, + { + "epoch": 2.602586619830752, + "grad_norm": 0.19258136254237682, + "learning_rate": 1.05021892020583e-06, + "loss": 0.0452, + "step": 16300 + }, + { + "epoch": 2.6057799776464954, + "grad_norm": 0.239296573931001, + "learning_rate": 1.0337047949917777e-06, + "loss": 0.0432, + "step": 16320 + }, + { + "epoch": 2.6089733354622386, + "grad_norm": 0.18442185794546465, + "learning_rate": 1.0173144596330231e-06, + "loss": 0.0439, + "step": 16340 + }, + { + "epoch": 2.612166693277982, + "grad_norm": 0.17759720874685755, + "learning_rate": 1.0010481404166972e-06, + "loss": 0.0434, + "step": 16360 + }, + { + "epoch": 2.615360051093725, + "grad_norm": 0.1999834786965281, + "learning_rate": 9.849060619177553e-07, + "loss": 0.0446, + "step": 16380 + }, + { + "epoch": 2.6185534089094684, + "grad_norm": 0.21313365667220596, + "learning_rate": 9.688884469958604e-07, + "loss": 0.0434, + "step": 16400 + }, + { + "epoch": 2.6217467667252117, + "grad_norm": 0.19320209419752543, + "learning_rate": 9.5299551679232e-07, + "loss": 0.0445, + "step": 16420 + }, + { + "epoch": 2.624940124540955, + "grad_norm": 0.17847623577962735, + "learning_rate": 9.372274907270251e-07, + "loss": 0.0437, + "step": 16440 + }, + { + "epoch": 2.628133482356698, + "grad_norm": 0.23166885515187532, + "learning_rate": 9.215845864954287e-07, + "loss": 0.0419, + "step": 16460 + }, + { + "epoch": 2.6313268401724415, + "grad_norm": 0.18325681984081477, + "learning_rate": 9.060670200655286e-07, + "loss": 0.0439, + "step": 16480 + }, + { + "epoch": 2.6345201979881847, + "grad_norm": 0.20540975477642068, + "learning_rate": 8.906750056748947e-07, + "loss": 0.0448, + "step": 16500 + }, + { + "epoch": 2.637713555803928, + "grad_norm": 0.1786617783763284, + "learning_rate": 8.754087558277113e-07, + "loss": 0.0444, + "step": 16520 + }, + { + "epoch": 2.6409069136196712, + "grad_norm": 0.1901267431080617, + "learning_rate": 8.602684812918416e-07, + "loss": 0.0438, + "step": 16540 + }, + { + "epoch": 2.6441002714354145, + "grad_norm": 0.18259614623005302, + "learning_rate": 8.452543910959121e-07, + "loss": 0.0432, + "step": 16560 + }, + { + "epoch": 2.6472936292511577, + "grad_norm": 0.18713135077039142, + "learning_rate": 8.303666925264331e-07, + "loss": 0.0437, + "step": 16580 + }, + { + "epoch": 2.650486987066901, + "grad_norm": 0.1801858452235725, + "learning_rate": 8.156055911249394e-07, + "loss": 0.0448, + "step": 16600 + }, + { + "epoch": 2.6536803448826443, + "grad_norm": 0.17771380124624228, + "learning_rate": 8.00971290685143e-07, + "loss": 0.0445, + "step": 16620 + }, + { + "epoch": 2.6568737026983875, + "grad_norm": 0.22250062270982698, + "learning_rate": 7.864639932501294e-07, + "loss": 0.0427, + "step": 16640 + }, + { + "epoch": 2.6600670605141303, + "grad_norm": 0.20866465188062733, + "learning_rate": 7.720838991095602e-07, + "loss": 0.0427, + "step": 16660 + }, + { + "epoch": 2.663260418329874, + "grad_norm": 0.2055356708135395, + "learning_rate": 7.578312067969162e-07, + "loss": 0.043, + "step": 16680 + }, + { + "epoch": 2.666453776145617, + "grad_norm": 0.20698005060615937, + "learning_rate": 7.437061130867473e-07, + "loss": 0.0442, + "step": 16700 + }, + { + "epoch": 2.6696471339613606, + "grad_norm": 0.20876117607511466, + "learning_rate": 7.297088129919616e-07, + "loss": 0.0498, + "step": 16720 + }, + { + "epoch": 2.6728404917771034, + "grad_norm": 0.24032862358776724, + "learning_rate": 7.158394997611329e-07, + "loss": 0.0429, + "step": 16740 + }, + { + "epoch": 2.676033849592847, + "grad_norm": 0.20969273760927634, + "learning_rate": 7.020983648758318e-07, + "loss": 0.0447, + "step": 16760 + }, + { + "epoch": 2.67922720740859, + "grad_norm": 0.2174374325052259, + "learning_rate": 6.884855980479777e-07, + "loss": 0.0452, + "step": 16780 + }, + { + "epoch": 2.6824205652243336, + "grad_norm": 0.18004577133887417, + "learning_rate": 6.750013872172301e-07, + "loss": 0.0438, + "step": 16800 + }, + { + "epoch": 2.6856139230400764, + "grad_norm": 0.2035569950209219, + "learning_rate": 6.616459185483793e-07, + "loss": 0.0438, + "step": 16820 + }, + { + "epoch": 2.68880728085582, + "grad_norm": 0.20132465630515528, + "learning_rate": 6.484193764287938e-07, + "loss": 0.0445, + "step": 16840 + }, + { + "epoch": 2.692000638671563, + "grad_norm": 0.1712570311869676, + "learning_rate": 6.353219434658587e-07, + "loss": 0.0432, + "step": 16860 + }, + { + "epoch": 2.6951939964873066, + "grad_norm": 0.19144286472815933, + "learning_rate": 6.223538004844587e-07, + "loss": 0.0426, + "step": 16880 + }, + { + "epoch": 2.6983873543030494, + "grad_norm": 0.1761969500556086, + "learning_rate": 6.095151265244937e-07, + "loss": 0.0436, + "step": 16900 + }, + { + "epoch": 2.701580712118793, + "grad_norm": 0.18412941719997428, + "learning_rate": 5.968060988383884e-07, + "loss": 0.0419, + "step": 16920 + }, + { + "epoch": 2.704774069934536, + "grad_norm": 0.2088468477123862, + "learning_rate": 5.842268928886563e-07, + "loss": 0.0435, + "step": 16940 + }, + { + "epoch": 2.7079674277502797, + "grad_norm": 0.21087568774149862, + "learning_rate": 5.717776823454746e-07, + "loss": 0.0434, + "step": 16960 + }, + { + "epoch": 2.7111607855660225, + "grad_norm": 0.20533012449268137, + "learning_rate": 5.594586390842915e-07, + "loss": 0.0436, + "step": 16980 + }, + { + "epoch": 2.714354143381766, + "grad_norm": 0.23130477787372275, + "learning_rate": 5.472699331834408e-07, + "loss": 0.0434, + "step": 17000 + }, + { + "epoch": 2.717547501197509, + "grad_norm": 0.19246797825033052, + "learning_rate": 5.352117329218065e-07, + "loss": 0.0443, + "step": 17020 + }, + { + "epoch": 2.7207408590132527, + "grad_norm": 0.19825650332574749, + "learning_rate": 5.23284204776493e-07, + "loss": 0.0432, + "step": 17040 + }, + { + "epoch": 2.7239342168289955, + "grad_norm": 0.19435989820475502, + "learning_rate": 5.1148751342053e-07, + "loss": 0.0437, + "step": 17060 + }, + { + "epoch": 2.727127574644739, + "grad_norm": 0.17105286427984273, + "learning_rate": 4.998218217205941e-07, + "loss": 0.0431, + "step": 17080 + }, + { + "epoch": 2.730320932460482, + "grad_norm": 0.2076555517606956, + "learning_rate": 4.882872907347657e-07, + "loss": 0.0441, + "step": 17100 + }, + { + "epoch": 2.7335142902762257, + "grad_norm": 0.17467573768445724, + "learning_rate": 4.768840797103014e-07, + "loss": 0.0426, + "step": 17120 + }, + { + "epoch": 2.7367076480919685, + "grad_norm": 0.23656714472082974, + "learning_rate": 4.6561234608143993e-07, + "loss": 0.0442, + "step": 17140 + }, + { + "epoch": 2.739901005907712, + "grad_norm": 0.1991265479506836, + "learning_rate": 4.544722454672223e-07, + "loss": 0.0443, + "step": 17160 + }, + { + "epoch": 2.743094363723455, + "grad_norm": 0.16764542580219924, + "learning_rate": 4.434639316693479e-07, + "loss": 0.0441, + "step": 17180 + }, + { + "epoch": 2.7462877215391983, + "grad_norm": 0.18540914909816514, + "learning_rate": 4.3258755667005104e-07, + "loss": 0.0427, + "step": 17200 + }, + { + "epoch": 2.7494810793549416, + "grad_norm": 0.16756011986354746, + "learning_rate": 4.218432706300013e-07, + "loss": 0.0442, + "step": 17220 + }, + { + "epoch": 2.752674437170685, + "grad_norm": 0.19477880662403543, + "learning_rate": 4.1123122188623024e-07, + "loss": 0.0419, + "step": 17240 + }, + { + "epoch": 2.755867794986428, + "grad_norm": 0.16692137735923454, + "learning_rate": 4.0075155695008193e-07, + "loss": 0.0439, + "step": 17260 + }, + { + "epoch": 2.7590611528021713, + "grad_norm": 0.27371092487152754, + "learning_rate": 3.904044205051938e-07, + "loss": 0.0415, + "step": 17280 + }, + { + "epoch": 2.7622545106179146, + "grad_norm": 0.1730044575542229, + "learning_rate": 3.801899554055011e-07, + "loss": 0.0434, + "step": 17300 + }, + { + "epoch": 2.765447868433658, + "grad_norm": 0.2957249889697754, + "learning_rate": 3.7010830267325546e-07, + "loss": 0.0432, + "step": 17320 + }, + { + "epoch": 2.768641226249401, + "grad_norm": 0.20211132503418788, + "learning_rate": 3.601596014970843e-07, + "loss": 0.0448, + "step": 17340 + }, + { + "epoch": 2.7718345840651444, + "grad_norm": 0.2192148080396869, + "learning_rate": 3.5034398923007195e-07, + "loss": 0.0429, + "step": 17360 + }, + { + "epoch": 2.7750279418808876, + "grad_norm": 0.19416701667619607, + "learning_rate": 3.40661601387855e-07, + "loss": 0.0442, + "step": 17380 + }, + { + "epoch": 2.778221299696631, + "grad_norm": 0.2194341949029401, + "learning_rate": 3.311125716467578e-07, + "loss": 0.0451, + "step": 17400 + }, + { + "epoch": 2.781414657512374, + "grad_norm": 0.23997919053006325, + "learning_rate": 3.216970318419488e-07, + "loss": 0.0433, + "step": 17420 + }, + { + "epoch": 2.7846080153281174, + "grad_norm": 0.20048137685529088, + "learning_rate": 3.1241511196561045e-07, + "loss": 0.0436, + "step": 17440 + }, + { + "epoch": 2.7878013731438607, + "grad_norm": 0.18418386343058352, + "learning_rate": 3.0326694016515555e-07, + "loss": 0.0431, + "step": 17460 + }, + { + "epoch": 2.790994730959604, + "grad_norm": 0.18647531186123847, + "learning_rate": 2.9425264274144937e-07, + "loss": 0.0441, + "step": 17480 + }, + { + "epoch": 2.794188088775347, + "grad_norm": 0.18103520276457064, + "learning_rate": 2.8537234414707573e-07, + "loss": 0.0424, + "step": 17500 + }, + { + "epoch": 2.7973814465910904, + "grad_norm": 0.175838788085868, + "learning_rate": 2.766261669846071e-07, + "loss": 0.0428, + "step": 17520 + }, + { + "epoch": 2.8005748044068337, + "grad_norm": 0.18597288140297774, + "learning_rate": 2.680142320049195e-07, + "loss": 0.0461, + "step": 17540 + }, + { + "epoch": 2.803768162222577, + "grad_norm": 0.19306825995055335, + "learning_rate": 2.5953665810552586e-07, + "loss": 0.0432, + "step": 17560 + }, + { + "epoch": 2.8069615200383202, + "grad_norm": 0.19244074182083917, + "learning_rate": 2.5119356232892965e-07, + "loss": 0.0447, + "step": 17580 + }, + { + "epoch": 2.8101548778540635, + "grad_norm": 0.20041935845397732, + "learning_rate": 2.4298505986101397e-07, + "loss": 0.0417, + "step": 17600 + }, + { + "epoch": 2.8133482356698067, + "grad_norm": 0.1897352035064278, + "learning_rate": 2.3491126402944597e-07, + "loss": 0.0447, + "step": 17620 + }, + { + "epoch": 2.81654159348555, + "grad_norm": 0.1859749113332233, + "learning_rate": 2.269722863021162e-07, + "loss": 0.0441, + "step": 17640 + }, + { + "epoch": 2.8197349513012933, + "grad_norm": 0.18154530556190202, + "learning_rate": 2.191682362856018e-07, + "loss": 0.0449, + "step": 17660 + }, + { + "epoch": 2.8229283091170365, + "grad_norm": 0.19576462753720822, + "learning_rate": 2.1149922172364557e-07, + "loss": 0.043, + "step": 17680 + }, + { + "epoch": 2.8261216669327798, + "grad_norm": 0.19317600637380156, + "learning_rate": 2.0396534849567384e-07, + "loss": 0.0435, + "step": 17700 + }, + { + "epoch": 2.829315024748523, + "grad_norm": 0.18270539963789229, + "learning_rate": 1.9656672061533876e-07, + "loss": 0.0448, + "step": 17720 + }, + { + "epoch": 2.8325083825642663, + "grad_norm": 0.25190362174641373, + "learning_rate": 1.8930344022907055e-07, + "loss": 0.0433, + "step": 17740 + }, + { + "epoch": 2.8357017403800096, + "grad_norm": 0.19271629305777457, + "learning_rate": 1.8217560761467744e-07, + "loss": 0.0442, + "step": 17760 + }, + { + "epoch": 2.838895098195753, + "grad_norm": 0.6386981477198299, + "learning_rate": 1.7518332117995695e-07, + "loss": 0.0431, + "step": 17780 + }, + { + "epoch": 2.842088456011496, + "grad_norm": 0.20346250081845807, + "learning_rate": 1.6832667746134236e-07, + "loss": 0.0422, + "step": 17800 + }, + { + "epoch": 2.8452818138272393, + "grad_norm": 0.17777460027714007, + "learning_rate": 1.6160577112255827e-07, + "loss": 0.0425, + "step": 17820 + }, + { + "epoch": 2.8484751716429826, + "grad_norm": 0.255413137859645, + "learning_rate": 1.5502069495332616e-07, + "loss": 0.0435, + "step": 17840 + }, + { + "epoch": 2.851668529458726, + "grad_norm": 0.19607428087584267, + "learning_rate": 1.4857153986807649e-07, + "loss": 0.0418, + "step": 17860 + }, + { + "epoch": 2.854861887274469, + "grad_norm": 0.1780772888139799, + "learning_rate": 1.4225839490469628e-07, + "loss": 0.0427, + "step": 17880 + }, + { + "epoch": 2.8580552450902124, + "grad_norm": 0.21241047060680943, + "learning_rate": 1.3608134722329803e-07, + "loss": 0.0437, + "step": 17900 + }, + { + "epoch": 2.8612486029059556, + "grad_norm": 0.19239115510673255, + "learning_rate": 1.3004048210501718e-07, + "loss": 0.0434, + "step": 17920 + }, + { + "epoch": 2.864441960721699, + "grad_norm": 0.18795522932841213, + "learning_rate": 1.2413588295083656e-07, + "loss": 0.0431, + "step": 17940 + }, + { + "epoch": 2.867635318537442, + "grad_norm": 0.18585931164828967, + "learning_rate": 1.183676312804305e-07, + "loss": 0.0442, + "step": 17960 + }, + { + "epoch": 2.8708286763531854, + "grad_norm": 0.18075501501439709, + "learning_rate": 1.1273580673104245e-07, + "loss": 0.0444, + "step": 17980 + }, + { + "epoch": 2.8740220341689287, + "grad_norm": 0.19563735408076433, + "learning_rate": 1.072404870563859e-07, + "loss": 0.0447, + "step": 18000 + }, + { + "epoch": 2.877215391984672, + "grad_norm": 0.19825850897569677, + "learning_rate": 1.0188174812557073e-07, + "loss": 0.0439, + "step": 18020 + }, + { + "epoch": 2.880408749800415, + "grad_norm": 0.17410835997084562, + "learning_rate": 9.665966392205295e-08, + "loss": 0.0446, + "step": 18040 + }, + { + "epoch": 2.8836021076161584, + "grad_norm": 0.17894750425194603, + "learning_rate": 9.157430654261778e-08, + "loss": 0.0444, + "step": 18060 + }, + { + "epoch": 2.8867954654319017, + "grad_norm": 0.1932898053763739, + "learning_rate": 8.662574619637931e-08, + "loss": 0.043, + "step": 18080 + }, + { + "epoch": 2.889988823247645, + "grad_norm": 0.19451425195215136, + "learning_rate": 8.18140512038157e-08, + "loss": 0.0428, + "step": 18100 + }, + { + "epoch": 2.893182181063388, + "grad_norm": 0.18451759369547344, + "learning_rate": 7.713928799582215e-08, + "loss": 0.0443, + "step": 18120 + }, + { + "epoch": 2.8963755388791315, + "grad_norm": 0.21235909068408473, + "learning_rate": 7.260152111279839e-08, + "loss": 0.0443, + "step": 18140 + }, + { + "epoch": 2.8995688966948747, + "grad_norm": 0.18028750928095402, + "learning_rate": 6.82008132037515e-08, + "loss": 0.0425, + "step": 18160 + }, + { + "epoch": 2.902762254510618, + "grad_norm": 0.1865997727595832, + "learning_rate": 6.393722502543665e-08, + "loss": 0.045, + "step": 18180 + }, + { + "epoch": 2.9059556123263612, + "grad_norm": 0.18553943543624984, + "learning_rate": 5.981081544151446e-08, + "loss": 0.0428, + "step": 18200 + }, + { + "epoch": 2.9091489701421045, + "grad_norm": 0.19032355954882516, + "learning_rate": 5.5821641421741625e-08, + "loss": 0.0443, + "step": 18220 + }, + { + "epoch": 2.9123423279578478, + "grad_norm": 0.18084808651831624, + "learning_rate": 5.196975804117932e-08, + "loss": 0.0435, + "step": 18240 + }, + { + "epoch": 2.915535685773591, + "grad_norm": 0.21753074838538441, + "learning_rate": 4.825521847944048e-08, + "loss": 0.0418, + "step": 18260 + }, + { + "epoch": 2.9187290435893343, + "grad_norm": 0.1883119176872824, + "learning_rate": 4.467807401994706e-08, + "loss": 0.0426, + "step": 18280 + }, + { + "epoch": 2.9219224014050775, + "grad_norm": 0.17894355455146954, + "learning_rate": 4.123837404922726e-08, + "loss": 0.0429, + "step": 18300 + }, + { + "epoch": 2.925115759220821, + "grad_norm": 0.20477512942702414, + "learning_rate": 3.7936166056233845e-08, + "loss": 0.0421, + "step": 18320 + }, + { + "epoch": 2.928309117036564, + "grad_norm": 0.17982986336579576, + "learning_rate": 3.4771495631686914e-08, + "loss": 0.0433, + "step": 18340 + }, + { + "epoch": 2.9315024748523073, + "grad_norm": 0.19778942398473365, + "learning_rate": 3.174440646744326e-08, + "loss": 0.0434, + "step": 18360 + }, + { + "epoch": 2.9346958326680506, + "grad_norm": 0.1840797815880338, + "learning_rate": 2.8854940355895756e-08, + "loss": 0.0422, + "step": 18380 + }, + { + "epoch": 2.937889190483794, + "grad_norm": 0.20492139151779767, + "learning_rate": 2.6103137189394945e-08, + "loss": 0.0433, + "step": 18400 + }, + { + "epoch": 2.941082548299537, + "grad_norm": 0.18649980327789625, + "learning_rate": 2.3489034959698342e-08, + "loss": 0.0423, + "step": 18420 + }, + { + "epoch": 2.9442759061152803, + "grad_norm": 0.18710560587786274, + "learning_rate": 2.1012669757446423e-08, + "loss": 0.0447, + "step": 18440 + }, + { + "epoch": 2.9474692639310236, + "grad_norm": 0.1950435200815635, + "learning_rate": 1.8674075771665246e-08, + "loss": 0.0441, + "step": 18460 + }, + { + "epoch": 2.950662621746767, + "grad_norm": 0.23718279280034166, + "learning_rate": 1.647328528929126e-08, + "loss": 0.0443, + "step": 18480 + }, + { + "epoch": 2.95385597956251, + "grad_norm": 0.1828813035697597, + "learning_rate": 1.441032869472725e-08, + "loss": 0.0434, + "step": 18500 + }, + { + "epoch": 2.9570493373782534, + "grad_norm": 0.18330846523906766, + "learning_rate": 1.2485234469425955e-08, + "loss": 0.0447, + "step": 18520 + }, + { + "epoch": 2.9602426951939966, + "grad_norm": 0.19409145822202675, + "learning_rate": 1.0698029191491543e-08, + "loss": 0.0424, + "step": 18540 + }, + { + "epoch": 2.96343605300974, + "grad_norm": 0.19044949836984276, + "learning_rate": 9.048737535317654e-09, + "loss": 0.0421, + "step": 18560 + }, + { + "epoch": 2.966629410825483, + "grad_norm": 0.24441457630679606, + "learning_rate": 7.5373822712399e-09, + "loss": 0.0429, + "step": 18580 + }, + { + "epoch": 2.9698227686412264, + "grad_norm": 0.17238090085354812, + "learning_rate": 6.163984265230571e-09, + "loss": 0.0436, + "step": 18600 + }, + { + "epoch": 2.9730161264569697, + "grad_norm": 0.17724705729907833, + "learning_rate": 4.928562478603294e-09, + "loss": 0.0438, + "step": 18620 + }, + { + "epoch": 2.9762094842727125, + "grad_norm": 0.17813664105789478, + "learning_rate": 3.831133967754363e-09, + "loss": 0.0443, + "step": 18640 + }, + { + "epoch": 2.979402842088456, + "grad_norm": 0.19807197965691153, + "learning_rate": 2.8717138839262638e-09, + "loss": 0.0423, + "step": 18660 + }, + { + "epoch": 2.982596199904199, + "grad_norm": 0.20792602007313574, + "learning_rate": 2.050315473000053e-09, + "loss": 0.0437, + "step": 18680 + }, + { + "epoch": 2.9857895577199427, + "grad_norm": 0.18039436629311245, + "learning_rate": 1.3669500753099586e-09, + "loss": 0.0449, + "step": 18700 + }, + { + "epoch": 2.9889829155356855, + "grad_norm": 0.16777557295223433, + "learning_rate": 8.216271254901653e-10, + "loss": 0.0433, + "step": 18720 + }, + { + "epoch": 2.992176273351429, + "grad_norm": 0.21329586917274732, + "learning_rate": 4.1435415233936903e-10, + "loss": 0.0437, + "step": 18740 + }, + { + "epoch": 2.995369631167172, + "grad_norm": 0.17382850175198178, + "learning_rate": 1.451367787230762e-10, + "loss": 0.0434, + "step": 18760 + }, + { + "epoch": 2.9985629889829157, + "grad_norm": 0.19059012257580193, + "learning_rate": 1.3978721492557968e-11, + "loss": 0.0444, + "step": 18780 + } + ], + "logging_steps": 20, + "max_steps": 18789, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6401980526886912.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}