{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997152349311818, "eval_steps": 500, "global_step": 2633, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00037968675842429994, "grad_norm": 4.039186105207878, "learning_rate": 3.7878787878787882e-06, "loss": 3.1037, "step": 1 }, { "epoch": 0.0018984337921214998, "grad_norm": 3.978803446290656, "learning_rate": 1.893939393939394e-05, "loss": 3.0676, "step": 5 }, { "epoch": 0.0037968675842429997, "grad_norm": 3.4024459371591336, "learning_rate": 3.787878787878788e-05, "loss": 3.0058, "step": 10 }, { "epoch": 0.005695301376364499, "grad_norm": 0.769937135761119, "learning_rate": 5.681818181818182e-05, "loss": 2.7987, "step": 15 }, { "epoch": 0.007593735168485999, "grad_norm": 0.7081139470071718, "learning_rate": 7.575757575757576e-05, "loss": 2.6742, "step": 20 }, { "epoch": 0.009492168960607499, "grad_norm": 0.5663597400653296, "learning_rate": 9.46969696969697e-05, "loss": 2.5936, "step": 25 }, { "epoch": 0.011390602752728999, "grad_norm": 0.3713010333300208, "learning_rate": 0.00011363636363636364, "loss": 2.5012, "step": 30 }, { "epoch": 0.013289036544850499, "grad_norm": 0.4205375561732573, "learning_rate": 0.00013257575757575756, "loss": 2.433, "step": 35 }, { "epoch": 0.015187470336971999, "grad_norm": 0.587958832384919, "learning_rate": 0.00015151515151515152, "loss": 2.4168, "step": 40 }, { "epoch": 0.0170859041290935, "grad_norm": 0.6677663506298054, "learning_rate": 0.00017045454545454544, "loss": 2.3662, "step": 45 }, { "epoch": 0.018984337921214997, "grad_norm": 0.7064561950273625, "learning_rate": 0.0001893939393939394, "loss": 2.359, "step": 50 }, { "epoch": 0.0208827717133365, "grad_norm": 0.35917369578708264, "learning_rate": 0.00020833333333333335, "loss": 2.3354, "step": 55 }, { "epoch": 0.022781205505457997, "grad_norm": 0.39187189416502427, "learning_rate": 0.00022727272727272727, "loss": 2.2868, "step": 60 }, { "epoch": 0.024679639297579496, "grad_norm": 0.3470459858223155, "learning_rate": 0.0002462121212121212, "loss": 2.2955, "step": 65 }, { "epoch": 0.026578073089700997, "grad_norm": 0.6714289041162775, "learning_rate": 0.0002651515151515151, "loss": 2.2607, "step": 70 }, { "epoch": 0.028476506881822496, "grad_norm": 0.43889084397376665, "learning_rate": 0.00028409090909090913, "loss": 2.2774, "step": 75 }, { "epoch": 0.030374940673943997, "grad_norm": 0.35987531792307215, "learning_rate": 0.00030303030303030303, "loss": 2.2579, "step": 80 }, { "epoch": 0.0322733744660655, "grad_norm": 0.45583566123152425, "learning_rate": 0.000321969696969697, "loss": 2.2278, "step": 85 }, { "epoch": 0.034171808258187, "grad_norm": 0.5495955157286524, "learning_rate": 0.0003409090909090909, "loss": 2.2217, "step": 90 }, { "epoch": 0.036070242050308496, "grad_norm": 0.5181443647177656, "learning_rate": 0.0003598484848484849, "loss": 2.2183, "step": 95 }, { "epoch": 0.037968675842429994, "grad_norm": 0.969821145035109, "learning_rate": 0.0003787878787878788, "loss": 2.2112, "step": 100 }, { "epoch": 0.03986710963455149, "grad_norm": 0.2635501239040389, "learning_rate": 0.00039772727272727274, "loss": 2.2081, "step": 105 }, { "epoch": 0.041765543426673, "grad_norm": 0.5715349391832327, "learning_rate": 0.0004166666666666667, "loss": 2.1948, "step": 110 }, { "epoch": 0.043663977218794496, "grad_norm": 0.5248994310808021, "learning_rate": 0.0004356060606060606, "loss": 2.1855, "step": 115 }, { "epoch": 0.045562411010915994, "grad_norm": 0.5608654765973496, "learning_rate": 0.00045454545454545455, "loss": 2.1841, "step": 120 }, { "epoch": 0.04746084480303749, "grad_norm": 0.379235271981335, "learning_rate": 0.0004734848484848485, "loss": 2.1668, "step": 125 }, { "epoch": 0.04935927859515899, "grad_norm": 0.6934486639382003, "learning_rate": 0.0004924242424242425, "loss": 2.1799, "step": 130 }, { "epoch": 0.051257712387280496, "grad_norm": 0.32726936547789276, "learning_rate": 0.0005113636363636364, "loss": 2.1539, "step": 135 }, { "epoch": 0.053156146179401995, "grad_norm": 0.4103864066901133, "learning_rate": 0.0005303030303030302, "loss": 2.1559, "step": 140 }, { "epoch": 0.05505457997152349, "grad_norm": 0.6157950630213448, "learning_rate": 0.0005492424242424242, "loss": 2.1297, "step": 145 }, { "epoch": 0.05695301376364499, "grad_norm": 1.4548659360346616, "learning_rate": 0.0005681818181818183, "loss": 2.161, "step": 150 }, { "epoch": 0.05885144755576649, "grad_norm": 0.6872944738590898, "learning_rate": 0.0005871212121212122, "loss": 2.1462, "step": 155 }, { "epoch": 0.060749881347887995, "grad_norm": 1.1748112715233312, "learning_rate": 0.0006060606060606061, "loss": 2.1558, "step": 160 }, { "epoch": 0.06264831514000949, "grad_norm": 0.3342527623973018, "learning_rate": 0.000625, "loss": 2.1381, "step": 165 }, { "epoch": 0.064546748932131, "grad_norm": 0.3090958929703956, "learning_rate": 0.000643939393939394, "loss": 2.1311, "step": 170 }, { "epoch": 0.0664451827242525, "grad_norm": 0.4445838850369956, "learning_rate": 0.0006628787878787878, "loss": 2.109, "step": 175 }, { "epoch": 0.068343616516374, "grad_norm": 0.40447800943933176, "learning_rate": 0.0006818181818181818, "loss": 2.1338, "step": 180 }, { "epoch": 0.0702420503084955, "grad_norm": 0.5270438533785088, "learning_rate": 0.0007007575757575758, "loss": 2.1221, "step": 185 }, { "epoch": 0.07214048410061699, "grad_norm": 1.1988668583819744, "learning_rate": 0.0007196969696969698, "loss": 2.0986, "step": 190 }, { "epoch": 0.07403891789273849, "grad_norm": 0.5173087158849319, "learning_rate": 0.0007386363636363636, "loss": 2.097, "step": 195 }, { "epoch": 0.07593735168485999, "grad_norm": 0.8713149911927134, "learning_rate": 0.0007575757575757576, "loss": 2.1171, "step": 200 }, { "epoch": 0.07783578547698149, "grad_norm": 0.43763842661820435, "learning_rate": 0.0007765151515151515, "loss": 2.1231, "step": 205 }, { "epoch": 0.07973421926910298, "grad_norm": 0.37755079956108445, "learning_rate": 0.0007954545454545455, "loss": 2.0974, "step": 210 }, { "epoch": 0.08163265306122448, "grad_norm": 0.3202651052626521, "learning_rate": 0.0008143939393939394, "loss": 2.1061, "step": 215 }, { "epoch": 0.083531086853346, "grad_norm": 1.0697394978550765, "learning_rate": 0.0008333333333333334, "loss": 2.1058, "step": 220 }, { "epoch": 0.0854295206454675, "grad_norm": 0.9419134947997803, "learning_rate": 0.0008522727272727273, "loss": 2.0832, "step": 225 }, { "epoch": 0.08732795443758899, "grad_norm": 0.3177328584436459, "learning_rate": 0.0008712121212121212, "loss": 2.0936, "step": 230 }, { "epoch": 0.08922638822971049, "grad_norm": 0.4260910025738285, "learning_rate": 0.0008901515151515151, "loss": 2.0836, "step": 235 }, { "epoch": 0.09112482202183199, "grad_norm": 0.37197183427696595, "learning_rate": 0.0009090909090909091, "loss": 2.0705, "step": 240 }, { "epoch": 0.09302325581395349, "grad_norm": 0.3469428530219821, "learning_rate": 0.000928030303030303, "loss": 2.0851, "step": 245 }, { "epoch": 0.09492168960607499, "grad_norm": 0.5432864309293935, "learning_rate": 0.000946969696969697, "loss": 2.0601, "step": 250 }, { "epoch": 0.09682012339819648, "grad_norm": 0.30271756878451744, "learning_rate": 0.000965909090909091, "loss": 2.068, "step": 255 }, { "epoch": 0.09871855719031798, "grad_norm": 0.613099552832842, "learning_rate": 0.000984848484848485, "loss": 2.0889, "step": 260 }, { "epoch": 0.1006169909824395, "grad_norm": 0.2879329961644459, "learning_rate": 0.000999999560347478, "loss": 2.0728, "step": 265 }, { "epoch": 0.10251542477456099, "grad_norm": 0.3522360779077239, "learning_rate": 0.000999984172590384, "loss": 2.0778, "step": 270 }, { "epoch": 0.10441385856668249, "grad_norm": 0.8444049085073484, "learning_rate": 0.0009999468029803513, "loss": 2.0645, "step": 275 }, { "epoch": 0.10631229235880399, "grad_norm": 0.2340268385510596, "learning_rate": 0.0009998874531603381, "loss": 2.0424, "step": 280 }, { "epoch": 0.10821072615092549, "grad_norm": 0.3788090273684815, "learning_rate": 0.0009998061257396652, "loss": 2.0373, "step": 285 }, { "epoch": 0.11010915994304699, "grad_norm": 0.4712223089490736, "learning_rate": 0.0009997028242939002, "loss": 2.0535, "step": 290 }, { "epoch": 0.11200759373516848, "grad_norm": 0.5222779081868865, "learning_rate": 0.0009995775533647015, "loss": 2.0383, "step": 295 }, { "epoch": 0.11390602752728998, "grad_norm": 0.38807754757014135, "learning_rate": 0.0009994303184596178, "loss": 2.0536, "step": 300 }, { "epoch": 0.11580446131941148, "grad_norm": 0.5558959816060527, "learning_rate": 0.0009992611260518462, "loss": 2.0476, "step": 305 }, { "epoch": 0.11770289511153298, "grad_norm": 0.5297156243713772, "learning_rate": 0.0009990699835799469, "loss": 2.0182, "step": 310 }, { "epoch": 0.11960132890365449, "grad_norm": 0.8463883226148833, "learning_rate": 0.0009988568994475178, "loss": 2.0527, "step": 315 }, { "epoch": 0.12149976269577599, "grad_norm": 0.3181378787047677, "learning_rate": 0.0009986218830228234, "loss": 2.0253, "step": 320 }, { "epoch": 0.12339819648789749, "grad_norm": 0.3005739806261844, "learning_rate": 0.0009983649446383836, "loss": 2.023, "step": 325 }, { "epoch": 0.12529663028001897, "grad_norm": 0.7577732908772633, "learning_rate": 0.00099808609559052, "loss": 2.0081, "step": 330 }, { "epoch": 0.12719506407214048, "grad_norm": 0.3189810102860473, "learning_rate": 0.0009977853481388575, "loss": 2.0103, "step": 335 }, { "epoch": 0.129093497864262, "grad_norm": 0.6410939817239918, "learning_rate": 0.0009974627155057878, "loss": 2.0029, "step": 340 }, { "epoch": 0.13099193165638348, "grad_norm": 0.606178387164357, "learning_rate": 0.000997118211875886, "loss": 1.9762, "step": 345 }, { "epoch": 0.132890365448505, "grad_norm": 0.3588634601930927, "learning_rate": 0.0009967518523952875, "loss": 1.9911, "step": 350 }, { "epoch": 0.13478879924062648, "grad_norm": 0.28091052944488715, "learning_rate": 0.0009963636531710228, "loss": 1.9776, "step": 355 }, { "epoch": 0.136687233032748, "grad_norm": 0.38172615236997626, "learning_rate": 0.0009959536312703085, "loss": 1.9787, "step": 360 }, { "epoch": 0.13858566682486947, "grad_norm": 0.4360166845102846, "learning_rate": 0.0009955218047197978, "loss": 1.9909, "step": 365 }, { "epoch": 0.140484100616991, "grad_norm": 0.35275497258123356, "learning_rate": 0.000995068192504787, "loss": 1.9889, "step": 370 }, { "epoch": 0.14238253440911247, "grad_norm": 0.6317060282966646, "learning_rate": 0.0009945928145683814, "loss": 1.9551, "step": 375 }, { "epoch": 0.14428096820123398, "grad_norm": 0.3850672622738674, "learning_rate": 0.0009940956918106183, "loss": 1.9585, "step": 380 }, { "epoch": 0.1461794019933555, "grad_norm": 0.4220457511407704, "learning_rate": 0.0009935768460875483, "loss": 1.951, "step": 385 }, { "epoch": 0.14807783578547698, "grad_norm": 0.7783987596579233, "learning_rate": 0.0009930363002102743, "loss": 1.9724, "step": 390 }, { "epoch": 0.1499762695775985, "grad_norm": 0.32401219109209967, "learning_rate": 0.0009924740779439483, "loss": 1.9667, "step": 395 }, { "epoch": 0.15187470336971998, "grad_norm": 0.2857979164581651, "learning_rate": 0.0009918902040067276, "loss": 1.9462, "step": 400 }, { "epoch": 0.1537731371618415, "grad_norm": 0.6210777091831657, "learning_rate": 0.000991284704068686, "loss": 1.9618, "step": 405 }, { "epoch": 0.15567157095396297, "grad_norm": 0.4226037865904592, "learning_rate": 0.0009906576047506884, "loss": 1.9602, "step": 410 }, { "epoch": 0.15757000474608449, "grad_norm": 0.2508966539644492, "learning_rate": 0.0009900089336232166, "loss": 1.9518, "step": 415 }, { "epoch": 0.15946843853820597, "grad_norm": 0.28625033438978126, "learning_rate": 0.0009893387192051607, "loss": 1.937, "step": 420 }, { "epoch": 0.16136687233032748, "grad_norm": 0.5695158778386894, "learning_rate": 0.0009886469909625624, "loss": 1.9356, "step": 425 }, { "epoch": 0.16326530612244897, "grad_norm": 0.3444114501804719, "learning_rate": 0.0009879337793073219, "loss": 1.9292, "step": 430 }, { "epoch": 0.16516373991457048, "grad_norm": 0.3562145151770506, "learning_rate": 0.000987199115595859, "loss": 1.9446, "step": 435 }, { "epoch": 0.167062173706692, "grad_norm": 0.6662044045300556, "learning_rate": 0.0009864430321277354, "loss": 1.9123, "step": 440 }, { "epoch": 0.16896060749881348, "grad_norm": 0.4008170455675707, "learning_rate": 0.0009856655621442347, "loss": 1.9153, "step": 445 }, { "epoch": 0.170859041290935, "grad_norm": 0.5501520320546401, "learning_rate": 0.0009848667398269005, "loss": 1.9134, "step": 450 }, { "epoch": 0.17275747508305647, "grad_norm": 0.6229887015853178, "learning_rate": 0.000984046600296034, "loss": 1.9161, "step": 455 }, { "epoch": 0.17465590887517798, "grad_norm": 0.3263959879918121, "learning_rate": 0.0009832051796091496, "loss": 1.8977, "step": 460 }, { "epoch": 0.17655434266729947, "grad_norm": 0.32030075439432676, "learning_rate": 0.00098234251475939, "loss": 1.9385, "step": 465 }, { "epoch": 0.17845277645942098, "grad_norm": 0.3241713798628184, "learning_rate": 0.0009814586436738997, "loss": 1.8839, "step": 470 }, { "epoch": 0.18035121025154247, "grad_norm": 0.710599959000179, "learning_rate": 0.0009805536052121568, "loss": 1.8957, "step": 475 }, { "epoch": 0.18224964404366398, "grad_norm": 0.5685009329132018, "learning_rate": 0.000979627439164266, "loss": 1.9039, "step": 480 }, { "epoch": 0.1841480778357855, "grad_norm": 0.2883177054081686, "learning_rate": 0.0009786801862492075, "loss": 1.8939, "step": 485 }, { "epoch": 0.18604651162790697, "grad_norm": 0.3306718558813855, "learning_rate": 0.0009777118881130484, "loss": 1.8871, "step": 490 }, { "epoch": 0.1879449454200285, "grad_norm": 0.27771057670095894, "learning_rate": 0.000976722587327111, "loss": 1.8709, "step": 495 }, { "epoch": 0.18984337921214997, "grad_norm": 0.39767226694729324, "learning_rate": 0.0009757123273861006, "loss": 1.8707, "step": 500 }, { "epoch": 0.19174181300427148, "grad_norm": 0.3486146634838498, "learning_rate": 0.0009746811527061942, "loss": 1.854, "step": 505 }, { "epoch": 0.19364024679639297, "grad_norm": 0.4080559390296307, "learning_rate": 0.000973629108623087, "loss": 1.8698, "step": 510 }, { "epoch": 0.19553868058851448, "grad_norm": 0.33081602292257706, "learning_rate": 0.0009725562413900002, "loss": 1.8464, "step": 515 }, { "epoch": 0.19743711438063596, "grad_norm": 0.6037306322904349, "learning_rate": 0.000971462598175646, "loss": 1.8439, "step": 520 }, { "epoch": 0.19933554817275748, "grad_norm": 0.2879376497544211, "learning_rate": 0.0009703482270621553, "loss": 1.8713, "step": 525 }, { "epoch": 0.201233981964879, "grad_norm": 0.6788567524588369, "learning_rate": 0.0009692131770429629, "loss": 1.8453, "step": 530 }, { "epoch": 0.20313241575700047, "grad_norm": 0.3161414574102799, "learning_rate": 0.0009680574980206534, "loss": 1.8594, "step": 535 }, { "epoch": 0.20503084954912199, "grad_norm": 0.5645515053171333, "learning_rate": 0.0009668812408047678, "loss": 1.8492, "step": 540 }, { "epoch": 0.20692928334124347, "grad_norm": 0.5534701462686209, "learning_rate": 0.0009656844571095696, "loss": 1.8465, "step": 545 }, { "epoch": 0.20882771713336498, "grad_norm": 0.2769884694724677, "learning_rate": 0.0009644671995517705, "loss": 1.8512, "step": 550 }, { "epoch": 0.21072615092548647, "grad_norm": 0.3389677879460361, "learning_rate": 0.0009632295216482181, "loss": 1.8338, "step": 555 }, { "epoch": 0.21262458471760798, "grad_norm": 0.3665481797512662, "learning_rate": 0.000961971477813542, "loss": 1.818, "step": 560 }, { "epoch": 0.21452301850972946, "grad_norm": 0.5102461937389484, "learning_rate": 0.0009606931233577622, "loss": 1.8304, "step": 565 }, { "epoch": 0.21642145230185098, "grad_norm": 0.31696862342234344, "learning_rate": 0.0009593945144838571, "loss": 1.8491, "step": 570 }, { "epoch": 0.21831988609397246, "grad_norm": 0.474051209621524, "learning_rate": 0.0009580757082852929, "loss": 1.7842, "step": 575 }, { "epoch": 0.22021831988609397, "grad_norm": 0.28232130793234095, "learning_rate": 0.0009567367627435121, "loss": 1.8143, "step": 580 }, { "epoch": 0.22211675367821548, "grad_norm": 0.3429346941858972, "learning_rate": 0.0009553777367253867, "loss": 1.8144, "step": 585 }, { "epoch": 0.22401518747033697, "grad_norm": 0.4168698468178576, "learning_rate": 0.0009539986899806281, "loss": 1.8069, "step": 590 }, { "epoch": 0.22591362126245848, "grad_norm": 0.2584426546217752, "learning_rate": 0.0009525996831391607, "loss": 1.8121, "step": 595 }, { "epoch": 0.22781205505457996, "grad_norm": 0.27734758082061345, "learning_rate": 0.0009511807777084571, "loss": 1.8144, "step": 600 }, { "epoch": 0.22971048884670148, "grad_norm": 0.29149160986727496, "learning_rate": 0.0009497420360708331, "loss": 1.8072, "step": 605 }, { "epoch": 0.23160892263882296, "grad_norm": 0.27795468706904236, "learning_rate": 0.0009482835214807049, "loss": 1.7843, "step": 610 }, { "epoch": 0.23350735643094447, "grad_norm": 0.37668313657202124, "learning_rate": 0.0009468052980618091, "loss": 1.7858, "step": 615 }, { "epoch": 0.23540579022306596, "grad_norm": 0.3635355026922671, "learning_rate": 0.0009453074308043822, "loss": 1.7828, "step": 620 }, { "epoch": 0.23730422401518747, "grad_norm": 0.5492319693073675, "learning_rate": 0.0009437899855623046, "loss": 1.7805, "step": 625 }, { "epoch": 0.23920265780730898, "grad_norm": 0.5045520053744543, "learning_rate": 0.0009422530290502045, "loss": 1.784, "step": 630 }, { "epoch": 0.24110109159943047, "grad_norm": 0.29163131272581627, "learning_rate": 0.0009406966288405248, "loss": 1.7719, "step": 635 }, { "epoch": 0.24299952539155198, "grad_norm": 0.48505663312724284, "learning_rate": 0.0009391208533605527, "loss": 1.7866, "step": 640 }, { "epoch": 0.24489795918367346, "grad_norm": 0.5368154913313934, "learning_rate": 0.0009375257718894107, "loss": 1.7909, "step": 645 }, { "epoch": 0.24679639297579498, "grad_norm": 0.35856621494537494, "learning_rate": 0.0009359114545550116, "loss": 1.7504, "step": 650 }, { "epoch": 0.24869482676791646, "grad_norm": 0.3659739590782703, "learning_rate": 0.0009342779723309745, "loss": 1.7605, "step": 655 }, { "epoch": 0.25059326056003794, "grad_norm": 0.511785026274882, "learning_rate": 0.0009326253970335046, "loss": 1.7158, "step": 660 }, { "epoch": 0.25249169435215946, "grad_norm": 0.30511971139113303, "learning_rate": 0.0009309538013182364, "loss": 1.7637, "step": 665 }, { "epoch": 0.25439012814428097, "grad_norm": 0.28292845629768315, "learning_rate": 0.0009292632586770384, "loss": 1.7601, "step": 670 }, { "epoch": 0.2562885619364025, "grad_norm": 0.24747199414214469, "learning_rate": 0.000927553843434783, "loss": 1.7477, "step": 675 }, { "epoch": 0.258186995728524, "grad_norm": 0.30515982384727836, "learning_rate": 0.0009258256307460781, "loss": 1.757, "step": 680 }, { "epoch": 0.26008542952064545, "grad_norm": 0.4086074740784753, "learning_rate": 0.000924078696591963, "loss": 1.7437, "step": 685 }, { "epoch": 0.26198386331276696, "grad_norm": 0.3606712756700905, "learning_rate": 0.0009223131177765685, "loss": 1.7156, "step": 690 }, { "epoch": 0.2638822971048885, "grad_norm": 0.31644420804415135, "learning_rate": 0.0009205289719237393, "loss": 1.738, "step": 695 }, { "epoch": 0.26578073089701, "grad_norm": 0.47740932792504026, "learning_rate": 0.0009187263374736221, "loss": 1.7548, "step": 700 }, { "epoch": 0.26767916468913144, "grad_norm": 0.2710760174578327, "learning_rate": 0.0009169052936792164, "loss": 1.7425, "step": 705 }, { "epoch": 0.26957759848125296, "grad_norm": 0.27636232385404463, "learning_rate": 0.0009150659206028904, "loss": 1.7267, "step": 710 }, { "epoch": 0.27147603227337447, "grad_norm": 0.4625279056619356, "learning_rate": 0.0009132082991128607, "loss": 1.7387, "step": 715 }, { "epoch": 0.273374466065496, "grad_norm": 4.000898556896865, "learning_rate": 0.0009113325108796374, "loss": 1.745, "step": 720 }, { "epoch": 0.2752728998576175, "grad_norm": 0.38141390080270937, "learning_rate": 0.0009094386383724332, "loss": 1.7522, "step": 725 }, { "epoch": 0.27717133364973895, "grad_norm": 0.35177413794126794, "learning_rate": 0.0009075267648555378, "loss": 1.6868, "step": 730 }, { "epoch": 0.27906976744186046, "grad_norm": 0.8220268286150999, "learning_rate": 0.000905596974384657, "loss": 1.8246, "step": 735 }, { "epoch": 0.280968201233982, "grad_norm": 0.8599631648767855, "learning_rate": 0.0009036493518032172, "loss": 1.738, "step": 740 }, { "epoch": 0.2828666350261035, "grad_norm": 1.0678364091016483, "learning_rate": 0.0009016839827386351, "loss": 1.7384, "step": 745 }, { "epoch": 0.28476506881822494, "grad_norm": 1.8720283645771043, "learning_rate": 0.0008997009535985534, "loss": 1.8852, "step": 750 }, { "epoch": 0.28666350261034645, "grad_norm": 1.1071369931471309, "learning_rate": 0.0008977003515670418, "loss": 1.7642, "step": 755 }, { "epoch": 0.28856193640246797, "grad_norm": 0.9209513939367386, "learning_rate": 0.0008956822646007639, "loss": 1.8114, "step": 760 }, { "epoch": 0.2904603701945895, "grad_norm": 0.4031852696574249, "learning_rate": 0.0008936467814251102, "loss": 1.749, "step": 765 }, { "epoch": 0.292358803986711, "grad_norm": 0.482604929605048, "learning_rate": 0.0008915939915302969, "loss": 1.6956, "step": 770 }, { "epoch": 0.29425723777883245, "grad_norm": 0.31688589446500226, "learning_rate": 0.000889523985167432, "loss": 1.7137, "step": 775 }, { "epoch": 0.29615567157095396, "grad_norm": 0.2662426574337696, "learning_rate": 0.0008874368533445476, "loss": 1.716, "step": 780 }, { "epoch": 0.29805410536307547, "grad_norm": 0.30925480824258833, "learning_rate": 0.0008853326878225978, "loss": 1.6941, "step": 785 }, { "epoch": 0.299952539155197, "grad_norm": 0.3332600934262706, "learning_rate": 0.000883211581111425, "loss": 1.673, "step": 790 }, { "epoch": 0.30185097294731844, "grad_norm": 0.4173857531935143, "learning_rate": 0.0008810736264656929, "loss": 1.677, "step": 795 }, { "epoch": 0.30374940673943995, "grad_norm": 0.3528547476107368, "learning_rate": 0.0008789189178807862, "loss": 1.6878, "step": 800 }, { "epoch": 0.30564784053156147, "grad_norm": 0.35241925687665077, "learning_rate": 0.0008767475500886777, "loss": 1.6895, "step": 805 }, { "epoch": 0.307546274323683, "grad_norm": 0.3235829351373288, "learning_rate": 0.0008745596185537648, "loss": 1.6903, "step": 810 }, { "epoch": 0.30944470811580443, "grad_norm": 0.2948128495705965, "learning_rate": 0.0008723552194686705, "loss": 1.6609, "step": 815 }, { "epoch": 0.31134314190792595, "grad_norm": 0.36422304971890607, "learning_rate": 0.0008701344497500159, "loss": 1.6614, "step": 820 }, { "epoch": 0.31324157570004746, "grad_norm": 0.2825304340578671, "learning_rate": 0.0008678974070341584, "loss": 1.6769, "step": 825 }, { "epoch": 0.31514000949216897, "grad_norm": 0.26538936909594424, "learning_rate": 0.0008656441896728994, "loss": 1.6485, "step": 830 }, { "epoch": 0.3170384432842905, "grad_norm": 0.31905462352993175, "learning_rate": 0.0008633748967291598, "loss": 1.6482, "step": 835 }, { "epoch": 0.31893687707641194, "grad_norm": 0.39448662981546256, "learning_rate": 0.0008610896279726255, "loss": 1.6259, "step": 840 }, { "epoch": 0.32083531086853345, "grad_norm": 0.31299399145300977, "learning_rate": 0.0008587884838753603, "loss": 1.6398, "step": 845 }, { "epoch": 0.32273374466065496, "grad_norm": 0.3247022426656648, "learning_rate": 0.0008564715656073893, "loss": 1.6501, "step": 850 }, { "epoch": 0.3246321784527765, "grad_norm": 0.2859709456607949, "learning_rate": 0.0008541389750322498, "loss": 1.6268, "step": 855 }, { "epoch": 0.32653061224489793, "grad_norm": 0.28129496647987356, "learning_rate": 0.0008517908147025143, "loss": 1.6085, "step": 860 }, { "epoch": 0.32842904603701945, "grad_norm": 0.3456433785224482, "learning_rate": 0.0008494271878552808, "loss": 1.6285, "step": 865 }, { "epoch": 0.33032747982914096, "grad_norm": 0.49566297259294595, "learning_rate": 0.0008470481984076344, "loss": 1.6269, "step": 870 }, { "epoch": 0.33222591362126247, "grad_norm": 0.34226796976284596, "learning_rate": 0.000844653950952078, "loss": 1.5934, "step": 875 }, { "epoch": 0.334124347413384, "grad_norm": 0.2874518927936025, "learning_rate": 0.000842244550751935, "loss": 1.6233, "step": 880 }, { "epoch": 0.33602278120550544, "grad_norm": 0.27909933392041014, "learning_rate": 0.0008398201037367201, "loss": 1.5986, "step": 885 }, { "epoch": 0.33792121499762695, "grad_norm": 0.2755722277474001, "learning_rate": 0.0008373807164974832, "loss": 1.5831, "step": 890 }, { "epoch": 0.33981964878974846, "grad_norm": 0.2647419405762948, "learning_rate": 0.0008349264962821219, "loss": 1.5928, "step": 895 }, { "epoch": 0.34171808258187, "grad_norm": 0.34636953235762286, "learning_rate": 0.0008324575509906677, "loss": 1.5908, "step": 900 }, { "epoch": 0.34361651637399143, "grad_norm": 0.3152387214237848, "learning_rate": 0.0008299739891705413, "loss": 1.5952, "step": 905 }, { "epoch": 0.34551495016611294, "grad_norm": 0.2654394015701188, "learning_rate": 0.0008274759200117803, "loss": 1.5985, "step": 910 }, { "epoch": 0.34741338395823446, "grad_norm": 0.6103144176291811, "learning_rate": 0.0008249634533422392, "loss": 1.5877, "step": 915 }, { "epoch": 0.34931181775035597, "grad_norm": 0.3224755104739263, "learning_rate": 0.0008224366996227604, "loss": 1.5771, "step": 920 }, { "epoch": 0.3512102515424775, "grad_norm": 0.5837396991238525, "learning_rate": 0.0008198957699423175, "loss": 1.5843, "step": 925 }, { "epoch": 0.35310868533459894, "grad_norm": 0.32548897062944837, "learning_rate": 0.000817340776013132, "loss": 1.5851, "step": 930 }, { "epoch": 0.35500711912672045, "grad_norm": 0.31829180968967913, "learning_rate": 0.0008147718301657612, "loss": 1.5942, "step": 935 }, { "epoch": 0.35690555291884196, "grad_norm": 0.2731850877543467, "learning_rate": 0.0008121890453441602, "loss": 1.5694, "step": 940 }, { "epoch": 0.3588039867109635, "grad_norm": 0.24810235775271594, "learning_rate": 0.0008095925351007156, "loss": 1.559, "step": 945 }, { "epoch": 0.36070242050308493, "grad_norm": 0.28169507454726866, "learning_rate": 0.0008069824135912536, "loss": 1.5638, "step": 950 }, { "epoch": 0.36260085429520644, "grad_norm": 0.24752038600316323, "learning_rate": 0.0008043587955700211, "loss": 1.5718, "step": 955 }, { "epoch": 0.36449928808732796, "grad_norm": 0.27876047342351895, "learning_rate": 0.0008017217963846403, "loss": 1.5501, "step": 960 }, { "epoch": 0.36639772187944947, "grad_norm": 0.40897383234845613, "learning_rate": 0.0007990715319710381, "loss": 1.5306, "step": 965 }, { "epoch": 0.368296155671571, "grad_norm": 0.2858525738953562, "learning_rate": 0.0007964081188483476, "loss": 1.5283, "step": 970 }, { "epoch": 0.37019458946369244, "grad_norm": 0.26144274136184015, "learning_rate": 0.0007937316741137871, "loss": 1.5446, "step": 975 }, { "epoch": 0.37209302325581395, "grad_norm": 0.32273246789256727, "learning_rate": 0.0007910423154375101, "loss": 1.5672, "step": 980 }, { "epoch": 0.37399145704793546, "grad_norm": 0.3490759385530748, "learning_rate": 0.0007883401610574337, "loss": 1.5401, "step": 985 }, { "epoch": 0.375889890840057, "grad_norm": 0.28213557009744067, "learning_rate": 0.0007856253297740383, "loss": 1.5533, "step": 990 }, { "epoch": 0.37778832463217843, "grad_norm": 0.33445422182184414, "learning_rate": 0.0007828979409451468, "loss": 1.5361, "step": 995 }, { "epoch": 0.37968675842429994, "grad_norm": 0.3535301287121482, "learning_rate": 0.0007801581144806751, "loss": 1.5189, "step": 1000 }, { "epoch": 0.38158519221642145, "grad_norm": 0.3391675710324959, "learning_rate": 0.0007774059708373606, "loss": 1.5301, "step": 1005 }, { "epoch": 0.38348362600854297, "grad_norm": 0.6452174423374145, "learning_rate": 0.0007746416310134679, "loss": 1.5214, "step": 1010 }, { "epoch": 0.3853820598006645, "grad_norm": 0.43341843816494785, "learning_rate": 0.0007718652165434664, "loss": 1.5235, "step": 1015 }, { "epoch": 0.38728049359278593, "grad_norm": 0.3003373383959457, "learning_rate": 0.0007690768494926897, "loss": 1.5063, "step": 1020 }, { "epoch": 0.38917892738490745, "grad_norm": 0.3078153548021384, "learning_rate": 0.000766276652451967, "loss": 1.5174, "step": 1025 }, { "epoch": 0.39107736117702896, "grad_norm": 0.4798286034200326, "learning_rate": 0.0007634647485322347, "loss": 1.4958, "step": 1030 }, { "epoch": 0.39297579496915047, "grad_norm": 0.2962615700403029, "learning_rate": 0.0007606412613591228, "loss": 1.5187, "step": 1035 }, { "epoch": 0.39487422876127193, "grad_norm": 0.3904932019460365, "learning_rate": 0.0007578063150675206, "loss": 1.4828, "step": 1040 }, { "epoch": 0.39677266255339344, "grad_norm": 0.31811472424437215, "learning_rate": 0.0007549600342961183, "loss": 1.4752, "step": 1045 }, { "epoch": 0.39867109634551495, "grad_norm": 0.4803758991042241, "learning_rate": 0.0007521025441819278, "loss": 1.4969, "step": 1050 }, { "epoch": 0.40056953013763646, "grad_norm": 0.472851525192678, "learning_rate": 0.0007492339703547808, "loss": 1.492, "step": 1055 }, { "epoch": 0.402467963929758, "grad_norm": 0.3908915345807985, "learning_rate": 0.0007463544389318058, "loss": 1.4682, "step": 1060 }, { "epoch": 0.40436639772187943, "grad_norm": 0.3886806997641989, "learning_rate": 0.0007434640765118824, "loss": 1.4759, "step": 1065 }, { "epoch": 0.40626483151400095, "grad_norm": 0.5234433759967211, "learning_rate": 0.0007405630101700769, "loss": 1.4779, "step": 1070 }, { "epoch": 0.40816326530612246, "grad_norm": 0.3607186516700363, "learning_rate": 0.0007376513674520537, "loss": 1.4894, "step": 1075 }, { "epoch": 0.41006169909824397, "grad_norm": 0.3422699133158823, "learning_rate": 0.0007347292763684691, "loss": 1.4713, "step": 1080 }, { "epoch": 0.4119601328903654, "grad_norm": 0.456283403837333, "learning_rate": 0.000731796865389343, "loss": 1.4786, "step": 1085 }, { "epoch": 0.41385856668248694, "grad_norm": 0.29907259245021056, "learning_rate": 0.0007288542634384101, "loss": 1.4428, "step": 1090 }, { "epoch": 0.41575700047460845, "grad_norm": 0.4065968463089126, "learning_rate": 0.0007259015998874521, "loss": 1.4551, "step": 1095 }, { "epoch": 0.41765543426672996, "grad_norm": 0.2851166065497382, "learning_rate": 0.0007229390045506107, "loss": 1.4726, "step": 1100 }, { "epoch": 0.4195538680588514, "grad_norm": 0.34495921420404607, "learning_rate": 0.0007199666076786786, "loss": 1.4491, "step": 1105 }, { "epoch": 0.42145230185097293, "grad_norm": 0.2657939111249429, "learning_rate": 0.0007169845399533742, "loss": 1.4362, "step": 1110 }, { "epoch": 0.42335073564309444, "grad_norm": 0.4937795186011948, "learning_rate": 0.0007139929324815965, "loss": 1.4407, "step": 1115 }, { "epoch": 0.42524916943521596, "grad_norm": 0.4407673687752531, "learning_rate": 0.0007109919167896597, "loss": 1.4319, "step": 1120 }, { "epoch": 0.42714760322733747, "grad_norm": 0.5300468580153148, "learning_rate": 0.0007079816248175114, "loss": 1.4275, "step": 1125 }, { "epoch": 0.4290460370194589, "grad_norm": 0.3241335090242318, "learning_rate": 0.000704962188912932, "loss": 1.4426, "step": 1130 }, { "epoch": 0.43094447081158044, "grad_norm": 0.3968224596845394, "learning_rate": 0.0007019337418257159, "loss": 1.4398, "step": 1135 }, { "epoch": 0.43284290460370195, "grad_norm": 0.3781038122854968, "learning_rate": 0.0006988964167018346, "loss": 1.4298, "step": 1140 }, { "epoch": 0.43474133839582346, "grad_norm": 0.2842174258844352, "learning_rate": 0.0006958503470775836, "loss": 1.4358, "step": 1145 }, { "epoch": 0.4366397721879449, "grad_norm": 0.5316938383381906, "learning_rate": 0.0006927956668737115, "loss": 1.4152, "step": 1150 }, { "epoch": 0.43853820598006643, "grad_norm": 0.26129078645372833, "learning_rate": 0.000689732510389531, "loss": 1.4117, "step": 1155 }, { "epoch": 0.44043663977218794, "grad_norm": 0.45097710133639385, "learning_rate": 0.0006866610122970162, "loss": 1.3814, "step": 1160 }, { "epoch": 0.44233507356430946, "grad_norm": 0.34768384716485046, "learning_rate": 0.0006835813076348805, "loss": 1.409, "step": 1165 }, { "epoch": 0.44423350735643097, "grad_norm": 0.6973633265305714, "learning_rate": 0.0006804935318026396, "loss": 1.4211, "step": 1170 }, { "epoch": 0.4461319411485524, "grad_norm": 0.4342853868362056, "learning_rate": 0.0006773978205546597, "loss": 1.4037, "step": 1175 }, { "epoch": 0.44803037494067394, "grad_norm": 0.46738592482568664, "learning_rate": 0.0006742943099941876, "loss": 1.4309, "step": 1180 }, { "epoch": 0.44992880873279545, "grad_norm": 0.25387391273809895, "learning_rate": 0.000671183136567368, "loss": 1.3923, "step": 1185 }, { "epoch": 0.45182724252491696, "grad_norm": 0.46112835773656796, "learning_rate": 0.0006680644370572444, "loss": 1.4028, "step": 1190 }, { "epoch": 0.4537256763170384, "grad_norm": 0.2551607349467472, "learning_rate": 0.0006649383485777449, "loss": 1.3815, "step": 1195 }, { "epoch": 0.45562411010915993, "grad_norm": 0.37310127138077576, "learning_rate": 0.0006618050085676546, "loss": 1.3911, "step": 1200 }, { "epoch": 0.45752254390128144, "grad_norm": 0.4254847053830724, "learning_rate": 0.0006586645547845729, "loss": 1.3929, "step": 1205 }, { "epoch": 0.45942097769340295, "grad_norm": 0.7687175874517974, "learning_rate": 0.0006555171252988568, "loss": 1.3733, "step": 1210 }, { "epoch": 0.46131941148552447, "grad_norm": 0.3146929888691999, "learning_rate": 0.0006523628584875507, "loss": 1.3711, "step": 1215 }, { "epoch": 0.4632178452776459, "grad_norm": 0.27183821399250013, "learning_rate": 0.0006492018930283026, "loss": 1.3455, "step": 1220 }, { "epoch": 0.46511627906976744, "grad_norm": 0.31617364418439614, "learning_rate": 0.000646034367893268, "loss": 1.3619, "step": 1225 }, { "epoch": 0.46701471286188895, "grad_norm": 0.3833518263216662, "learning_rate": 0.0006428604223429979, "loss": 1.3452, "step": 1230 }, { "epoch": 0.46891314665401046, "grad_norm": 0.3267810331058041, "learning_rate": 0.0006396801959203186, "loss": 1.3627, "step": 1235 }, { "epoch": 0.4708115804461319, "grad_norm": 0.34041494057204974, "learning_rate": 0.0006364938284441949, "loss": 1.3418, "step": 1240 }, { "epoch": 0.47271001423825343, "grad_norm": 0.42237475709823624, "learning_rate": 0.0006333014600035838, "loss": 1.3594, "step": 1245 }, { "epoch": 0.47460844803037494, "grad_norm": 0.4449414736784289, "learning_rate": 0.0006301032309512754, "loss": 1.3293, "step": 1250 }, { "epoch": 0.47650688182249645, "grad_norm": 0.3046879446109181, "learning_rate": 0.0006268992818977221, "loss": 1.3438, "step": 1255 }, { "epoch": 0.47840531561461797, "grad_norm": 0.3963369420982114, "learning_rate": 0.0006236897537048566, "loss": 1.3418, "step": 1260 }, { "epoch": 0.4803037494067394, "grad_norm": 0.449251264603866, "learning_rate": 0.0006204747874798993, "loss": 1.3443, "step": 1265 }, { "epoch": 0.48220218319886093, "grad_norm": 0.30313300002195165, "learning_rate": 0.0006172545245691538, "loss": 1.3414, "step": 1270 }, { "epoch": 0.48410061699098245, "grad_norm": 0.33417145989869673, "learning_rate": 0.0006140291065517931, "loss": 1.3179, "step": 1275 }, { "epoch": 0.48599905078310396, "grad_norm": 0.3904784303862518, "learning_rate": 0.0006107986752336357, "loss": 1.3411, "step": 1280 }, { "epoch": 0.4878974845752254, "grad_norm": 0.3222639679317221, "learning_rate": 0.0006075633726409091, "loss": 1.3086, "step": 1285 }, { "epoch": 0.4897959183673469, "grad_norm": 0.3288048359935024, "learning_rate": 0.0006043233410140076, "loss": 1.3305, "step": 1290 }, { "epoch": 0.49169435215946844, "grad_norm": 0.2937886282400547, "learning_rate": 0.0006010787228012384, "loss": 1.3029, "step": 1295 }, { "epoch": 0.49359278595158995, "grad_norm": 0.29213405752561167, "learning_rate": 0.0005978296606525572, "loss": 1.3161, "step": 1300 }, { "epoch": 0.49549121974371146, "grad_norm": 0.34322670276805756, "learning_rate": 0.0005945762974132986, "loss": 1.3281, "step": 1305 }, { "epoch": 0.4973896535358329, "grad_norm": 0.28223010113720515, "learning_rate": 0.0005913187761178951, "loss": 1.2943, "step": 1310 }, { "epoch": 0.49928808732795443, "grad_norm": 0.29364231136684454, "learning_rate": 0.0005880572399835881, "loss": 1.2988, "step": 1315 }, { "epoch": 0.5011865211200759, "grad_norm": 0.3396537753038387, "learning_rate": 0.0005847918324041324, "loss": 1.3053, "step": 1320 }, { "epoch": 0.5030849549121974, "grad_norm": 0.3515005890654536, "learning_rate": 0.0005815226969434903, "loss": 1.2881, "step": 1325 }, { "epoch": 0.5049833887043189, "grad_norm": 0.32308285051579544, "learning_rate": 0.0005782499773295219, "loss": 1.31, "step": 1330 }, { "epoch": 0.5068818224964404, "grad_norm": 0.32615560955136413, "learning_rate": 0.0005749738174476639, "loss": 1.2659, "step": 1335 }, { "epoch": 0.5087802562885619, "grad_norm": 0.3566565149320192, "learning_rate": 0.0005716943613346059, "loss": 1.2746, "step": 1340 }, { "epoch": 0.5106786900806835, "grad_norm": 0.4244945485487363, "learning_rate": 0.0005684117531719551, "loss": 1.2741, "step": 1345 }, { "epoch": 0.512577123872805, "grad_norm": 0.31519325513997737, "learning_rate": 0.0005651261372799002, "loss": 1.2773, "step": 1350 }, { "epoch": 0.5144755576649265, "grad_norm": 0.4339192027972204, "learning_rate": 0.0005618376581108647, "loss": 1.2669, "step": 1355 }, { "epoch": 0.516373991457048, "grad_norm": 0.3419108710774503, "learning_rate": 0.0005585464602431556, "loss": 1.2508, "step": 1360 }, { "epoch": 0.5182724252491694, "grad_norm": 0.38978151919816467, "learning_rate": 0.0005552526883746087, "loss": 1.2745, "step": 1365 }, { "epoch": 0.5201708590412909, "grad_norm": 0.4895534717498465, "learning_rate": 0.0005519564873162257, "loss": 1.2461, "step": 1370 }, { "epoch": 0.5220692928334124, "grad_norm": 0.4461111757267344, "learning_rate": 0.0005486580019858075, "loss": 1.2704, "step": 1375 }, { "epoch": 0.5239677266255339, "grad_norm": 0.32031503363419567, "learning_rate": 0.0005453573774015837, "loss": 1.2638, "step": 1380 }, { "epoch": 0.5258661604176554, "grad_norm": 0.3643504769224511, "learning_rate": 0.0005420547586758364, "loss": 1.2595, "step": 1385 }, { "epoch": 0.527764594209777, "grad_norm": 0.32810368902662324, "learning_rate": 0.0005387502910085201, "loss": 1.2326, "step": 1390 }, { "epoch": 0.5296630280018985, "grad_norm": 0.3336242397751466, "learning_rate": 0.0005354441196808778, "loss": 1.2553, "step": 1395 }, { "epoch": 0.53156146179402, "grad_norm": 0.4348846619107913, "learning_rate": 0.000532136390049055, "loss": 1.2457, "step": 1400 }, { "epoch": 0.5334598955861415, "grad_norm": 0.3940477704170248, "learning_rate": 0.0005288272475377078, "loss": 1.2286, "step": 1405 }, { "epoch": 0.5353583293782629, "grad_norm": 0.3034212811156197, "learning_rate": 0.0005255168376336094, "loss": 1.209, "step": 1410 }, { "epoch": 0.5372567631703844, "grad_norm": 0.4307385167019054, "learning_rate": 0.0005222053058792543, "loss": 1.2476, "step": 1415 }, { "epoch": 0.5391551969625059, "grad_norm": 0.34543357053441875, "learning_rate": 0.0005188927978664594, "loss": 1.2149, "step": 1420 }, { "epoch": 0.5410536307546274, "grad_norm": 0.3611941956078568, "learning_rate": 0.0005155794592299626, "loss": 1.2407, "step": 1425 }, { "epoch": 0.5429520645467489, "grad_norm": 0.3375414745847685, "learning_rate": 0.0005122654356410205, "loss": 1.2412, "step": 1430 }, { "epoch": 0.5448504983388704, "grad_norm": 0.286479882843699, "learning_rate": 0.0005089508728010033, "loss": 1.2167, "step": 1435 }, { "epoch": 0.546748932130992, "grad_norm": 0.3385247369838371, "learning_rate": 0.0005056359164349902, "loss": 1.231, "step": 1440 }, { "epoch": 0.5486473659231135, "grad_norm": 0.30665827143092833, "learning_rate": 0.000502320712285361, "loss": 1.1893, "step": 1445 }, { "epoch": 0.550545799715235, "grad_norm": 0.3668246619215131, "learning_rate": 0.0004990054061053896, "loss": 1.1876, "step": 1450 }, { "epoch": 0.5524442335073564, "grad_norm": 0.3886963786863651, "learning_rate": 0.0004956901436528358, "loss": 1.1897, "step": 1455 }, { "epoch": 0.5543426672994779, "grad_norm": 0.33560559480266355, "learning_rate": 0.0004923750706835371, "loss": 1.2048, "step": 1460 }, { "epoch": 0.5562411010915994, "grad_norm": 0.35331444970790565, "learning_rate": 0.0004890603329449997, "loss": 1.1893, "step": 1465 }, { "epoch": 0.5581395348837209, "grad_norm": 0.31797300187844246, "learning_rate": 0.000485746076169992, "loss": 1.1653, "step": 1470 }, { "epoch": 0.5600379686758424, "grad_norm": 0.3697991594252187, "learning_rate": 0.00048243244607013654, "loss": 1.2023, "step": 1475 }, { "epoch": 0.561936402467964, "grad_norm": 0.5326439145387593, "learning_rate": 0.0004791195883295036, "loss": 1.1813, "step": 1480 }, { "epoch": 0.5638348362600855, "grad_norm": 0.371212386610569, "learning_rate": 0.0004758076485982076, "loss": 1.1702, "step": 1485 }, { "epoch": 0.565733270052207, "grad_norm": 0.3968881977650288, "learning_rate": 0.00047249677248600185, "loss": 1.1673, "step": 1490 }, { "epoch": 0.5676317038443285, "grad_norm": 2.9338387571685405, "learning_rate": 0.0004691871055558776, "loss": 1.1908, "step": 1495 }, { "epoch": 0.5695301376364499, "grad_norm": 0.49847319164775866, "learning_rate": 0.00046587879331766457, "loss": 1.1864, "step": 1500 }, { "epoch": 0.5714285714285714, "grad_norm": 0.4078724706306019, "learning_rate": 0.000462571981221633, "loss": 1.1663, "step": 1505 }, { "epoch": 0.5733270052206929, "grad_norm": 0.3228249575712461, "learning_rate": 0.0004592668146520994, "loss": 1.1702, "step": 1510 }, { "epoch": 0.5752254390128144, "grad_norm": 0.37770036266775847, "learning_rate": 0.00045596343892103443, "loss": 1.176, "step": 1515 }, { "epoch": 0.5771238728049359, "grad_norm": 0.3148160726043947, "learning_rate": 0.00045266199926167485, "loss": 1.1747, "step": 1520 }, { "epoch": 0.5790223065970574, "grad_norm": 0.3626958842571658, "learning_rate": 0.00044936264082213724, "loss": 1.143, "step": 1525 }, { "epoch": 0.580920740389179, "grad_norm": 0.3613417080127485, "learning_rate": 0.00044606550865903725, "loss": 1.1523, "step": 1530 }, { "epoch": 0.5828191741813005, "grad_norm": 0.36620946050669057, "learning_rate": 0.0004427707477311123, "loss": 1.1732, "step": 1535 }, { "epoch": 0.584717607973422, "grad_norm": 0.3418418756481974, "learning_rate": 0.000439478502892848, "loss": 1.1616, "step": 1540 }, { "epoch": 0.5866160417655434, "grad_norm": 0.33600686205594626, "learning_rate": 0.0004361889188881102, "loss": 1.1158, "step": 1545 }, { "epoch": 0.5885144755576649, "grad_norm": 0.3764722521246889, "learning_rate": 0.0004329021403437802, "loss": 1.154, "step": 1550 }, { "epoch": 0.5904129093497864, "grad_norm": 0.34240802002457055, "learning_rate": 0.000429618311763398, "loss": 1.139, "step": 1555 }, { "epoch": 0.5923113431419079, "grad_norm": 0.2902338020123924, "learning_rate": 0.00042633757752080727, "loss": 1.134, "step": 1560 }, { "epoch": 0.5942097769340294, "grad_norm": 0.3752689277444726, "learning_rate": 0.00042306008185380927, "loss": 1.1343, "step": 1565 }, { "epoch": 0.5961082107261509, "grad_norm": 0.31329902926972597, "learning_rate": 0.0004197859688578207, "loss": 1.1016, "step": 1570 }, { "epoch": 0.5980066445182725, "grad_norm": 0.3402332223232388, "learning_rate": 0.00041651538247953904, "loss": 1.1052, "step": 1575 }, { "epoch": 0.599905078310394, "grad_norm": 0.3781151786788464, "learning_rate": 0.0004132484665106135, "loss": 1.1191, "step": 1580 }, { "epoch": 0.6018035121025154, "grad_norm": 0.34137800960299325, "learning_rate": 0.0004099853645813235, "loss": 1.0947, "step": 1585 }, { "epoch": 0.6037019458946369, "grad_norm": 0.3136113067101357, "learning_rate": 0.00040672622015426363, "loss": 1.0856, "step": 1590 }, { "epoch": 0.6056003796867584, "grad_norm": 0.3358747164703956, "learning_rate": 0.00040347117651803703, "loss": 1.0948, "step": 1595 }, { "epoch": 0.6074988134788799, "grad_norm": 0.3020729530283223, "learning_rate": 0.00040022037678095454, "loss": 1.0802, "step": 1600 }, { "epoch": 0.6093972472710014, "grad_norm": 0.3904330186630717, "learning_rate": 0.00039697396386474394, "loss": 1.127, "step": 1605 }, { "epoch": 0.6112956810631229, "grad_norm": 0.4027533805204724, "learning_rate": 0.0003937320804982659, "loss": 1.1042, "step": 1610 }, { "epoch": 0.6131941148552444, "grad_norm": 0.3743040482483265, "learning_rate": 0.00039049486921123876, "loss": 1.0907, "step": 1615 }, { "epoch": 0.615092548647366, "grad_norm": 0.38060393763386724, "learning_rate": 0.000387262472327973, "loss": 1.1051, "step": 1620 }, { "epoch": 0.6169909824394875, "grad_norm": 0.34005085980267563, "learning_rate": 0.00038403503196111265, "loss": 1.0647, "step": 1625 }, { "epoch": 0.6188894162316089, "grad_norm": 0.30724882728663455, "learning_rate": 0.0003808126900053887, "loss": 1.0771, "step": 1630 }, { "epoch": 0.6207878500237304, "grad_norm": 0.306237705852109, "learning_rate": 0.0003775955881313797, "loss": 1.0831, "step": 1635 }, { "epoch": 0.6226862838158519, "grad_norm": 0.4486821484171895, "learning_rate": 0.0003743838677792833, "loss": 1.0767, "step": 1640 }, { "epoch": 0.6245847176079734, "grad_norm": 0.35675874358918835, "learning_rate": 0.0003711776701526982, "loss": 1.0841, "step": 1645 }, { "epoch": 0.6264831514000949, "grad_norm": 0.31338528181698055, "learning_rate": 0.00036797713621241615, "loss": 1.0851, "step": 1650 }, { "epoch": 0.6283815851922164, "grad_norm": 0.3687470434111214, "learning_rate": 0.000364782406670224, "loss": 1.0707, "step": 1655 }, { "epoch": 0.6302800189843379, "grad_norm": 0.31784199572514304, "learning_rate": 0.0003615936219827176, "loss": 1.0606, "step": 1660 }, { "epoch": 0.6321784527764595, "grad_norm": 0.42249881083466695, "learning_rate": 0.00035841092234512723, "loss": 1.0725, "step": 1665 }, { "epoch": 0.634076886568581, "grad_norm": 0.3459092803870884, "learning_rate": 0.0003552344476851531, "loss": 1.0574, "step": 1670 }, { "epoch": 0.6359753203607024, "grad_norm": 0.31341808911866365, "learning_rate": 0.00035206433765681334, "loss": 1.0399, "step": 1675 }, { "epoch": 0.6378737541528239, "grad_norm": 0.3619705130805933, "learning_rate": 0.00034890073163430503, "loss": 1.0295, "step": 1680 }, { "epoch": 0.6397721879449454, "grad_norm": 0.302930290181787, "learning_rate": 0.00034574376870587535, "loss": 1.0302, "step": 1685 }, { "epoch": 0.6416706217370669, "grad_norm": 0.32319117475207404, "learning_rate": 0.00034259358766770766, "loss": 1.0457, "step": 1690 }, { "epoch": 0.6435690555291884, "grad_norm": 0.35131830575911627, "learning_rate": 0.0003394503270178185, "loss": 1.0527, "step": 1695 }, { "epoch": 0.6454674893213099, "grad_norm": 0.35565194703093944, "learning_rate": 0.0003363141249499696, "loss": 1.0271, "step": 1700 }, { "epoch": 0.6473659231134314, "grad_norm": 0.3631695241713009, "learning_rate": 0.00033318511934759046, "loss": 1.044, "step": 1705 }, { "epoch": 0.649264356905553, "grad_norm": 0.3236231297783939, "learning_rate": 0.0003300634477777179, "loss": 1.0102, "step": 1710 }, { "epoch": 0.6511627906976745, "grad_norm": 0.3213681061725248, "learning_rate": 0.00032694924748494713, "loss": 1.0083, "step": 1715 }, { "epoch": 0.6530612244897959, "grad_norm": 0.6068864104166338, "learning_rate": 0.00032384265538539783, "loss": 0.9907, "step": 1720 }, { "epoch": 0.6549596582819174, "grad_norm": 0.41210521486813323, "learning_rate": 0.0003207438080606949, "loss": 1.0233, "step": 1725 }, { "epoch": 0.6568580920740389, "grad_norm": 0.46261139879081853, "learning_rate": 0.00031765284175196324, "loss": 1.0167, "step": 1730 }, { "epoch": 0.6587565258661604, "grad_norm": 0.3436327494287044, "learning_rate": 0.0003145698923538384, "loss": 1.0282, "step": 1735 }, { "epoch": 0.6606549596582819, "grad_norm": 0.35619308018504636, "learning_rate": 0.00031149509540849156, "loss": 0.99, "step": 1740 }, { "epoch": 0.6625533934504034, "grad_norm": 0.34309970855605526, "learning_rate": 0.0003084285860996704, "loss": 1.015, "step": 1745 }, { "epoch": 0.6644518272425249, "grad_norm": 0.393508338674986, "learning_rate": 0.0003053704992467558, "loss": 1.0116, "step": 1750 }, { "epoch": 0.6663502610346465, "grad_norm": 0.3397492909000668, "learning_rate": 0.0003023209692988349, "loss": 0.9983, "step": 1755 }, { "epoch": 0.668248694826768, "grad_norm": 0.30189558040021247, "learning_rate": 0.0002992801303287892, "loss": 1.0073, "step": 1760 }, { "epoch": 0.6701471286188894, "grad_norm": 0.33390373051452854, "learning_rate": 0.00029624811602740105, "loss": 0.9931, "step": 1765 }, { "epoch": 0.6720455624110109, "grad_norm": 0.3565911003751968, "learning_rate": 0.0002932250596974747, "loss": 0.9794, "step": 1770 }, { "epoch": 0.6739439962031324, "grad_norm": 0.30184049203060254, "learning_rate": 0.00029021109424797706, "loss": 0.9853, "step": 1775 }, { "epoch": 0.6758424299952539, "grad_norm": 0.2996692988493312, "learning_rate": 0.00028720635218819313, "loss": 0.9716, "step": 1780 }, { "epoch": 0.6777408637873754, "grad_norm": 0.3158967543323723, "learning_rate": 0.00028421096562190087, "loss": 0.971, "step": 1785 }, { "epoch": 0.6796392975794969, "grad_norm": 0.3259541143986902, "learning_rate": 0.00028122506624156287, "loss": 0.9667, "step": 1790 }, { "epoch": 0.6815377313716184, "grad_norm": 0.3050723057723714, "learning_rate": 0.00027824878532253675, "loss": 0.9698, "step": 1795 }, { "epoch": 0.68343616516374, "grad_norm": 0.3225081325517291, "learning_rate": 0.0002752822537173033, "loss": 0.9627, "step": 1800 }, { "epoch": 0.6853345989558615, "grad_norm": 0.3337812239040389, "learning_rate": 0.00027232560184971434, "loss": 0.9673, "step": 1805 }, { "epoch": 0.6872330327479829, "grad_norm": 0.4504964673114019, "learning_rate": 0.00026937895970925794, "loss": 0.9504, "step": 1810 }, { "epoch": 0.6891314665401044, "grad_norm": 0.6271532351806949, "learning_rate": 0.00026644245684534317, "loss": 0.9655, "step": 1815 }, { "epoch": 0.6910299003322259, "grad_norm": 0.42681861959807327, "learning_rate": 0.00026351622236160487, "loss": 0.9418, "step": 1820 }, { "epoch": 0.6929283341243474, "grad_norm": 0.4353835236159284, "learning_rate": 0.00026060038491022787, "loss": 0.9777, "step": 1825 }, { "epoch": 0.6948267679164689, "grad_norm": 0.3479809339329037, "learning_rate": 0.00025769507268628993, "loss": 0.9721, "step": 1830 }, { "epoch": 0.6967252017085904, "grad_norm": 0.31070434225155574, "learning_rate": 0.00025480041342212695, "loss": 0.9553, "step": 1835 }, { "epoch": 0.6986236355007119, "grad_norm": 0.3849172751179302, "learning_rate": 0.00025191653438171545, "loss": 0.9461, "step": 1840 }, { "epoch": 0.7005220692928334, "grad_norm": 0.4264474066939237, "learning_rate": 0.00024904356235507945, "loss": 0.9467, "step": 1845 }, { "epoch": 0.702420503084955, "grad_norm": 0.3548766629306175, "learning_rate": 0.0002461816236527141, "loss": 0.947, "step": 1850 }, { "epoch": 0.7043189368770764, "grad_norm": 0.3514101652953766, "learning_rate": 0.0002433308441000338, "loss": 0.9268, "step": 1855 }, { "epoch": 0.7062173706691979, "grad_norm": 0.380453955195146, "learning_rate": 0.00024049134903183955, "loss": 0.9293, "step": 1860 }, { "epoch": 0.7081158044613194, "grad_norm": 0.35298900401975053, "learning_rate": 0.00023766326328680958, "loss": 0.9348, "step": 1865 }, { "epoch": 0.7100142382534409, "grad_norm": 0.42441310564082074, "learning_rate": 0.00023484671120200935, "loss": 0.9276, "step": 1870 }, { "epoch": 0.7119126720455624, "grad_norm": 0.319038899787948, "learning_rate": 0.00023204181660742602, "loss": 0.9431, "step": 1875 }, { "epoch": 0.7138111058376839, "grad_norm": 0.4049851574728775, "learning_rate": 0.00022924870282052445, "loss": 0.906, "step": 1880 }, { "epoch": 0.7157095396298054, "grad_norm": 0.3513877016599214, "learning_rate": 0.00022646749264082478, "loss": 0.9095, "step": 1885 }, { "epoch": 0.717607973421927, "grad_norm": 0.4081310968482307, "learning_rate": 0.00022369830834450367, "loss": 0.9003, "step": 1890 }, { "epoch": 0.7195064072140485, "grad_norm": 0.35805106514821416, "learning_rate": 0.00022094127167901934, "loss": 0.9099, "step": 1895 }, { "epoch": 0.7214048410061699, "grad_norm": 0.3631465368774142, "learning_rate": 0.0002181965038577577, "loss": 0.9092, "step": 1900 }, { "epoch": 0.7233032747982914, "grad_norm": 0.3650905218048075, "learning_rate": 0.0002154641255547038, "loss": 0.9252, "step": 1905 }, { "epoch": 0.7252017085904129, "grad_norm": 0.31898731584804263, "learning_rate": 0.00021274425689913617, "loss": 0.9071, "step": 1910 }, { "epoch": 0.7271001423825344, "grad_norm": 0.31603070803205635, "learning_rate": 0.00021003701747034616, "loss": 0.9165, "step": 1915 }, { "epoch": 0.7289985761746559, "grad_norm": 0.30904969427001383, "learning_rate": 0.00020734252629237893, "loss": 0.8971, "step": 1920 }, { "epoch": 0.7308970099667774, "grad_norm": 0.32433517710502985, "learning_rate": 0.00020466090182880248, "loss": 0.8908, "step": 1925 }, { "epoch": 0.7327954437588989, "grad_norm": 0.3168746074632352, "learning_rate": 0.00020199226197749792, "loss": 0.9011, "step": 1930 }, { "epoch": 0.7346938775510204, "grad_norm": 0.31832881987489503, "learning_rate": 0.00019933672406547665, "loss": 0.9138, "step": 1935 }, { "epoch": 0.736592311343142, "grad_norm": 0.301051575919513, "learning_rate": 0.00019669440484372213, "loss": 0.8897, "step": 1940 }, { "epoch": 0.7384907451352634, "grad_norm": 0.33220133814920705, "learning_rate": 0.00019406542048205666, "loss": 0.8868, "step": 1945 }, { "epoch": 0.7403891789273849, "grad_norm": 0.31331199745305405, "learning_rate": 0.0001914498865640344, "loss": 0.8921, "step": 1950 }, { "epoch": 0.7422876127195064, "grad_norm": 0.3150890748673222, "learning_rate": 0.00018884791808185947, "loss": 0.8925, "step": 1955 }, { "epoch": 0.7441860465116279, "grad_norm": 0.31284305447491684, "learning_rate": 0.0001862596294313299, "loss": 0.8779, "step": 1960 }, { "epoch": 0.7460844803037494, "grad_norm": 0.33272829874860427, "learning_rate": 0.00018368513440680884, "loss": 0.8799, "step": 1965 }, { "epoch": 0.7479829140958709, "grad_norm": 0.32686823493529116, "learning_rate": 0.0001811245461962212, "loss": 0.8837, "step": 1970 }, { "epoch": 0.7498813478879924, "grad_norm": 0.32129617206658423, "learning_rate": 0.0001785779773760775, "loss": 0.891, "step": 1975 }, { "epoch": 0.751779781680114, "grad_norm": 0.29507710361264766, "learning_rate": 0.0001760455399065246, "loss": 0.87, "step": 1980 }, { "epoch": 0.7536782154722355, "grad_norm": 0.3392524599510225, "learning_rate": 0.00017352734512642276, "loss": 0.8614, "step": 1985 }, { "epoch": 0.7555766492643569, "grad_norm": 0.34922494756037853, "learning_rate": 0.00017102350374845155, "loss": 0.8739, "step": 1990 }, { "epoch": 0.7574750830564784, "grad_norm": 0.38856477147363533, "learning_rate": 0.00016853412585424128, "loss": 0.8608, "step": 1995 }, { "epoch": 0.7593735168485999, "grad_norm": 0.3667333097382861, "learning_rate": 0.00016605932088953397, "loss": 0.8524, "step": 2000 }, { "epoch": 0.7612719506407214, "grad_norm": 0.4112130915608781, "learning_rate": 0.00016359919765937149, "loss": 0.851, "step": 2005 }, { "epoch": 0.7631703844328429, "grad_norm": 0.35792666839319404, "learning_rate": 0.00016115386432331147, "loss": 0.8444, "step": 2010 }, { "epoch": 0.7650688182249644, "grad_norm": 0.35486084734332507, "learning_rate": 0.00015872342839067305, "loss": 0.8561, "step": 2015 }, { "epoch": 0.7669672520170859, "grad_norm": 0.34325463219420715, "learning_rate": 0.0001563079967158088, "loss": 0.8549, "step": 2020 }, { "epoch": 0.7688656858092074, "grad_norm": 0.33043895406245294, "learning_rate": 0.0001539076754934084, "loss": 0.8576, "step": 2025 }, { "epoch": 0.770764119601329, "grad_norm": 0.30878788182142447, "learning_rate": 0.00015152257025382844, "loss": 0.8559, "step": 2030 }, { "epoch": 0.7726625533934504, "grad_norm": 0.35444741515541456, "learning_rate": 0.00014915278585845348, "loss": 0.8415, "step": 2035 }, { "epoch": 0.7745609871855719, "grad_norm": 0.30847085513086014, "learning_rate": 0.00014679842649508568, "loss": 0.8259, "step": 2040 }, { "epoch": 0.7764594209776934, "grad_norm": 0.31252404771224196, "learning_rate": 0.00014445959567336441, "loss": 0.859, "step": 2045 }, { "epoch": 0.7783578547698149, "grad_norm": 0.3240169921265319, "learning_rate": 0.0001421363962202149, "loss": 0.8301, "step": 2050 }, { "epoch": 0.7802562885619364, "grad_norm": 0.32322492788162155, "learning_rate": 0.00013982893027532757, "loss": 0.822, "step": 2055 }, { "epoch": 0.7821547223540579, "grad_norm": 0.3516161817741868, "learning_rate": 0.00013753729928666825, "loss": 0.8374, "step": 2060 }, { "epoch": 0.7840531561461794, "grad_norm": 0.31124890284462753, "learning_rate": 0.00013526160400601682, "loss": 0.8293, "step": 2065 }, { "epoch": 0.7859515899383009, "grad_norm": 0.307376831052294, "learning_rate": 0.00013300194448453818, "loss": 0.8314, "step": 2070 }, { "epoch": 0.7878500237304225, "grad_norm": 0.33386681520064726, "learning_rate": 0.00013075842006838407, "loss": 0.8485, "step": 2075 }, { "epoch": 0.7897484575225439, "grad_norm": 0.3223752393505699, "learning_rate": 0.0001285311293943241, "loss": 0.8424, "step": 2080 }, { "epoch": 0.7916468913146654, "grad_norm": 0.3575122610804609, "learning_rate": 0.00012632017038541026, "loss": 0.8326, "step": 2085 }, { "epoch": 0.7935453251067869, "grad_norm": 0.32346828366821057, "learning_rate": 0.0001241256402466709, "loss": 0.8422, "step": 2090 }, { "epoch": 0.7954437588989084, "grad_norm": 0.3341501772850559, "learning_rate": 0.00012194763546083803, "loss": 0.8155, "step": 2095 }, { "epoch": 0.7973421926910299, "grad_norm": 0.33689872208599597, "learning_rate": 0.00011978625178410434, "loss": 0.8179, "step": 2100 }, { "epoch": 0.7992406264831514, "grad_norm": 0.3640068937736561, "learning_rate": 0.00011764158424191435, "loss": 0.8107, "step": 2105 }, { "epoch": 0.8011390602752729, "grad_norm": 0.31299920707259116, "learning_rate": 0.00011551372712478575, "loss": 0.8013, "step": 2110 }, { "epoch": 0.8030374940673944, "grad_norm": 0.3184880942321623, "learning_rate": 0.0001134027739841642, "loss": 0.8012, "step": 2115 }, { "epoch": 0.804935927859516, "grad_norm": 0.3162681368592125, "learning_rate": 0.00011130881762831069, "loss": 0.8156, "step": 2120 }, { "epoch": 0.8068343616516374, "grad_norm": 0.3211345805440541, "learning_rate": 0.00010923195011822058, "loss": 0.8051, "step": 2125 }, { "epoch": 0.8087327954437589, "grad_norm": 0.3249567779384397, "learning_rate": 0.00010717226276357667, "loss": 0.8024, "step": 2130 }, { "epoch": 0.8106312292358804, "grad_norm": 0.31173448153562644, "learning_rate": 0.00010512984611873466, "loss": 0.8028, "step": 2135 }, { "epoch": 0.8125296630280019, "grad_norm": 0.34264672110601546, "learning_rate": 0.00010310478997874162, "loss": 0.7893, "step": 2140 }, { "epoch": 0.8144280968201234, "grad_norm": 0.36225713373489543, "learning_rate": 0.0001010971833753882, "loss": 0.7941, "step": 2145 }, { "epoch": 0.8163265306122449, "grad_norm": 0.39829779195110515, "learning_rate": 9.910711457329479e-05, "loss": 0.784, "step": 2150 }, { "epoch": 0.8182249644043664, "grad_norm": 0.4154842554468083, "learning_rate": 9.713467106603024e-05, "loss": 0.7973, "step": 2155 }, { "epoch": 0.8201233981964879, "grad_norm": 0.35334120375262246, "learning_rate": 9.517993957226612e-05, "loss": 0.7887, "step": 2160 }, { "epoch": 0.8220218319886095, "grad_norm": 0.342997739965322, "learning_rate": 9.3243006031963e-05, "loss": 0.7884, "step": 2165 }, { "epoch": 0.8239202657807309, "grad_norm": 0.345714194018433, "learning_rate": 9.132395560259337e-05, "loss": 0.7894, "step": 2170 }, { "epoch": 0.8258186995728524, "grad_norm": 0.34821270209325056, "learning_rate": 8.942287265539639e-05, "loss": 0.796, "step": 2175 }, { "epoch": 0.8277171333649739, "grad_norm": 0.32502745681436357, "learning_rate": 8.753984077166937e-05, "loss": 0.8015, "step": 2180 }, { "epoch": 0.8296155671570954, "grad_norm": 0.388775004922499, "learning_rate": 8.567494273909277e-05, "loss": 0.8025, "step": 2185 }, { "epoch": 0.8315140009492169, "grad_norm": 0.3127036551166177, "learning_rate": 8.382826054809079e-05, "loss": 0.7895, "step": 2190 }, { "epoch": 0.8334124347413384, "grad_norm": 0.29839600378854597, "learning_rate": 8.1999875388226e-05, "loss": 0.7818, "step": 2195 }, { "epoch": 0.8353108685334599, "grad_norm": 0.31004929243565, "learning_rate": 8.018986764463032e-05, "loss": 0.7881, "step": 2200 }, { "epoch": 0.8372093023255814, "grad_norm": 0.32248432401397453, "learning_rate": 7.839831689447102e-05, "loss": 0.7961, "step": 2205 }, { "epoch": 0.8391077361177028, "grad_norm": 0.32300916569046356, "learning_rate": 7.662530190345157e-05, "loss": 0.7859, "step": 2210 }, { "epoch": 0.8410061699098244, "grad_norm": 0.3437848320718501, "learning_rate": 7.487090062234898e-05, "loss": 0.7706, "step": 2215 }, { "epoch": 0.8429046037019459, "grad_norm": 0.3104343754734283, "learning_rate": 7.313519018358695e-05, "loss": 0.7715, "step": 2220 }, { "epoch": 0.8448030374940674, "grad_norm": 0.3171733947505178, "learning_rate": 7.141824689784421e-05, "loss": 0.7487, "step": 2225 }, { "epoch": 0.8467014712861889, "grad_norm": 0.3114906950878784, "learning_rate": 6.972014625069984e-05, "loss": 0.7557, "step": 2230 }, { "epoch": 0.8485999050783104, "grad_norm": 0.3427221492524242, "learning_rate": 6.804096289931443e-05, "loss": 0.7505, "step": 2235 }, { "epoch": 0.8504983388704319, "grad_norm": 0.3257908676508504, "learning_rate": 6.638077066914811e-05, "loss": 0.7691, "step": 2240 }, { "epoch": 0.8523967726625534, "grad_norm": 0.3245771425045967, "learning_rate": 6.473964255071418e-05, "loss": 0.7712, "step": 2245 }, { "epoch": 0.8542952064546749, "grad_norm": 0.30642317659601426, "learning_rate": 6.311765069637037e-05, "loss": 0.7508, "step": 2250 }, { "epoch": 0.8561936402467963, "grad_norm": 0.35717316205581434, "learning_rate": 6.151486641714705e-05, "loss": 0.7585, "step": 2255 }, { "epoch": 0.8580920740389179, "grad_norm": 0.29834051216293683, "learning_rate": 5.993136017961143e-05, "loss": 0.7564, "step": 2260 }, { "epoch": 0.8599905078310394, "grad_norm": 0.31967989226553994, "learning_rate": 5.83672016027697e-05, "loss": 0.7402, "step": 2265 }, { "epoch": 0.8618889416231609, "grad_norm": 0.3316739821901055, "learning_rate": 5.6822459455006246e-05, "loss": 0.7617, "step": 2270 }, { "epoch": 0.8637873754152824, "grad_norm": 0.33111810076533643, "learning_rate": 5.529720165106056e-05, "loss": 0.7702, "step": 2275 }, { "epoch": 0.8656858092074039, "grad_norm": 0.31953471953308643, "learning_rate": 5.3791495249040644e-05, "loss": 0.7585, "step": 2280 }, { "epoch": 0.8675842429995254, "grad_norm": 0.34435784933473257, "learning_rate": 5.2305406447475504e-05, "loss": 0.7539, "step": 2285 }, { "epoch": 0.8694826767916469, "grad_norm": 0.33531663584742627, "learning_rate": 5.083900058240437e-05, "loss": 0.7574, "step": 2290 }, { "epoch": 0.8713811105837684, "grad_norm": 0.32627115970227594, "learning_rate": 4.939234212450405e-05, "loss": 0.7502, "step": 2295 }, { "epoch": 0.8732795443758898, "grad_norm": 0.34710941754901015, "learning_rate": 4.796549467625494e-05, "loss": 0.7517, "step": 2300 }, { "epoch": 0.8751779781680114, "grad_norm": 0.3259737091035039, "learning_rate": 4.6558520969144205e-05, "loss": 0.7531, "step": 2305 }, { "epoch": 0.8770764119601329, "grad_norm": 0.3543527295348622, "learning_rate": 4.517148286090822e-05, "loss": 0.7395, "step": 2310 }, { "epoch": 0.8789748457522544, "grad_norm": 0.3411443809878121, "learning_rate": 4.3804441332812915e-05, "loss": 0.7493, "step": 2315 }, { "epoch": 0.8808732795443759, "grad_norm": 0.323002963765091, "learning_rate": 4.245745648697241e-05, "loss": 0.7562, "step": 2320 }, { "epoch": 0.8827717133364974, "grad_norm": 0.3369758289265976, "learning_rate": 4.1130587543706796e-05, "loss": 0.7466, "step": 2325 }, { "epoch": 0.8846701471286189, "grad_norm": 0.3152326006804494, "learning_rate": 3.982389283893878e-05, "loss": 0.7458, "step": 2330 }, { "epoch": 0.8865685809207404, "grad_norm": 0.3504098491203572, "learning_rate": 3.853742982162839e-05, "loss": 0.7503, "step": 2335 }, { "epoch": 0.8884670147128619, "grad_norm": 0.33687850279262355, "learning_rate": 3.72712550512479e-05, "loss": 0.7651, "step": 2340 }, { "epoch": 0.8903654485049833, "grad_norm": 0.3386005467292844, "learning_rate": 3.602542419529453e-05, "loss": 0.766, "step": 2345 }, { "epoch": 0.8922638822971048, "grad_norm": 0.3110137804067388, "learning_rate": 3.479999202684353e-05, "loss": 0.7397, "step": 2350 }, { "epoch": 0.8941623160892264, "grad_norm": 0.32583342601441784, "learning_rate": 3.359501242213981e-05, "loss": 0.7346, "step": 2355 }, { "epoch": 0.8960607498813479, "grad_norm": 0.3187549346009566, "learning_rate": 3.24105383582291e-05, "loss": 0.7492, "step": 2360 }, { "epoch": 0.8979591836734694, "grad_norm": 0.3218971294090315, "learning_rate": 3.1246621910629323e-05, "loss": 0.7504, "step": 2365 }, { "epoch": 0.8998576174655909, "grad_norm": 0.3509832182230347, "learning_rate": 3.0103314251040683e-05, "loss": 0.7474, "step": 2370 }, { "epoch": 0.9017560512577124, "grad_norm": 0.3045155962925135, "learning_rate": 2.8980665645095993e-05, "loss": 0.7457, "step": 2375 }, { "epoch": 0.9036544850498339, "grad_norm": 0.33768892145147017, "learning_rate": 2.787872545015069e-05, "loss": 0.7369, "step": 2380 }, { "epoch": 0.9055529188419554, "grad_norm": 0.30591307319847055, "learning_rate": 2.679754211311314e-05, "loss": 0.7349, "step": 2385 }, { "epoch": 0.9074513526340768, "grad_norm": 0.31814695669489196, "learning_rate": 2.5737163168314093e-05, "loss": 0.7465, "step": 2390 }, { "epoch": 0.9093497864261983, "grad_norm": 0.33557857522275575, "learning_rate": 2.4697635235417403e-05, "loss": 0.7304, "step": 2395 }, { "epoch": 0.9112482202183199, "grad_norm": 0.31077407798760204, "learning_rate": 2.3679004017370165e-05, "loss": 0.7322, "step": 2400 }, { "epoch": 0.9131466540104414, "grad_norm": 0.3696076542535449, "learning_rate": 2.2681314298393208e-05, "loss": 0.7415, "step": 2405 }, { "epoch": 0.9150450878025629, "grad_norm": 0.34756329848102396, "learning_rate": 2.1704609942012344e-05, "loss": 0.7437, "step": 2410 }, { "epoch": 0.9169435215946844, "grad_norm": 0.3105145840944022, "learning_rate": 2.074893388912996e-05, "loss": 0.7261, "step": 2415 }, { "epoch": 0.9188419553868059, "grad_norm": 0.3385109368325919, "learning_rate": 1.9814328156136986e-05, "loss": 0.7507, "step": 2420 }, { "epoch": 0.9207403891789274, "grad_norm": 0.30326901789443217, "learning_rate": 1.8900833833065622e-05, "loss": 0.7368, "step": 2425 }, { "epoch": 0.9226388229710489, "grad_norm": 0.31849955729705076, "learning_rate": 1.800849108178304e-05, "loss": 0.732, "step": 2430 }, { "epoch": 0.9245372567631703, "grad_norm": 0.3023724936604926, "learning_rate": 1.7137339134225326e-05, "loss": 0.7395, "step": 2435 }, { "epoch": 0.9264356905552918, "grad_norm": 0.3045797697728719, "learning_rate": 1.628741629067282e-05, "loss": 0.7372, "step": 2440 }, { "epoch": 0.9283341243474134, "grad_norm": 0.3072440159247055, "learning_rate": 1.5458759918066333e-05, "loss": 0.7271, "step": 2445 }, { "epoch": 0.9302325581395349, "grad_norm": 0.2973843731076206, "learning_rate": 1.4651406448364046e-05, "loss": 0.7286, "step": 2450 }, { "epoch": 0.9321309919316564, "grad_norm": 0.33315733476007137, "learning_rate": 1.3865391376940151e-05, "loss": 0.7245, "step": 2455 }, { "epoch": 0.9340294257237779, "grad_norm": 0.35918744134846453, "learning_rate": 1.3100749261024003e-05, "loss": 0.7233, "step": 2460 }, { "epoch": 0.9359278595158994, "grad_norm": 0.2959597147751244, "learning_rate": 1.2357513718180724e-05, "loss": 0.7406, "step": 2465 }, { "epoch": 0.9378262933080209, "grad_norm": 0.6025608542347619, "learning_rate": 1.1635717424833602e-05, "loss": 0.7319, "step": 2470 }, { "epoch": 0.9397247271001424, "grad_norm": 0.31728158924540384, "learning_rate": 1.0935392114827026e-05, "loss": 0.7142, "step": 2475 }, { "epoch": 0.9416231608922638, "grad_norm": 0.3202471419277355, "learning_rate": 1.0256568578031533e-05, "loss": 0.7234, "step": 2480 }, { "epoch": 0.9435215946843853, "grad_norm": 0.33730636346394754, "learning_rate": 9.599276658990353e-06, "loss": 0.7266, "step": 2485 }, { "epoch": 0.9454200284765069, "grad_norm": 0.3105080016851193, "learning_rate": 8.963545255606664e-06, "loss": 0.7309, "step": 2490 }, { "epoch": 0.9473184622686284, "grad_norm": 0.31936758390953845, "learning_rate": 8.349402317873788e-06, "loss": 0.7385, "step": 2495 }, { "epoch": 0.9492168960607499, "grad_norm": 0.35011965244445487, "learning_rate": 7.756874846645834e-06, "loss": 0.7298, "step": 2500 }, { "epoch": 0.9511153298528714, "grad_norm": 0.3189446744264921, "learning_rate": 7.185988892450923e-06, "loss": 0.7345, "step": 2505 }, { "epoch": 0.9530137636449929, "grad_norm": 0.3463481807220011, "learning_rate": 6.636769554345778e-06, "loss": 0.7296, "step": 2510 }, { "epoch": 0.9549121974371144, "grad_norm": 0.3144784342698344, "learning_rate": 6.109240978812047e-06, "loss": 0.733, "step": 2515 }, { "epoch": 0.9568106312292359, "grad_norm": 0.32358238607927053, "learning_rate": 5.603426358695207e-06, "loss": 0.7227, "step": 2520 }, { "epoch": 0.9587090650213573, "grad_norm": 0.3070502390360057, "learning_rate": 5.11934793218427e-06, "loss": 0.7379, "step": 2525 }, { "epoch": 0.9606074988134788, "grad_norm": 0.30930683987481156, "learning_rate": 4.657026981834622e-06, "loss": 0.7244, "step": 2530 }, { "epoch": 0.9625059326056004, "grad_norm": 0.3184603698723896, "learning_rate": 4.216483833631879e-06, "loss": 0.7374, "step": 2535 }, { "epoch": 0.9644043663977219, "grad_norm": 0.313081890733716, "learning_rate": 3.7977378560985487e-06, "loss": 0.7253, "step": 2540 }, { "epoch": 0.9663028001898434, "grad_norm": 0.33414880766899385, "learning_rate": 3.4008074594423233e-06, "loss": 0.7246, "step": 2545 }, { "epoch": 0.9682012339819649, "grad_norm": 0.30264216877470684, "learning_rate": 3.0257100947470027e-06, "loss": 0.7274, "step": 2550 }, { "epoch": 0.9700996677740864, "grad_norm": 0.34059297981030995, "learning_rate": 2.672462253204666e-06, "loss": 0.7429, "step": 2555 }, { "epoch": 0.9719981015662079, "grad_norm": 0.3255229667279891, "learning_rate": 2.3410794653911936e-06, "loss": 0.7246, "step": 2560 }, { "epoch": 0.9738965353583294, "grad_norm": 0.32454959915539006, "learning_rate": 2.03157630058326e-06, "loss": 0.7202, "step": 2565 }, { "epoch": 0.9757949691504508, "grad_norm": 0.3205716386887886, "learning_rate": 1.7439663661176219e-06, "loss": 0.7363, "step": 2570 }, { "epoch": 0.9776934029425723, "grad_norm": 0.29763916460228407, "learning_rate": 1.478262306793099e-06, "loss": 0.7319, "step": 2575 }, { "epoch": 0.9795918367346939, "grad_norm": 0.3316371045620813, "learning_rate": 1.234475804314683e-06, "loss": 0.7519, "step": 2580 }, { "epoch": 0.9814902705268154, "grad_norm": 0.3105059150408721, "learning_rate": 1.012617576779673e-06, "loss": 0.7261, "step": 2585 }, { "epoch": 0.9833887043189369, "grad_norm": 0.30466660031856074, "learning_rate": 8.126973782067171e-07, "loss": 0.7321, "step": 2590 }, { "epoch": 0.9852871381110584, "grad_norm": 0.3373827673378842, "learning_rate": 6.347239981068231e-07, "loss": 0.7395, "step": 2595 }, { "epoch": 0.9871855719031799, "grad_norm": 0.31614437085864816, "learning_rate": 4.787052610970566e-07, "loss": 0.7253, "step": 2600 }, { "epoch": 0.9890840056953014, "grad_norm": 0.3300895421062348, "learning_rate": 3.446480265563712e-07, "loss": 0.7329, "step": 2605 }, { "epoch": 0.9909824394874229, "grad_norm": 0.33291802253713565, "learning_rate": 2.3255818832423892e-07, "loss": 0.7245, "step": 2610 }, { "epoch": 0.9928808732795443, "grad_norm": 0.32675535698611996, "learning_rate": 1.4244067444124652e-07, "loss": 0.7218, "step": 2615 }, { "epoch": 0.9947793070716658, "grad_norm": 0.32024826764191183, "learning_rate": 7.429944693276847e-08, "loss": 0.736, "step": 2620 }, { "epoch": 0.9966777408637874, "grad_norm": 0.3176106426665133, "learning_rate": 2.8137501634439844e-08, "loss": 0.7406, "step": 2625 }, { "epoch": 0.9985761746559089, "grad_norm": 0.3213640893094606, "learning_rate": 3.956868060761565e-09, "loss": 0.7448, "step": 2630 }, { "epoch": 0.9997152349311818, "eval_loss": 2.2808492183685303, "eval_runtime": 8.5401, "eval_samples_per_second": 46.018, "eval_steps_per_second": 11.592, "step": 2633 }, { "epoch": 0.9997152349311818, "step": 2633, "total_flos": 32299470028800.0, "train_loss": 1.3562805264158209, "train_runtime": 3817.2623, "train_samples_per_second": 11.039, "train_steps_per_second": 0.69 } ], "logging_steps": 5, "max_steps": 2633, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 32299470028800.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }