{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 14950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006688963210702341, "grad_norm": 5.731072425842285, "learning_rate": 9.993311036789297e-05, "loss": 0.6793, "step": 10 }, { "epoch": 0.013377926421404682, "grad_norm": 9.623015403747559, "learning_rate": 9.986622073578595e-05, "loss": 0.1186, "step": 20 }, { "epoch": 0.020066889632107024, "grad_norm": 13.354948043823242, "learning_rate": 9.979933110367894e-05, "loss": 0.1486, "step": 30 }, { "epoch": 0.026755852842809364, "grad_norm": 5.970050811767578, "learning_rate": 9.973244147157192e-05, "loss": 0.1379, "step": 40 }, { "epoch": 0.033444816053511704, "grad_norm": 6.1483354568481445, "learning_rate": 9.966555183946489e-05, "loss": 0.0901, "step": 50 }, { "epoch": 0.04013377926421405, "grad_norm": 1.7410144805908203, "learning_rate": 9.959866220735787e-05, "loss": 0.1011, "step": 60 }, { "epoch": 0.046822742474916385, "grad_norm": 10.210768699645996, "learning_rate": 9.953177257525084e-05, "loss": 0.1327, "step": 70 }, { "epoch": 0.05351170568561873, "grad_norm": 8.604742050170898, "learning_rate": 9.946488294314382e-05, "loss": 0.1222, "step": 80 }, { "epoch": 0.06020066889632107, "grad_norm": 8.2460355758667, "learning_rate": 9.93979933110368e-05, "loss": 0.12, "step": 90 }, { "epoch": 0.06688963210702341, "grad_norm": 4.913175582885742, "learning_rate": 9.933110367892977e-05, "loss": 0.1165, "step": 100 }, { "epoch": 0.07357859531772576, "grad_norm": 3.4663991928100586, "learning_rate": 9.926421404682275e-05, "loss": 0.1905, "step": 110 }, { "epoch": 0.0802675585284281, "grad_norm": 3.857680082321167, "learning_rate": 9.919732441471572e-05, "loss": 0.0881, "step": 120 }, { "epoch": 0.08695652173913043, "grad_norm": 10.378622055053711, "learning_rate": 9.91304347826087e-05, "loss": 0.0997, "step": 130 }, { "epoch": 0.09364548494983277, "grad_norm": 1.7943992614746094, "learning_rate": 9.906354515050168e-05, "loss": 0.0938, "step": 140 }, { "epoch": 0.10033444816053512, "grad_norm": 3.0998451709747314, "learning_rate": 9.899665551839465e-05, "loss": 0.0681, "step": 150 }, { "epoch": 0.10702341137123746, "grad_norm": 5.554123878479004, "learning_rate": 9.892976588628763e-05, "loss": 0.0726, "step": 160 }, { "epoch": 0.11371237458193979, "grad_norm": 1.8740187883377075, "learning_rate": 9.88628762541806e-05, "loss": 0.0887, "step": 170 }, { "epoch": 0.12040133779264214, "grad_norm": 9.424790382385254, "learning_rate": 9.879598662207358e-05, "loss": 0.1184, "step": 180 }, { "epoch": 0.12709030100334448, "grad_norm": 0.5133495330810547, "learning_rate": 9.872909698996656e-05, "loss": 0.0907, "step": 190 }, { "epoch": 0.13377926421404682, "grad_norm": 12.215278625488281, "learning_rate": 9.866220735785953e-05, "loss": 0.138, "step": 200 }, { "epoch": 0.14046822742474915, "grad_norm": 14.582442283630371, "learning_rate": 9.859531772575251e-05, "loss": 0.1946, "step": 210 }, { "epoch": 0.14715719063545152, "grad_norm": 2.9713079929351807, "learning_rate": 9.852842809364548e-05, "loss": 0.1018, "step": 220 }, { "epoch": 0.15384615384615385, "grad_norm": 2.692634344100952, "learning_rate": 9.846153846153848e-05, "loss": 0.1059, "step": 230 }, { "epoch": 0.1605351170568562, "grad_norm": 3.160918712615967, "learning_rate": 9.839464882943145e-05, "loss": 0.1153, "step": 240 }, { "epoch": 0.16722408026755853, "grad_norm": 4.2426557540893555, "learning_rate": 9.832775919732441e-05, "loss": 0.0746, "step": 250 }, { "epoch": 0.17391304347826086, "grad_norm": 0.8495791554450989, "learning_rate": 9.82608695652174e-05, "loss": 0.087, "step": 260 }, { "epoch": 0.1806020066889632, "grad_norm": 4.695148468017578, "learning_rate": 9.819397993311036e-05, "loss": 0.0933, "step": 270 }, { "epoch": 0.18729096989966554, "grad_norm": 1.0231961011886597, "learning_rate": 9.812709030100335e-05, "loss": 0.0762, "step": 280 }, { "epoch": 0.1939799331103679, "grad_norm": 0.4477454423904419, "learning_rate": 9.806020066889633e-05, "loss": 0.0817, "step": 290 }, { "epoch": 0.20066889632107024, "grad_norm": 3.472343683242798, "learning_rate": 9.799331103678931e-05, "loss": 0.0788, "step": 300 }, { "epoch": 0.20735785953177258, "grad_norm": 2.307664632797241, "learning_rate": 9.792642140468228e-05, "loss": 0.0777, "step": 310 }, { "epoch": 0.2140468227424749, "grad_norm": 2.007056951522827, "learning_rate": 9.785953177257525e-05, "loss": 0.0708, "step": 320 }, { "epoch": 0.22073578595317725, "grad_norm": 8.99911880493164, "learning_rate": 9.779264214046823e-05, "loss": 0.07, "step": 330 }, { "epoch": 0.22742474916387959, "grad_norm": 0.8066081404685974, "learning_rate": 9.772575250836121e-05, "loss": 0.0807, "step": 340 }, { "epoch": 0.23411371237458195, "grad_norm": 9.597452163696289, "learning_rate": 9.765886287625419e-05, "loss": 0.0992, "step": 350 }, { "epoch": 0.2408026755852843, "grad_norm": 0.36006277799606323, "learning_rate": 9.759197324414716e-05, "loss": 0.0892, "step": 360 }, { "epoch": 0.24749163879598662, "grad_norm": 6.198542594909668, "learning_rate": 9.752508361204014e-05, "loss": 0.0894, "step": 370 }, { "epoch": 0.25418060200668896, "grad_norm": 1.0550386905670166, "learning_rate": 9.745819397993311e-05, "loss": 0.0836, "step": 380 }, { "epoch": 0.2608695652173913, "grad_norm": 3.5416440963745117, "learning_rate": 9.739130434782609e-05, "loss": 0.0833, "step": 390 }, { "epoch": 0.26755852842809363, "grad_norm": 0.4253082573413849, "learning_rate": 9.732441471571907e-05, "loss": 0.0899, "step": 400 }, { "epoch": 0.27424749163879597, "grad_norm": 0.3868394196033478, "learning_rate": 9.725752508361204e-05, "loss": 0.0776, "step": 410 }, { "epoch": 0.2809364548494983, "grad_norm": 3.237781286239624, "learning_rate": 9.719063545150502e-05, "loss": 0.0927, "step": 420 }, { "epoch": 0.28762541806020064, "grad_norm": 2.2645530700683594, "learning_rate": 9.712374581939799e-05, "loss": 0.1582, "step": 430 }, { "epoch": 0.29431438127090304, "grad_norm": 4.430079460144043, "learning_rate": 9.705685618729097e-05, "loss": 0.0908, "step": 440 }, { "epoch": 0.3010033444816054, "grad_norm": 1.9928741455078125, "learning_rate": 9.698996655518396e-05, "loss": 0.0951, "step": 450 }, { "epoch": 0.3076923076923077, "grad_norm": 1.939744472503662, "learning_rate": 9.692307692307692e-05, "loss": 0.0727, "step": 460 }, { "epoch": 0.31438127090301005, "grad_norm": 8.799491882324219, "learning_rate": 9.68561872909699e-05, "loss": 0.1026, "step": 470 }, { "epoch": 0.3210702341137124, "grad_norm": 2.6402878761291504, "learning_rate": 9.678929765886287e-05, "loss": 0.1168, "step": 480 }, { "epoch": 0.3277591973244147, "grad_norm": 0.3376615345478058, "learning_rate": 9.672240802675586e-05, "loss": 0.0732, "step": 490 }, { "epoch": 0.33444816053511706, "grad_norm": 7.645970821380615, "learning_rate": 9.665551839464884e-05, "loss": 0.0762, "step": 500 }, { "epoch": 0.3411371237458194, "grad_norm": 0.24924737215042114, "learning_rate": 9.65886287625418e-05, "loss": 0.0831, "step": 510 }, { "epoch": 0.34782608695652173, "grad_norm": 3.0840044021606445, "learning_rate": 9.652173913043479e-05, "loss": 0.0865, "step": 520 }, { "epoch": 0.35451505016722407, "grad_norm": 0.9078582525253296, "learning_rate": 9.645484949832776e-05, "loss": 0.0911, "step": 530 }, { "epoch": 0.3612040133779264, "grad_norm": 1.0846281051635742, "learning_rate": 9.638795986622075e-05, "loss": 0.1001, "step": 540 }, { "epoch": 0.36789297658862874, "grad_norm": 1.852506399154663, "learning_rate": 9.632107023411372e-05, "loss": 0.0888, "step": 550 }, { "epoch": 0.3745819397993311, "grad_norm": 2.859666347503662, "learning_rate": 9.625418060200669e-05, "loss": 0.0911, "step": 560 }, { "epoch": 0.38127090301003347, "grad_norm": 1.9279602766036987, "learning_rate": 9.618729096989967e-05, "loss": 0.0864, "step": 570 }, { "epoch": 0.3879598662207358, "grad_norm": 8.074692726135254, "learning_rate": 9.612040133779264e-05, "loss": 0.0692, "step": 580 }, { "epoch": 0.39464882943143814, "grad_norm": 0.68867427110672, "learning_rate": 9.605351170568563e-05, "loss": 0.0787, "step": 590 }, { "epoch": 0.4013377926421405, "grad_norm": 5.155614376068115, "learning_rate": 9.59866220735786e-05, "loss": 0.0814, "step": 600 }, { "epoch": 0.4080267558528428, "grad_norm": 4.687347412109375, "learning_rate": 9.591973244147158e-05, "loss": 0.0845, "step": 610 }, { "epoch": 0.41471571906354515, "grad_norm": 6.637145042419434, "learning_rate": 9.585284280936455e-05, "loss": 0.0934, "step": 620 }, { "epoch": 0.4214046822742475, "grad_norm": 1.045323133468628, "learning_rate": 9.578595317725752e-05, "loss": 0.0718, "step": 630 }, { "epoch": 0.4280936454849498, "grad_norm": 7.606454849243164, "learning_rate": 9.571906354515052e-05, "loss": 0.0983, "step": 640 }, { "epoch": 0.43478260869565216, "grad_norm": 4.973568916320801, "learning_rate": 9.565217391304348e-05, "loss": 0.0979, "step": 650 }, { "epoch": 0.4414715719063545, "grad_norm": 4.984686851501465, "learning_rate": 9.558528428093647e-05, "loss": 0.0808, "step": 660 }, { "epoch": 0.44816053511705684, "grad_norm": 5.981488227844238, "learning_rate": 9.551839464882943e-05, "loss": 0.089, "step": 670 }, { "epoch": 0.45484949832775917, "grad_norm": 1.167374610900879, "learning_rate": 9.545150501672242e-05, "loss": 0.0829, "step": 680 }, { "epoch": 0.46153846153846156, "grad_norm": 0.4526713192462921, "learning_rate": 9.53846153846154e-05, "loss": 0.0691, "step": 690 }, { "epoch": 0.4682274247491639, "grad_norm": 8.227622032165527, "learning_rate": 9.531772575250837e-05, "loss": 0.0949, "step": 700 }, { "epoch": 0.47491638795986624, "grad_norm": 0.20492926239967346, "learning_rate": 9.525083612040135e-05, "loss": 0.0778, "step": 710 }, { "epoch": 0.4816053511705686, "grad_norm": 3.324284315109253, "learning_rate": 9.518394648829432e-05, "loss": 0.0944, "step": 720 }, { "epoch": 0.4882943143812709, "grad_norm": 5.781021595001221, "learning_rate": 9.51170568561873e-05, "loss": 0.0906, "step": 730 }, { "epoch": 0.49498327759197325, "grad_norm": 0.3161633014678955, "learning_rate": 9.505016722408028e-05, "loss": 0.0878, "step": 740 }, { "epoch": 0.5016722408026756, "grad_norm": 0.9887397289276123, "learning_rate": 9.498327759197325e-05, "loss": 0.0766, "step": 750 }, { "epoch": 0.5083612040133779, "grad_norm": 0.4013742506504059, "learning_rate": 9.491638795986623e-05, "loss": 0.0828, "step": 760 }, { "epoch": 0.5150501672240803, "grad_norm": 4.095464706420898, "learning_rate": 9.48494983277592e-05, "loss": 0.0941, "step": 770 }, { "epoch": 0.5217391304347826, "grad_norm": 10.108964920043945, "learning_rate": 9.478260869565218e-05, "loss": 0.0914, "step": 780 }, { "epoch": 0.5284280936454849, "grad_norm": 7.85491943359375, "learning_rate": 9.471571906354515e-05, "loss": 0.0985, "step": 790 }, { "epoch": 0.5351170568561873, "grad_norm": 0.8438981771469116, "learning_rate": 9.464882943143813e-05, "loss": 0.0743, "step": 800 }, { "epoch": 0.5418060200668896, "grad_norm": 1.7622973918914795, "learning_rate": 9.458193979933111e-05, "loss": 0.095, "step": 810 }, { "epoch": 0.5484949832775919, "grad_norm": 3.505446672439575, "learning_rate": 9.451505016722408e-05, "loss": 0.0811, "step": 820 }, { "epoch": 0.5551839464882943, "grad_norm": 0.6634843349456787, "learning_rate": 9.444816053511706e-05, "loss": 0.0917, "step": 830 }, { "epoch": 0.5618729096989966, "grad_norm": 4.3299880027771, "learning_rate": 9.438127090301003e-05, "loss": 0.077, "step": 840 }, { "epoch": 0.568561872909699, "grad_norm": 3.079010009765625, "learning_rate": 9.431438127090302e-05, "loss": 0.0775, "step": 850 }, { "epoch": 0.5752508361204013, "grad_norm": 3.117349863052368, "learning_rate": 9.424749163879599e-05, "loss": 0.0581, "step": 860 }, { "epoch": 0.5819397993311036, "grad_norm": 7.100534915924072, "learning_rate": 9.418060200668896e-05, "loss": 0.0963, "step": 870 }, { "epoch": 0.5886287625418061, "grad_norm": 2.5692174434661865, "learning_rate": 9.411371237458194e-05, "loss": 0.0813, "step": 880 }, { "epoch": 0.5953177257525084, "grad_norm": 5.7321271896362305, "learning_rate": 9.404682274247491e-05, "loss": 0.0824, "step": 890 }, { "epoch": 0.6020066889632107, "grad_norm": 0.9095240831375122, "learning_rate": 9.39799331103679e-05, "loss": 0.0656, "step": 900 }, { "epoch": 0.6086956521739131, "grad_norm": 2.5167980194091797, "learning_rate": 9.391304347826087e-05, "loss": 0.0745, "step": 910 }, { "epoch": 0.6153846153846154, "grad_norm": 0.2390034794807434, "learning_rate": 9.384615384615386e-05, "loss": 0.0896, "step": 920 }, { "epoch": 0.6220735785953178, "grad_norm": 2.4616341590881348, "learning_rate": 9.377926421404682e-05, "loss": 0.0833, "step": 930 }, { "epoch": 0.6287625418060201, "grad_norm": 10.120244026184082, "learning_rate": 9.371237458193979e-05, "loss": 0.0887, "step": 940 }, { "epoch": 0.6354515050167224, "grad_norm": 2.8883373737335205, "learning_rate": 9.364548494983279e-05, "loss": 0.1058, "step": 950 }, { "epoch": 0.6421404682274248, "grad_norm": 7.05233097076416, "learning_rate": 9.357859531772576e-05, "loss": 0.087, "step": 960 }, { "epoch": 0.6488294314381271, "grad_norm": 8.142528533935547, "learning_rate": 9.351170568561874e-05, "loss": 0.1047, "step": 970 }, { "epoch": 0.6555183946488294, "grad_norm": 6.927243232727051, "learning_rate": 9.344481605351171e-05, "loss": 0.088, "step": 980 }, { "epoch": 0.6622073578595318, "grad_norm": 6.964981555938721, "learning_rate": 9.337792642140469e-05, "loss": 0.0822, "step": 990 }, { "epoch": 0.6688963210702341, "grad_norm": 0.3616563081741333, "learning_rate": 9.331103678929767e-05, "loss": 0.0849, "step": 1000 }, { "epoch": 0.6755852842809364, "grad_norm": 0.9265075922012329, "learning_rate": 9.324414715719064e-05, "loss": 0.0885, "step": 1010 }, { "epoch": 0.6822742474916388, "grad_norm": 2.175069570541382, "learning_rate": 9.317725752508362e-05, "loss": 0.0609, "step": 1020 }, { "epoch": 0.6889632107023411, "grad_norm": 0.4830019474029541, "learning_rate": 9.311036789297659e-05, "loss": 0.069, "step": 1030 }, { "epoch": 0.6956521739130435, "grad_norm": 10.464764595031738, "learning_rate": 9.304347826086957e-05, "loss": 0.0956, "step": 1040 }, { "epoch": 0.7023411371237458, "grad_norm": 4.139832973480225, "learning_rate": 9.297658862876255e-05, "loss": 0.0925, "step": 1050 }, { "epoch": 0.7090301003344481, "grad_norm": 2.826423406600952, "learning_rate": 9.290969899665552e-05, "loss": 0.0898, "step": 1060 }, { "epoch": 0.7157190635451505, "grad_norm": 5.344399452209473, "learning_rate": 9.28428093645485e-05, "loss": 0.0696, "step": 1070 }, { "epoch": 0.7224080267558528, "grad_norm": 3.084770441055298, "learning_rate": 9.277591973244147e-05, "loss": 0.0643, "step": 1080 }, { "epoch": 0.7290969899665551, "grad_norm": 0.4706350862979889, "learning_rate": 9.270903010033445e-05, "loss": 0.0819, "step": 1090 }, { "epoch": 0.7357859531772575, "grad_norm": 3.9289710521698, "learning_rate": 9.264214046822743e-05, "loss": 0.1029, "step": 1100 }, { "epoch": 0.7424749163879598, "grad_norm": 0.404439240694046, "learning_rate": 9.25752508361204e-05, "loss": 0.0877, "step": 1110 }, { "epoch": 0.7491638795986622, "grad_norm": 7.014498710632324, "learning_rate": 9.250836120401338e-05, "loss": 0.0883, "step": 1120 }, { "epoch": 0.7558528428093646, "grad_norm": 5.416506290435791, "learning_rate": 9.244147157190635e-05, "loss": 0.1351, "step": 1130 }, { "epoch": 0.7625418060200669, "grad_norm": 5.6372904777526855, "learning_rate": 9.237458193979933e-05, "loss": 0.0914, "step": 1140 }, { "epoch": 0.7692307692307693, "grad_norm": 2.7677652835845947, "learning_rate": 9.230769230769232e-05, "loss": 0.0809, "step": 1150 }, { "epoch": 0.7759197324414716, "grad_norm": 5.408329010009766, "learning_rate": 9.22408026755853e-05, "loss": 0.0739, "step": 1160 }, { "epoch": 0.782608695652174, "grad_norm": 0.7951474189758301, "learning_rate": 9.217391304347827e-05, "loss": 0.0703, "step": 1170 }, { "epoch": 0.7892976588628763, "grad_norm": 8.31872844696045, "learning_rate": 9.210702341137123e-05, "loss": 0.0777, "step": 1180 }, { "epoch": 0.7959866220735786, "grad_norm": 1.7713472843170166, "learning_rate": 9.204013377926422e-05, "loss": 0.0626, "step": 1190 }, { "epoch": 0.802675585284281, "grad_norm": 5.379355430603027, "learning_rate": 9.19732441471572e-05, "loss": 0.0763, "step": 1200 }, { "epoch": 0.8093645484949833, "grad_norm": 6.51313591003418, "learning_rate": 9.190635451505018e-05, "loss": 0.0869, "step": 1210 }, { "epoch": 0.8160535117056856, "grad_norm": 3.0633490085601807, "learning_rate": 9.183946488294315e-05, "loss": 0.0807, "step": 1220 }, { "epoch": 0.822742474916388, "grad_norm": 1.5585048198699951, "learning_rate": 9.177257525083613e-05, "loss": 0.0714, "step": 1230 }, { "epoch": 0.8294314381270903, "grad_norm": 1.0530630350112915, "learning_rate": 9.17056856187291e-05, "loss": 0.0777, "step": 1240 }, { "epoch": 0.8361204013377926, "grad_norm": 4.880084991455078, "learning_rate": 9.163879598662207e-05, "loss": 0.062, "step": 1250 }, { "epoch": 0.842809364548495, "grad_norm": 4.956668853759766, "learning_rate": 9.157190635451506e-05, "loss": 0.0936, "step": 1260 }, { "epoch": 0.8494983277591973, "grad_norm": 4.809473037719727, "learning_rate": 9.150501672240803e-05, "loss": 0.0944, "step": 1270 }, { "epoch": 0.8561872909698997, "grad_norm": 4.575435638427734, "learning_rate": 9.143812709030101e-05, "loss": 0.0795, "step": 1280 }, { "epoch": 0.862876254180602, "grad_norm": 5.542465686798096, "learning_rate": 9.137123745819398e-05, "loss": 0.0902, "step": 1290 }, { "epoch": 0.8695652173913043, "grad_norm": 2.882326364517212, "learning_rate": 9.130434782608696e-05, "loss": 0.077, "step": 1300 }, { "epoch": 0.8762541806020067, "grad_norm": 5.928070545196533, "learning_rate": 9.123745819397994e-05, "loss": 0.0959, "step": 1310 }, { "epoch": 0.882943143812709, "grad_norm": 5.219929218292236, "learning_rate": 9.117056856187291e-05, "loss": 0.1166, "step": 1320 }, { "epoch": 0.8896321070234113, "grad_norm": 2.3765580654144287, "learning_rate": 9.11036789297659e-05, "loss": 0.0778, "step": 1330 }, { "epoch": 0.8963210702341137, "grad_norm": 5.088015079498291, "learning_rate": 9.103678929765886e-05, "loss": 0.0715, "step": 1340 }, { "epoch": 0.903010033444816, "grad_norm": 2.053117513656616, "learning_rate": 9.096989966555184e-05, "loss": 0.0676, "step": 1350 }, { "epoch": 0.9096989966555183, "grad_norm": 4.23502254486084, "learning_rate": 9.090301003344483e-05, "loss": 0.0972, "step": 1360 }, { "epoch": 0.9163879598662207, "grad_norm": 1.1127852201461792, "learning_rate": 9.08361204013378e-05, "loss": 0.0889, "step": 1370 }, { "epoch": 0.9230769230769231, "grad_norm": 2.8048794269561768, "learning_rate": 9.076923076923078e-05, "loss": 0.0753, "step": 1380 }, { "epoch": 0.9297658862876255, "grad_norm": 2.873419761657715, "learning_rate": 9.070234113712374e-05, "loss": 0.0755, "step": 1390 }, { "epoch": 0.9364548494983278, "grad_norm": 1.651400089263916, "learning_rate": 9.063545150501673e-05, "loss": 0.0861, "step": 1400 }, { "epoch": 0.9431438127090301, "grad_norm": 7.603207111358643, "learning_rate": 9.056856187290971e-05, "loss": 0.0839, "step": 1410 }, { "epoch": 0.9498327759197325, "grad_norm": 3.371145009994507, "learning_rate": 9.050167224080268e-05, "loss": 0.0841, "step": 1420 }, { "epoch": 0.9565217391304348, "grad_norm": 2.119283437728882, "learning_rate": 9.043478260869566e-05, "loss": 0.0654, "step": 1430 }, { "epoch": 0.9632107023411371, "grad_norm": 3.469703435897827, "learning_rate": 9.036789297658863e-05, "loss": 0.0876, "step": 1440 }, { "epoch": 0.9698996655518395, "grad_norm": 4.874169826507568, "learning_rate": 9.030100334448161e-05, "loss": 0.0776, "step": 1450 }, { "epoch": 0.9765886287625418, "grad_norm": 7.210989952087402, "learning_rate": 9.023411371237459e-05, "loss": 0.0905, "step": 1460 }, { "epoch": 0.9832775919732442, "grad_norm": 2.3983662128448486, "learning_rate": 9.016722408026757e-05, "loss": 0.0847, "step": 1470 }, { "epoch": 0.9899665551839465, "grad_norm": 5.223353862762451, "learning_rate": 9.010033444816054e-05, "loss": 0.0877, "step": 1480 }, { "epoch": 0.9966555183946488, "grad_norm": 0.9329972863197327, "learning_rate": 9.003344481605351e-05, "loss": 0.0816, "step": 1490 }, { "epoch": 1.0, "eval_loss": 0.0947796180844307, "eval_mse": 0.0947796180844307, "eval_runtime": 227.7814, "eval_samples_per_second": 13.127, "eval_steps_per_second": 1.642, "step": 1495 }, { "epoch": 1.0033444816053512, "grad_norm": 9.278254508972168, "learning_rate": 8.996655518394649e-05, "loss": 0.0897, "step": 1500 }, { "epoch": 1.0100334448160535, "grad_norm": 3.634450912475586, "learning_rate": 8.989966555183947e-05, "loss": 0.0784, "step": 1510 }, { "epoch": 1.0167224080267558, "grad_norm": 1.6217931509017944, "learning_rate": 8.983277591973245e-05, "loss": 0.086, "step": 1520 }, { "epoch": 1.0234113712374582, "grad_norm": 3.606285333633423, "learning_rate": 8.976588628762542e-05, "loss": 0.0768, "step": 1530 }, { "epoch": 1.0301003344481605, "grad_norm": 1.5076968669891357, "learning_rate": 8.96989966555184e-05, "loss": 0.0859, "step": 1540 }, { "epoch": 1.0367892976588629, "grad_norm": 6.945788383483887, "learning_rate": 8.963210702341137e-05, "loss": 0.0792, "step": 1550 }, { "epoch": 1.0434782608695652, "grad_norm": 0.16002212464809418, "learning_rate": 8.956521739130435e-05, "loss": 0.0613, "step": 1560 }, { "epoch": 1.0501672240802675, "grad_norm": 0.5019150972366333, "learning_rate": 8.949832775919734e-05, "loss": 0.0794, "step": 1570 }, { "epoch": 1.0568561872909699, "grad_norm": 3.1986889839172363, "learning_rate": 8.94314381270903e-05, "loss": 0.0671, "step": 1580 }, { "epoch": 1.0635451505016722, "grad_norm": 0.7359128594398499, "learning_rate": 8.936454849498329e-05, "loss": 0.0867, "step": 1590 }, { "epoch": 1.0702341137123745, "grad_norm": 3.642479419708252, "learning_rate": 8.929765886287625e-05, "loss": 0.0934, "step": 1600 }, { "epoch": 1.0769230769230769, "grad_norm": 2.9425642490386963, "learning_rate": 8.923076923076924e-05, "loss": 0.1037, "step": 1610 }, { "epoch": 1.0836120401337792, "grad_norm": 2.242279052734375, "learning_rate": 8.916387959866222e-05, "loss": 0.0793, "step": 1620 }, { "epoch": 1.0903010033444815, "grad_norm": 3.2683675289154053, "learning_rate": 8.909698996655519e-05, "loss": 0.0895, "step": 1630 }, { "epoch": 1.0969899665551839, "grad_norm": 0.4252338707447052, "learning_rate": 8.903010033444817e-05, "loss": 0.0727, "step": 1640 }, { "epoch": 1.1036789297658862, "grad_norm": 5.4478373527526855, "learning_rate": 8.896321070234114e-05, "loss": 0.0795, "step": 1650 }, { "epoch": 1.1103678929765886, "grad_norm": 3.975977659225464, "learning_rate": 8.889632107023412e-05, "loss": 0.0705, "step": 1660 }, { "epoch": 1.117056856187291, "grad_norm": 4.362052917480469, "learning_rate": 8.88294314381271e-05, "loss": 0.0787, "step": 1670 }, { "epoch": 1.1237458193979932, "grad_norm": 3.969586133956909, "learning_rate": 8.876254180602007e-05, "loss": 0.0942, "step": 1680 }, { "epoch": 1.1304347826086956, "grad_norm": 1.0709367990493774, "learning_rate": 8.869565217391305e-05, "loss": 0.0834, "step": 1690 }, { "epoch": 1.137123745819398, "grad_norm": 1.190765619277954, "learning_rate": 8.862876254180602e-05, "loss": 0.0827, "step": 1700 }, { "epoch": 1.1438127090301002, "grad_norm": 7.36898136138916, "learning_rate": 8.856187290969901e-05, "loss": 0.1021, "step": 1710 }, { "epoch": 1.1505016722408028, "grad_norm": 3.5522820949554443, "learning_rate": 8.849498327759198e-05, "loss": 0.0856, "step": 1720 }, { "epoch": 1.1571906354515051, "grad_norm": 5.038424015045166, "learning_rate": 8.842809364548495e-05, "loss": 0.0845, "step": 1730 }, { "epoch": 1.1638795986622075, "grad_norm": 3.5751819610595703, "learning_rate": 8.836120401337793e-05, "loss": 0.077, "step": 1740 }, { "epoch": 1.1705685618729098, "grad_norm": 5.335000991821289, "learning_rate": 8.82943143812709e-05, "loss": 0.0886, "step": 1750 }, { "epoch": 1.1772575250836121, "grad_norm": 3.961160898208618, "learning_rate": 8.822742474916388e-05, "loss": 0.0715, "step": 1760 }, { "epoch": 1.1839464882943145, "grad_norm": 2.157200574874878, "learning_rate": 8.816053511705686e-05, "loss": 0.0882, "step": 1770 }, { "epoch": 1.1906354515050168, "grad_norm": 6.394442081451416, "learning_rate": 8.809364548494984e-05, "loss": 0.0808, "step": 1780 }, { "epoch": 1.1973244147157192, "grad_norm": 1.3427674770355225, "learning_rate": 8.802675585284281e-05, "loss": 0.0606, "step": 1790 }, { "epoch": 1.2040133779264215, "grad_norm": 0.8644570708274841, "learning_rate": 8.795986622073578e-05, "loss": 0.0782, "step": 1800 }, { "epoch": 1.2107023411371238, "grad_norm": 1.9095417261123657, "learning_rate": 8.789297658862876e-05, "loss": 0.0843, "step": 1810 }, { "epoch": 1.2173913043478262, "grad_norm": 5.084800720214844, "learning_rate": 8.782608695652174e-05, "loss": 0.1011, "step": 1820 }, { "epoch": 1.2240802675585285, "grad_norm": 1.1336281299591064, "learning_rate": 8.775919732441473e-05, "loss": 0.1059, "step": 1830 }, { "epoch": 1.2307692307692308, "grad_norm": 2.755981683731079, "learning_rate": 8.76923076923077e-05, "loss": 0.0657, "step": 1840 }, { "epoch": 1.2374581939799332, "grad_norm": 1.1654160022735596, "learning_rate": 8.762541806020068e-05, "loss": 0.079, "step": 1850 }, { "epoch": 1.2441471571906355, "grad_norm": 2.7007975578308105, "learning_rate": 8.755852842809365e-05, "loss": 0.0932, "step": 1860 }, { "epoch": 1.2508361204013378, "grad_norm": 4.843238830566406, "learning_rate": 8.749163879598663e-05, "loss": 0.071, "step": 1870 }, { "epoch": 1.2575250836120402, "grad_norm": 5.805212020874023, "learning_rate": 8.742474916387961e-05, "loss": 0.0814, "step": 1880 }, { "epoch": 1.2642140468227425, "grad_norm": 3.9875166416168213, "learning_rate": 8.735785953177258e-05, "loss": 0.0848, "step": 1890 }, { "epoch": 1.2709030100334449, "grad_norm": 0.26979923248291016, "learning_rate": 8.729096989966556e-05, "loss": 0.0797, "step": 1900 }, { "epoch": 1.2775919732441472, "grad_norm": 4.210855007171631, "learning_rate": 8.722408026755853e-05, "loss": 0.074, "step": 1910 }, { "epoch": 1.2842809364548495, "grad_norm": 6.115736484527588, "learning_rate": 8.715719063545151e-05, "loss": 0.0752, "step": 1920 }, { "epoch": 1.2909698996655519, "grad_norm": 0.48328760266304016, "learning_rate": 8.709030100334449e-05, "loss": 0.0744, "step": 1930 }, { "epoch": 1.2976588628762542, "grad_norm": 0.3940099775791168, "learning_rate": 8.702341137123746e-05, "loss": 0.0846, "step": 1940 }, { "epoch": 1.3043478260869565, "grad_norm": 6.541823863983154, "learning_rate": 8.695652173913044e-05, "loss": 0.0767, "step": 1950 }, { "epoch": 1.3110367892976589, "grad_norm": 7.311049461364746, "learning_rate": 8.688963210702341e-05, "loss": 0.0826, "step": 1960 }, { "epoch": 1.3177257525083612, "grad_norm": 1.2747488021850586, "learning_rate": 8.682274247491639e-05, "loss": 0.0677, "step": 1970 }, { "epoch": 1.3244147157190636, "grad_norm": 9.177070617675781, "learning_rate": 8.675585284280937e-05, "loss": 0.0675, "step": 1980 }, { "epoch": 1.3311036789297659, "grad_norm": 0.9875199198722839, "learning_rate": 8.668896321070234e-05, "loss": 0.072, "step": 1990 }, { "epoch": 1.3377926421404682, "grad_norm": 5.233514308929443, "learning_rate": 8.662207357859532e-05, "loss": 0.0734, "step": 2000 }, { "epoch": 1.3444816053511706, "grad_norm": 3.531949996948242, "learning_rate": 8.655518394648829e-05, "loss": 0.0979, "step": 2010 }, { "epoch": 1.351170568561873, "grad_norm": 1.9295690059661865, "learning_rate": 8.648829431438129e-05, "loss": 0.0838, "step": 2020 }, { "epoch": 1.3578595317725752, "grad_norm": 5.854907989501953, "learning_rate": 8.642140468227425e-05, "loss": 0.1134, "step": 2030 }, { "epoch": 1.3645484949832776, "grad_norm": 2.4425508975982666, "learning_rate": 8.635451505016722e-05, "loss": 0.0727, "step": 2040 }, { "epoch": 1.37123745819398, "grad_norm": 5.542223930358887, "learning_rate": 8.62876254180602e-05, "loss": 0.0816, "step": 2050 }, { "epoch": 1.3779264214046822, "grad_norm": 1.6907216310501099, "learning_rate": 8.622073578595317e-05, "loss": 0.0907, "step": 2060 }, { "epoch": 1.3846153846153846, "grad_norm": 4.605374813079834, "learning_rate": 8.615384615384617e-05, "loss": 0.081, "step": 2070 }, { "epoch": 1.391304347826087, "grad_norm": 4.201385498046875, "learning_rate": 8.608695652173914e-05, "loss": 0.1161, "step": 2080 }, { "epoch": 1.3979933110367893, "grad_norm": 2.6294329166412354, "learning_rate": 8.602006688963212e-05, "loss": 0.0685, "step": 2090 }, { "epoch": 1.4046822742474916, "grad_norm": 0.2493625432252884, "learning_rate": 8.595317725752509e-05, "loss": 0.0683, "step": 2100 }, { "epoch": 1.411371237458194, "grad_norm": 3.8422601222991943, "learning_rate": 8.588628762541805e-05, "loss": 0.094, "step": 2110 }, { "epoch": 1.4180602006688963, "grad_norm": 1.6194677352905273, "learning_rate": 8.581939799331105e-05, "loss": 0.078, "step": 2120 }, { "epoch": 1.4247491638795986, "grad_norm": 3.9038643836975098, "learning_rate": 8.575250836120402e-05, "loss": 0.0675, "step": 2130 }, { "epoch": 1.431438127090301, "grad_norm": 6.922024250030518, "learning_rate": 8.5685618729097e-05, "loss": 0.082, "step": 2140 }, { "epoch": 1.4381270903010033, "grad_norm": 1.8225903511047363, "learning_rate": 8.561872909698997e-05, "loss": 0.0935, "step": 2150 }, { "epoch": 1.4448160535117056, "grad_norm": 9.204913139343262, "learning_rate": 8.555183946488295e-05, "loss": 0.0878, "step": 2160 }, { "epoch": 1.451505016722408, "grad_norm": 3.7089452743530273, "learning_rate": 8.548494983277593e-05, "loss": 0.0776, "step": 2170 }, { "epoch": 1.4581939799331103, "grad_norm": 4.970576763153076, "learning_rate": 8.54180602006689e-05, "loss": 0.0956, "step": 2180 }, { "epoch": 1.4648829431438126, "grad_norm": 4.007099151611328, "learning_rate": 8.535117056856188e-05, "loss": 0.0937, "step": 2190 }, { "epoch": 1.471571906354515, "grad_norm": 4.514796733856201, "learning_rate": 8.528428093645485e-05, "loss": 0.0773, "step": 2200 }, { "epoch": 1.4782608695652173, "grad_norm": 2.7079756259918213, "learning_rate": 8.521739130434783e-05, "loss": 0.0848, "step": 2210 }, { "epoch": 1.4849498327759196, "grad_norm": 0.419909805059433, "learning_rate": 8.515050167224081e-05, "loss": 0.0627, "step": 2220 }, { "epoch": 1.491638795986622, "grad_norm": 0.8812950849533081, "learning_rate": 8.508361204013378e-05, "loss": 0.0624, "step": 2230 }, { "epoch": 1.4983277591973243, "grad_norm": 2.6033947467803955, "learning_rate": 8.501672240802676e-05, "loss": 0.0989, "step": 2240 }, { "epoch": 1.5050167224080266, "grad_norm": 5.632299423217773, "learning_rate": 8.494983277591973e-05, "loss": 0.0938, "step": 2250 }, { "epoch": 1.511705685618729, "grad_norm": 2.372530221939087, "learning_rate": 8.488294314381271e-05, "loss": 0.0877, "step": 2260 }, { "epoch": 1.5183946488294313, "grad_norm": 3.246079921722412, "learning_rate": 8.481605351170568e-05, "loss": 0.0826, "step": 2270 }, { "epoch": 1.5250836120401337, "grad_norm": 3.317453384399414, "learning_rate": 8.474916387959866e-05, "loss": 0.0775, "step": 2280 }, { "epoch": 1.531772575250836, "grad_norm": 4.335958003997803, "learning_rate": 8.468227424749165e-05, "loss": 0.0666, "step": 2290 }, { "epoch": 1.5384615384615383, "grad_norm": 3.737240791320801, "learning_rate": 8.461538461538461e-05, "loss": 0.0647, "step": 2300 }, { "epoch": 1.5451505016722407, "grad_norm": 1.582314372062683, "learning_rate": 8.45484949832776e-05, "loss": 0.0834, "step": 2310 }, { "epoch": 1.551839464882943, "grad_norm": 3.502819061279297, "learning_rate": 8.448160535117056e-05, "loss": 0.0905, "step": 2320 }, { "epoch": 1.5585284280936453, "grad_norm": 0.8796928524971008, "learning_rate": 8.441471571906356e-05, "loss": 0.0752, "step": 2330 }, { "epoch": 1.5652173913043477, "grad_norm": 3.127408742904663, "learning_rate": 8.434782608695653e-05, "loss": 0.0596, "step": 2340 }, { "epoch": 1.57190635451505, "grad_norm": 2.658625602722168, "learning_rate": 8.42809364548495e-05, "loss": 0.091, "step": 2350 }, { "epoch": 1.5785953177257523, "grad_norm": 1.1064062118530273, "learning_rate": 8.421404682274248e-05, "loss": 0.0784, "step": 2360 }, { "epoch": 1.585284280936455, "grad_norm": 5.8419389724731445, "learning_rate": 8.414715719063545e-05, "loss": 0.085, "step": 2370 }, { "epoch": 1.5919732441471572, "grad_norm": 3.167968273162842, "learning_rate": 8.408026755852844e-05, "loss": 0.1401, "step": 2380 }, { "epoch": 1.5986622073578596, "grad_norm": 6.581013202667236, "learning_rate": 8.401337792642141e-05, "loss": 0.0865, "step": 2390 }, { "epoch": 1.605351170568562, "grad_norm": 0.5135999321937561, "learning_rate": 8.394648829431439e-05, "loss": 0.0883, "step": 2400 }, { "epoch": 1.6120401337792643, "grad_norm": 1.0057544708251953, "learning_rate": 8.387959866220736e-05, "loss": 0.1091, "step": 2410 }, { "epoch": 1.6187290969899666, "grad_norm": 1.3300502300262451, "learning_rate": 8.381270903010033e-05, "loss": 0.0762, "step": 2420 }, { "epoch": 1.625418060200669, "grad_norm": 10.380025863647461, "learning_rate": 8.374581939799332e-05, "loss": 0.0745, "step": 2430 }, { "epoch": 1.6321070234113713, "grad_norm": 0.41489875316619873, "learning_rate": 8.367892976588629e-05, "loss": 0.0659, "step": 2440 }, { "epoch": 1.6387959866220736, "grad_norm": 2.31946063041687, "learning_rate": 8.361204013377927e-05, "loss": 0.069, "step": 2450 }, { "epoch": 1.645484949832776, "grad_norm": 3.089249849319458, "learning_rate": 8.354515050167224e-05, "loss": 0.0932, "step": 2460 }, { "epoch": 1.6521739130434783, "grad_norm": 1.3205960988998413, "learning_rate": 8.347826086956521e-05, "loss": 0.0795, "step": 2470 }, { "epoch": 1.6588628762541806, "grad_norm": 3.7268786430358887, "learning_rate": 8.34113712374582e-05, "loss": 0.0877, "step": 2480 }, { "epoch": 1.665551839464883, "grad_norm": 2.373908758163452, "learning_rate": 8.334448160535117e-05, "loss": 0.0831, "step": 2490 }, { "epoch": 1.6722408026755853, "grad_norm": 0.46990734338760376, "learning_rate": 8.327759197324416e-05, "loss": 0.0741, "step": 2500 }, { "epoch": 1.6789297658862876, "grad_norm": 1.2849098443984985, "learning_rate": 8.321070234113712e-05, "loss": 0.0728, "step": 2510 }, { "epoch": 1.68561872909699, "grad_norm": 8.4157133102417, "learning_rate": 8.31438127090301e-05, "loss": 0.07, "step": 2520 }, { "epoch": 1.6923076923076923, "grad_norm": 2.4645774364471436, "learning_rate": 8.307692307692309e-05, "loss": 0.0717, "step": 2530 }, { "epoch": 1.6989966555183946, "grad_norm": 1.3757083415985107, "learning_rate": 8.301003344481606e-05, "loss": 0.0652, "step": 2540 }, { "epoch": 1.705685618729097, "grad_norm": 6.6431050300598145, "learning_rate": 8.294314381270904e-05, "loss": 0.0796, "step": 2550 }, { "epoch": 1.7123745819397993, "grad_norm": 0.2695080637931824, "learning_rate": 8.2876254180602e-05, "loss": 0.0724, "step": 2560 }, { "epoch": 1.7190635451505016, "grad_norm": 4.07316255569458, "learning_rate": 8.280936454849499e-05, "loss": 0.0898, "step": 2570 }, { "epoch": 1.725752508361204, "grad_norm": 0.32654353976249695, "learning_rate": 8.274247491638797e-05, "loss": 0.0838, "step": 2580 }, { "epoch": 1.7324414715719063, "grad_norm": 4.416987419128418, "learning_rate": 8.267558528428094e-05, "loss": 0.063, "step": 2590 }, { "epoch": 1.7391304347826086, "grad_norm": 3.2613632678985596, "learning_rate": 8.260869565217392e-05, "loss": 0.0756, "step": 2600 }, { "epoch": 1.745819397993311, "grad_norm": 1.9059773683547974, "learning_rate": 8.254180602006689e-05, "loss": 0.0794, "step": 2610 }, { "epoch": 1.7525083612040135, "grad_norm": 4.8873443603515625, "learning_rate": 8.247491638795987e-05, "loss": 0.0819, "step": 2620 }, { "epoch": 1.7591973244147159, "grad_norm": 1.6874901056289673, "learning_rate": 8.240802675585285e-05, "loss": 0.0772, "step": 2630 }, { "epoch": 1.7658862876254182, "grad_norm": 4.475655555725098, "learning_rate": 8.234113712374582e-05, "loss": 0.0745, "step": 2640 }, { "epoch": 1.7725752508361206, "grad_norm": 1.547790288925171, "learning_rate": 8.22742474916388e-05, "loss": 0.0936, "step": 2650 }, { "epoch": 1.779264214046823, "grad_norm": 2.782203197479248, "learning_rate": 8.220735785953177e-05, "loss": 0.082, "step": 2660 }, { "epoch": 1.7859531772575252, "grad_norm": 0.9756410121917725, "learning_rate": 8.214046822742475e-05, "loss": 0.08, "step": 2670 }, { "epoch": 1.7926421404682276, "grad_norm": 4.201378345489502, "learning_rate": 8.207357859531773e-05, "loss": 0.0856, "step": 2680 }, { "epoch": 1.79933110367893, "grad_norm": 11.08785629272461, "learning_rate": 8.200668896321071e-05, "loss": 0.0786, "step": 2690 }, { "epoch": 1.8060200668896322, "grad_norm": 2.3140785694122314, "learning_rate": 8.193979933110368e-05, "loss": 0.0986, "step": 2700 }, { "epoch": 1.8127090301003346, "grad_norm": 4.409708499908447, "learning_rate": 8.187290969899665e-05, "loss": 0.0768, "step": 2710 }, { "epoch": 1.819397993311037, "grad_norm": 5.257035732269287, "learning_rate": 8.180602006688963e-05, "loss": 0.0616, "step": 2720 }, { "epoch": 1.8260869565217392, "grad_norm": 2.065892457962036, "learning_rate": 8.173913043478262e-05, "loss": 0.086, "step": 2730 }, { "epoch": 1.8327759197324416, "grad_norm": 1.7501225471496582, "learning_rate": 8.16722408026756e-05, "loss": 0.0744, "step": 2740 }, { "epoch": 1.839464882943144, "grad_norm": 7.9652557373046875, "learning_rate": 8.160535117056857e-05, "loss": 0.0991, "step": 2750 }, { "epoch": 1.8461538461538463, "grad_norm": 4.254938125610352, "learning_rate": 8.153846153846155e-05, "loss": 0.0789, "step": 2760 }, { "epoch": 1.8528428093645486, "grad_norm": 0.2958781123161316, "learning_rate": 8.147157190635452e-05, "loss": 0.0733, "step": 2770 }, { "epoch": 1.859531772575251, "grad_norm": 0.6094931364059448, "learning_rate": 8.140468227424748e-05, "loss": 0.079, "step": 2780 }, { "epoch": 1.8662207357859533, "grad_norm": 4.592105388641357, "learning_rate": 8.133779264214048e-05, "loss": 0.0717, "step": 2790 }, { "epoch": 1.8729096989966556, "grad_norm": 0.26445138454437256, "learning_rate": 8.127090301003345e-05, "loss": 0.0809, "step": 2800 }, { "epoch": 1.879598662207358, "grad_norm": 7.2576799392700195, "learning_rate": 8.120401337792643e-05, "loss": 0.064, "step": 2810 }, { "epoch": 1.8862876254180603, "grad_norm": 3.6506426334381104, "learning_rate": 8.11371237458194e-05, "loss": 0.0766, "step": 2820 }, { "epoch": 1.8929765886287626, "grad_norm": 1.1458144187927246, "learning_rate": 8.107023411371238e-05, "loss": 0.0811, "step": 2830 }, { "epoch": 1.899665551839465, "grad_norm": 2.733182191848755, "learning_rate": 8.100334448160536e-05, "loss": 0.0685, "step": 2840 }, { "epoch": 1.9063545150501673, "grad_norm": 2.7656662464141846, "learning_rate": 8.093645484949833e-05, "loss": 0.0772, "step": 2850 }, { "epoch": 1.9130434782608696, "grad_norm": 1.3046529293060303, "learning_rate": 8.086956521739131e-05, "loss": 0.0654, "step": 2860 }, { "epoch": 1.919732441471572, "grad_norm": 0.3139551281929016, "learning_rate": 8.080267558528428e-05, "loss": 0.0864, "step": 2870 }, { "epoch": 1.9264214046822743, "grad_norm": 0.244270458817482, "learning_rate": 8.073578595317726e-05, "loss": 0.0698, "step": 2880 }, { "epoch": 1.9331103678929766, "grad_norm": 1.8901079893112183, "learning_rate": 8.066889632107024e-05, "loss": 0.0912, "step": 2890 }, { "epoch": 1.939799331103679, "grad_norm": 3.75931715965271, "learning_rate": 8.060200668896321e-05, "loss": 0.0708, "step": 2900 }, { "epoch": 1.9464882943143813, "grad_norm": 3.791531801223755, "learning_rate": 8.053511705685619e-05, "loss": 0.0808, "step": 2910 }, { "epoch": 1.9531772575250836, "grad_norm": 9.437219619750977, "learning_rate": 8.046822742474916e-05, "loss": 0.1003, "step": 2920 }, { "epoch": 1.959866220735786, "grad_norm": 0.38832196593284607, "learning_rate": 8.040133779264214e-05, "loss": 0.0823, "step": 2930 }, { "epoch": 1.9665551839464883, "grad_norm": 4.084001541137695, "learning_rate": 8.033444816053512e-05, "loss": 0.0799, "step": 2940 }, { "epoch": 1.9732441471571907, "grad_norm": 6.1558837890625, "learning_rate": 8.026755852842809e-05, "loss": 0.0936, "step": 2950 }, { "epoch": 1.979933110367893, "grad_norm": 8.414342880249023, "learning_rate": 8.020066889632107e-05, "loss": 0.0786, "step": 2960 }, { "epoch": 1.9866220735785953, "grad_norm": 2.3542683124542236, "learning_rate": 8.013377926421404e-05, "loss": 0.0725, "step": 2970 }, { "epoch": 1.9933110367892977, "grad_norm": 0.4237585961818695, "learning_rate": 8.006688963210702e-05, "loss": 0.0882, "step": 2980 }, { "epoch": 2.0, "grad_norm": 2.528468132019043, "learning_rate": 8e-05, "loss": 0.0833, "step": 2990 }, { "epoch": 2.0, "eval_loss": 0.0793251246213913, "eval_mse": 0.0793251246213913, "eval_runtime": 230.277, "eval_samples_per_second": 12.984, "eval_steps_per_second": 1.624, "step": 2990 }, { "epoch": 2.0066889632107023, "grad_norm": 0.42202022671699524, "learning_rate": 7.993311036789299e-05, "loss": 0.0608, "step": 3000 }, { "epoch": 2.0133779264214047, "grad_norm": 2.761110544204712, "learning_rate": 7.986622073578596e-05, "loss": 0.0771, "step": 3010 }, { "epoch": 2.020066889632107, "grad_norm": 0.31655821204185486, "learning_rate": 7.979933110367892e-05, "loss": 0.0668, "step": 3020 }, { "epoch": 2.0267558528428093, "grad_norm": 0.8060646653175354, "learning_rate": 7.97324414715719e-05, "loss": 0.1023, "step": 3030 }, { "epoch": 2.0334448160535117, "grad_norm": 4.09045934677124, "learning_rate": 7.966555183946489e-05, "loss": 0.0823, "step": 3040 }, { "epoch": 2.040133779264214, "grad_norm": 3.433368682861328, "learning_rate": 7.959866220735787e-05, "loss": 0.0775, "step": 3050 }, { "epoch": 2.0468227424749164, "grad_norm": 2.6346817016601562, "learning_rate": 7.953177257525084e-05, "loss": 0.0725, "step": 3060 }, { "epoch": 2.0535117056856187, "grad_norm": 4.599377632141113, "learning_rate": 7.946488294314382e-05, "loss": 0.0823, "step": 3070 }, { "epoch": 2.060200668896321, "grad_norm": 4.843750476837158, "learning_rate": 7.939799331103679e-05, "loss": 0.0682, "step": 3080 }, { "epoch": 2.0668896321070234, "grad_norm": 3.9601993560791016, "learning_rate": 7.933110367892977e-05, "loss": 0.0775, "step": 3090 }, { "epoch": 2.0735785953177257, "grad_norm": 0.437264084815979, "learning_rate": 7.926421404682275e-05, "loss": 0.0603, "step": 3100 }, { "epoch": 2.080267558528428, "grad_norm": 2.8297762870788574, "learning_rate": 7.919732441471572e-05, "loss": 0.0815, "step": 3110 }, { "epoch": 2.0869565217391304, "grad_norm": 1.021713137626648, "learning_rate": 7.91304347826087e-05, "loss": 0.0575, "step": 3120 }, { "epoch": 2.0936454849498327, "grad_norm": 0.2781499922275543, "learning_rate": 7.906354515050167e-05, "loss": 0.0761, "step": 3130 }, { "epoch": 2.100334448160535, "grad_norm": 1.3915780782699585, "learning_rate": 7.899665551839465e-05, "loss": 0.0688, "step": 3140 }, { "epoch": 2.1070234113712374, "grad_norm": 1.546858787536621, "learning_rate": 7.892976588628763e-05, "loss": 0.0762, "step": 3150 }, { "epoch": 2.1137123745819397, "grad_norm": 3.6546237468719482, "learning_rate": 7.88628762541806e-05, "loss": 0.0569, "step": 3160 }, { "epoch": 2.120401337792642, "grad_norm": 3.8396148681640625, "learning_rate": 7.879598662207358e-05, "loss": 0.0783, "step": 3170 }, { "epoch": 2.1270903010033444, "grad_norm": 1.614401936531067, "learning_rate": 7.872909698996655e-05, "loss": 0.0782, "step": 3180 }, { "epoch": 2.1337792642140467, "grad_norm": 1.955979347229004, "learning_rate": 7.866220735785953e-05, "loss": 0.0659, "step": 3190 }, { "epoch": 2.140468227424749, "grad_norm": 10.376070976257324, "learning_rate": 7.859531772575252e-05, "loss": 0.0948, "step": 3200 }, { "epoch": 2.1471571906354514, "grad_norm": 0.34439998865127563, "learning_rate": 7.852842809364548e-05, "loss": 0.1018, "step": 3210 }, { "epoch": 2.1538461538461537, "grad_norm": 5.3788862228393555, "learning_rate": 7.846153846153847e-05, "loss": 0.0891, "step": 3220 }, { "epoch": 2.160535117056856, "grad_norm": 1.8117215633392334, "learning_rate": 7.839464882943143e-05, "loss": 0.0854, "step": 3230 }, { "epoch": 2.1672240802675584, "grad_norm": 2.610339403152466, "learning_rate": 7.832775919732442e-05, "loss": 0.0753, "step": 3240 }, { "epoch": 2.1739130434782608, "grad_norm": 1.4385907649993896, "learning_rate": 7.82608695652174e-05, "loss": 0.0785, "step": 3250 }, { "epoch": 2.180602006688963, "grad_norm": 6.955935001373291, "learning_rate": 7.819397993311037e-05, "loss": 0.074, "step": 3260 }, { "epoch": 2.1872909698996654, "grad_norm": 1.2500567436218262, "learning_rate": 7.812709030100335e-05, "loss": 0.0788, "step": 3270 }, { "epoch": 2.1939799331103678, "grad_norm": 3.9600324630737305, "learning_rate": 7.806020066889632e-05, "loss": 0.0749, "step": 3280 }, { "epoch": 2.20066889632107, "grad_norm": 7.548783302307129, "learning_rate": 7.79933110367893e-05, "loss": 0.0903, "step": 3290 }, { "epoch": 2.2073578595317724, "grad_norm": 0.5083264708518982, "learning_rate": 7.792642140468228e-05, "loss": 0.07, "step": 3300 }, { "epoch": 2.2140468227424748, "grad_norm": 0.17018166184425354, "learning_rate": 7.785953177257526e-05, "loss": 0.0867, "step": 3310 }, { "epoch": 2.220735785953177, "grad_norm": 1.3731433153152466, "learning_rate": 7.779264214046823e-05, "loss": 0.0653, "step": 3320 }, { "epoch": 2.2274247491638794, "grad_norm": 1.8665434122085571, "learning_rate": 7.77257525083612e-05, "loss": 0.0951, "step": 3330 }, { "epoch": 2.234113712374582, "grad_norm": 4.514686107635498, "learning_rate": 7.765886287625418e-05, "loss": 0.0513, "step": 3340 }, { "epoch": 2.240802675585284, "grad_norm": 0.6165162324905396, "learning_rate": 7.759197324414716e-05, "loss": 0.0676, "step": 3350 }, { "epoch": 2.2474916387959865, "grad_norm": 1.839833378791809, "learning_rate": 7.752508361204014e-05, "loss": 0.1025, "step": 3360 }, { "epoch": 2.254180602006689, "grad_norm": 2.2654104232788086, "learning_rate": 7.745819397993311e-05, "loss": 0.0821, "step": 3370 }, { "epoch": 2.260869565217391, "grad_norm": 6.743686199188232, "learning_rate": 7.73913043478261e-05, "loss": 0.0741, "step": 3380 }, { "epoch": 2.2675585284280935, "grad_norm": 3.1228573322296143, "learning_rate": 7.732441471571906e-05, "loss": 0.0797, "step": 3390 }, { "epoch": 2.274247491638796, "grad_norm": 0.12537412345409393, "learning_rate": 7.725752508361204e-05, "loss": 0.0856, "step": 3400 }, { "epoch": 2.280936454849498, "grad_norm": 1.829524278640747, "learning_rate": 7.719063545150503e-05, "loss": 0.084, "step": 3410 }, { "epoch": 2.2876254180602005, "grad_norm": 3.228325366973877, "learning_rate": 7.7123745819398e-05, "loss": 0.0894, "step": 3420 }, { "epoch": 2.294314381270903, "grad_norm": 4.834877014160156, "learning_rate": 7.705685618729098e-05, "loss": 0.0778, "step": 3430 }, { "epoch": 2.3010033444816056, "grad_norm": 4.732447624206543, "learning_rate": 7.698996655518394e-05, "loss": 0.1107, "step": 3440 }, { "epoch": 2.3076923076923075, "grad_norm": 2.6647109985351562, "learning_rate": 7.692307692307693e-05, "loss": 0.0858, "step": 3450 }, { "epoch": 2.3143812709030103, "grad_norm": 4.5095415115356445, "learning_rate": 7.685618729096991e-05, "loss": 0.0825, "step": 3460 }, { "epoch": 2.321070234113712, "grad_norm": 1.0812541246414185, "learning_rate": 7.678929765886288e-05, "loss": 0.081, "step": 3470 }, { "epoch": 2.327759197324415, "grad_norm": 7.102626800537109, "learning_rate": 7.672240802675586e-05, "loss": 0.0928, "step": 3480 }, { "epoch": 2.334448160535117, "grad_norm": 4.619367599487305, "learning_rate": 7.665551839464883e-05, "loss": 0.0918, "step": 3490 }, { "epoch": 2.3411371237458196, "grad_norm": 3.246306896209717, "learning_rate": 7.658862876254181e-05, "loss": 0.079, "step": 3500 }, { "epoch": 2.3478260869565215, "grad_norm": 4.399368762969971, "learning_rate": 7.652173913043479e-05, "loss": 0.0943, "step": 3510 }, { "epoch": 2.3545150501672243, "grad_norm": 1.4770549535751343, "learning_rate": 7.645484949832776e-05, "loss": 0.0861, "step": 3520 }, { "epoch": 2.361204013377926, "grad_norm": 3.9449102878570557, "learning_rate": 7.638795986622074e-05, "loss": 0.082, "step": 3530 }, { "epoch": 2.367892976588629, "grad_norm": 3.8862507343292236, "learning_rate": 7.632107023411371e-05, "loss": 0.0703, "step": 3540 }, { "epoch": 2.374581939799331, "grad_norm": 3.4087064266204834, "learning_rate": 7.62541806020067e-05, "loss": 0.0915, "step": 3550 }, { "epoch": 2.3812709030100336, "grad_norm": 0.5102267265319824, "learning_rate": 7.618729096989967e-05, "loss": 0.094, "step": 3560 }, { "epoch": 2.387959866220736, "grad_norm": 1.3995593786239624, "learning_rate": 7.612040133779264e-05, "loss": 0.0895, "step": 3570 }, { "epoch": 2.3946488294314383, "grad_norm": 1.626395583152771, "learning_rate": 7.605351170568562e-05, "loss": 0.0999, "step": 3580 }, { "epoch": 2.4013377926421406, "grad_norm": 1.7516860961914062, "learning_rate": 7.598662207357859e-05, "loss": 0.0853, "step": 3590 }, { "epoch": 2.408026755852843, "grad_norm": 5.477409362792969, "learning_rate": 7.591973244147159e-05, "loss": 0.0775, "step": 3600 }, { "epoch": 2.4147157190635453, "grad_norm": 4.511748313903809, "learning_rate": 7.585284280936455e-05, "loss": 0.0806, "step": 3610 }, { "epoch": 2.4214046822742477, "grad_norm": 3.8276188373565674, "learning_rate": 7.578595317725754e-05, "loss": 0.0785, "step": 3620 }, { "epoch": 2.42809364548495, "grad_norm": 3.8297007083892822, "learning_rate": 7.57190635451505e-05, "loss": 0.0809, "step": 3630 }, { "epoch": 2.4347826086956523, "grad_norm": 3.4355835914611816, "learning_rate": 7.565217391304347e-05, "loss": 0.0806, "step": 3640 }, { "epoch": 2.4414715719063547, "grad_norm": 5.955551624298096, "learning_rate": 7.558528428093647e-05, "loss": 0.076, "step": 3650 }, { "epoch": 2.448160535117057, "grad_norm": 0.3564576804637909, "learning_rate": 7.551839464882944e-05, "loss": 0.0885, "step": 3660 }, { "epoch": 2.4548494983277593, "grad_norm": 0.8594129681587219, "learning_rate": 7.545150501672242e-05, "loss": 0.0723, "step": 3670 }, { "epoch": 2.4615384615384617, "grad_norm": 5.289486408233643, "learning_rate": 7.538461538461539e-05, "loss": 0.1042, "step": 3680 }, { "epoch": 2.468227424749164, "grad_norm": 3.3944950103759766, "learning_rate": 7.531772575250837e-05, "loss": 0.0716, "step": 3690 }, { "epoch": 2.4749163879598663, "grad_norm": 2.454094171524048, "learning_rate": 7.525083612040135e-05, "loss": 0.0709, "step": 3700 }, { "epoch": 2.4816053511705687, "grad_norm": 0.41697606444358826, "learning_rate": 7.518394648829432e-05, "loss": 0.101, "step": 3710 }, { "epoch": 2.488294314381271, "grad_norm": 3.9693078994750977, "learning_rate": 7.51170568561873e-05, "loss": 0.0779, "step": 3720 }, { "epoch": 2.4949832775919734, "grad_norm": 4.420345306396484, "learning_rate": 7.505016722408027e-05, "loss": 0.0738, "step": 3730 }, { "epoch": 2.5016722408026757, "grad_norm": 1.1933796405792236, "learning_rate": 7.498327759197325e-05, "loss": 0.085, "step": 3740 }, { "epoch": 2.508361204013378, "grad_norm": 0.7892850637435913, "learning_rate": 7.491638795986622e-05, "loss": 0.0715, "step": 3750 }, { "epoch": 2.5150501672240804, "grad_norm": 5.415370941162109, "learning_rate": 7.48494983277592e-05, "loss": 0.0833, "step": 3760 }, { "epoch": 2.5217391304347827, "grad_norm": 2.976743698120117, "learning_rate": 7.478260869565218e-05, "loss": 0.0747, "step": 3770 }, { "epoch": 2.528428093645485, "grad_norm": 4.22029972076416, "learning_rate": 7.471571906354515e-05, "loss": 0.0747, "step": 3780 }, { "epoch": 2.5351170568561874, "grad_norm": 0.27683568000793457, "learning_rate": 7.464882943143813e-05, "loss": 0.0635, "step": 3790 }, { "epoch": 2.5418060200668897, "grad_norm": 3.5258395671844482, "learning_rate": 7.45819397993311e-05, "loss": 0.0847, "step": 3800 }, { "epoch": 2.548494983277592, "grad_norm": 1.4285045862197876, "learning_rate": 7.451505016722408e-05, "loss": 0.0561, "step": 3810 }, { "epoch": 2.5551839464882944, "grad_norm": 3.0891616344451904, "learning_rate": 7.444816053511706e-05, "loss": 0.0738, "step": 3820 }, { "epoch": 2.5618729096989967, "grad_norm": 2.099480628967285, "learning_rate": 7.438127090301003e-05, "loss": 0.1008, "step": 3830 }, { "epoch": 2.568561872909699, "grad_norm": 0.7076812982559204, "learning_rate": 7.431438127090301e-05, "loss": 0.0709, "step": 3840 }, { "epoch": 2.5752508361204014, "grad_norm": 4.342235088348389, "learning_rate": 7.424749163879598e-05, "loss": 0.0722, "step": 3850 }, { "epoch": 2.5819397993311037, "grad_norm": 2.153355360031128, "learning_rate": 7.418060200668898e-05, "loss": 0.0706, "step": 3860 }, { "epoch": 2.588628762541806, "grad_norm": 2.8034729957580566, "learning_rate": 7.411371237458194e-05, "loss": 0.0801, "step": 3870 }, { "epoch": 2.5953177257525084, "grad_norm": 2.267157554626465, "learning_rate": 7.404682274247491e-05, "loss": 0.0922, "step": 3880 }, { "epoch": 2.6020066889632107, "grad_norm": 2.9926929473876953, "learning_rate": 7.39799331103679e-05, "loss": 0.0845, "step": 3890 }, { "epoch": 2.608695652173913, "grad_norm": 3.7677009105682373, "learning_rate": 7.391304347826086e-05, "loss": 0.0769, "step": 3900 }, { "epoch": 2.6153846153846154, "grad_norm": 6.213174819946289, "learning_rate": 7.384615384615386e-05, "loss": 0.092, "step": 3910 }, { "epoch": 2.6220735785953178, "grad_norm": 0.17510397732257843, "learning_rate": 7.377926421404683e-05, "loss": 0.0758, "step": 3920 }, { "epoch": 2.62876254180602, "grad_norm": 0.8683602213859558, "learning_rate": 7.371237458193981e-05, "loss": 0.0888, "step": 3930 }, { "epoch": 2.6354515050167224, "grad_norm": 2.76550030708313, "learning_rate": 7.364548494983278e-05, "loss": 0.0666, "step": 3940 }, { "epoch": 2.6421404682274248, "grad_norm": 0.7504422068595886, "learning_rate": 7.357859531772575e-05, "loss": 0.0674, "step": 3950 }, { "epoch": 2.648829431438127, "grad_norm": 1.3935351371765137, "learning_rate": 7.351170568561874e-05, "loss": 0.0858, "step": 3960 }, { "epoch": 2.6555183946488294, "grad_norm": 1.4868053197860718, "learning_rate": 7.344481605351171e-05, "loss": 0.0853, "step": 3970 }, { "epoch": 2.6622073578595318, "grad_norm": 4.424736976623535, "learning_rate": 7.337792642140469e-05, "loss": 0.0801, "step": 3980 }, { "epoch": 2.668896321070234, "grad_norm": 2.0431973934173584, "learning_rate": 7.331103678929766e-05, "loss": 0.0769, "step": 3990 }, { "epoch": 2.6755852842809364, "grad_norm": 1.8978101015090942, "learning_rate": 7.324414715719064e-05, "loss": 0.0883, "step": 4000 }, { "epoch": 2.682274247491639, "grad_norm": 3.53125262260437, "learning_rate": 7.317725752508362e-05, "loss": 0.0925, "step": 4010 }, { "epoch": 2.688963210702341, "grad_norm": 1.823233962059021, "learning_rate": 7.311036789297659e-05, "loss": 0.0686, "step": 4020 }, { "epoch": 2.6956521739130435, "grad_norm": 0.9586164951324463, "learning_rate": 7.304347826086957e-05, "loss": 0.0779, "step": 4030 }, { "epoch": 2.702341137123746, "grad_norm": 3.304990768432617, "learning_rate": 7.297658862876254e-05, "loss": 0.0716, "step": 4040 }, { "epoch": 2.709030100334448, "grad_norm": 5.012158393859863, "learning_rate": 7.290969899665552e-05, "loss": 0.0733, "step": 4050 }, { "epoch": 2.7157190635451505, "grad_norm": 4.342820167541504, "learning_rate": 7.28428093645485e-05, "loss": 0.0782, "step": 4060 }, { "epoch": 2.722408026755853, "grad_norm": 4.751095294952393, "learning_rate": 7.277591973244147e-05, "loss": 0.0687, "step": 4070 }, { "epoch": 2.729096989966555, "grad_norm": 1.3369343280792236, "learning_rate": 7.270903010033445e-05, "loss": 0.0693, "step": 4080 }, { "epoch": 2.7357859531772575, "grad_norm": 2.1024954319000244, "learning_rate": 7.264214046822742e-05, "loss": 0.0776, "step": 4090 }, { "epoch": 2.74247491638796, "grad_norm": 8.521135330200195, "learning_rate": 7.25752508361204e-05, "loss": 0.0761, "step": 4100 }, { "epoch": 2.749163879598662, "grad_norm": 1.152764916419983, "learning_rate": 7.250836120401339e-05, "loss": 0.0959, "step": 4110 }, { "epoch": 2.7558528428093645, "grad_norm": 2.0863466262817383, "learning_rate": 7.244147157190635e-05, "loss": 0.067, "step": 4120 }, { "epoch": 2.762541806020067, "grad_norm": 0.3307766318321228, "learning_rate": 7.237458193979934e-05, "loss": 0.0771, "step": 4130 }, { "epoch": 2.769230769230769, "grad_norm": 1.6041626930236816, "learning_rate": 7.23076923076923e-05, "loss": 0.0711, "step": 4140 }, { "epoch": 2.7759197324414715, "grad_norm": 0.37076133489608765, "learning_rate": 7.224080267558529e-05, "loss": 0.0692, "step": 4150 }, { "epoch": 2.782608695652174, "grad_norm": 1.4220796823501587, "learning_rate": 7.217391304347827e-05, "loss": 0.0867, "step": 4160 }, { "epoch": 2.789297658862876, "grad_norm": 6.7247090339660645, "learning_rate": 7.210702341137125e-05, "loss": 0.0932, "step": 4170 }, { "epoch": 2.7959866220735785, "grad_norm": 0.6634153127670288, "learning_rate": 7.204013377926422e-05, "loss": 0.077, "step": 4180 }, { "epoch": 2.802675585284281, "grad_norm": 1.5781530141830444, "learning_rate": 7.197324414715719e-05, "loss": 0.0726, "step": 4190 }, { "epoch": 2.809364548494983, "grad_norm": 1.701782464981079, "learning_rate": 7.190635451505017e-05, "loss": 0.0616, "step": 4200 }, { "epoch": 2.8160535117056855, "grad_norm": 2.859126329421997, "learning_rate": 7.183946488294315e-05, "loss": 0.088, "step": 4210 }, { "epoch": 2.822742474916388, "grad_norm": 1.8869216442108154, "learning_rate": 7.177257525083613e-05, "loss": 0.0766, "step": 4220 }, { "epoch": 2.82943143812709, "grad_norm": 0.26754462718963623, "learning_rate": 7.17056856187291e-05, "loss": 0.0771, "step": 4230 }, { "epoch": 2.8361204013377925, "grad_norm": 0.4016328454017639, "learning_rate": 7.163879598662208e-05, "loss": 0.0887, "step": 4240 }, { "epoch": 2.842809364548495, "grad_norm": 3.2005503177642822, "learning_rate": 7.157190635451505e-05, "loss": 0.0742, "step": 4250 }, { "epoch": 2.849498327759197, "grad_norm": 3.5342941284179688, "learning_rate": 7.150501672240802e-05, "loss": 0.084, "step": 4260 }, { "epoch": 2.8561872909698995, "grad_norm": 1.4917818307876587, "learning_rate": 7.143812709030101e-05, "loss": 0.0885, "step": 4270 }, { "epoch": 2.862876254180602, "grad_norm": 4.428409099578857, "learning_rate": 7.137123745819398e-05, "loss": 0.0918, "step": 4280 }, { "epoch": 2.869565217391304, "grad_norm": 1.8964093923568726, "learning_rate": 7.130434782608696e-05, "loss": 0.0719, "step": 4290 }, { "epoch": 2.8762541806020065, "grad_norm": 0.6107605695724487, "learning_rate": 7.123745819397993e-05, "loss": 0.0815, "step": 4300 }, { "epoch": 2.882943143812709, "grad_norm": 0.24433688819408417, "learning_rate": 7.117056856187291e-05, "loss": 0.0829, "step": 4310 }, { "epoch": 2.8896321070234112, "grad_norm": 7.567927360534668, "learning_rate": 7.11036789297659e-05, "loss": 0.0786, "step": 4320 }, { "epoch": 2.8963210702341136, "grad_norm": 2.2083115577697754, "learning_rate": 7.103678929765886e-05, "loss": 0.0766, "step": 4330 }, { "epoch": 2.903010033444816, "grad_norm": 0.35634785890579224, "learning_rate": 7.096989966555185e-05, "loss": 0.0723, "step": 4340 }, { "epoch": 2.9096989966555182, "grad_norm": 0.25455230474472046, "learning_rate": 7.090301003344481e-05, "loss": 0.0866, "step": 4350 }, { "epoch": 2.9163879598662206, "grad_norm": 3.7779791355133057, "learning_rate": 7.08361204013378e-05, "loss": 0.0655, "step": 4360 }, { "epoch": 2.9230769230769234, "grad_norm": 8.164266586303711, "learning_rate": 7.076923076923078e-05, "loss": 0.0778, "step": 4370 }, { "epoch": 2.9297658862876252, "grad_norm": 1.9915484189987183, "learning_rate": 7.070234113712375e-05, "loss": 0.0705, "step": 4380 }, { "epoch": 2.936454849498328, "grad_norm": 0.46880605816841125, "learning_rate": 7.063545150501673e-05, "loss": 0.0749, "step": 4390 }, { "epoch": 2.94314381270903, "grad_norm": 0.22903496026992798, "learning_rate": 7.05685618729097e-05, "loss": 0.0599, "step": 4400 }, { "epoch": 2.9498327759197327, "grad_norm": 0.978259265422821, "learning_rate": 7.050167224080268e-05, "loss": 0.0745, "step": 4410 }, { "epoch": 2.9565217391304346, "grad_norm": 2.0547633171081543, "learning_rate": 7.043478260869566e-05, "loss": 0.0722, "step": 4420 }, { "epoch": 2.9632107023411374, "grad_norm": 3.652096748352051, "learning_rate": 7.036789297658863e-05, "loss": 0.0871, "step": 4430 }, { "epoch": 2.9698996655518393, "grad_norm": 3.154127836227417, "learning_rate": 7.030100334448161e-05, "loss": 0.0756, "step": 4440 }, { "epoch": 2.976588628762542, "grad_norm": 0.7841221690177917, "learning_rate": 7.023411371237458e-05, "loss": 0.0825, "step": 4450 }, { "epoch": 2.983277591973244, "grad_norm": 0.5690768957138062, "learning_rate": 7.016722408026756e-05, "loss": 0.0615, "step": 4460 }, { "epoch": 2.9899665551839467, "grad_norm": 4.73695182800293, "learning_rate": 7.010033444816054e-05, "loss": 0.0763, "step": 4470 }, { "epoch": 2.9966555183946486, "grad_norm": 3.4800970554351807, "learning_rate": 7.003344481605352e-05, "loss": 0.074, "step": 4480 }, { "epoch": 3.0, "eval_loss": 0.08599434792995453, "eval_mse": 0.08599434792995453, "eval_runtime": 220.1132, "eval_samples_per_second": 13.584, "eval_steps_per_second": 1.699, "step": 4485 }, { "epoch": 3.0033444816053514, "grad_norm": 0.4188951551914215, "learning_rate": 6.996655518394649e-05, "loss": 0.0636, "step": 4490 }, { "epoch": 3.0100334448160537, "grad_norm": 0.22950677573680878, "learning_rate": 6.989966555183946e-05, "loss": 0.0741, "step": 4500 }, { "epoch": 3.016722408026756, "grad_norm": 5.341685771942139, "learning_rate": 6.983277591973244e-05, "loss": 0.0804, "step": 4510 }, { "epoch": 3.0234113712374584, "grad_norm": 3.4862024784088135, "learning_rate": 6.976588628762542e-05, "loss": 0.0872, "step": 4520 }, { "epoch": 3.0301003344481607, "grad_norm": 0.4761682450771332, "learning_rate": 6.96989966555184e-05, "loss": 0.0795, "step": 4530 }, { "epoch": 3.036789297658863, "grad_norm": 0.49775540828704834, "learning_rate": 6.963210702341137e-05, "loss": 0.0818, "step": 4540 }, { "epoch": 3.0434782608695654, "grad_norm": 2.6934797763824463, "learning_rate": 6.956521739130436e-05, "loss": 0.0588, "step": 4550 }, { "epoch": 3.0501672240802677, "grad_norm": 1.939240574836731, "learning_rate": 6.949832775919732e-05, "loss": 0.0835, "step": 4560 }, { "epoch": 3.05685618729097, "grad_norm": 0.32804200053215027, "learning_rate": 6.94314381270903e-05, "loss": 0.0583, "step": 4570 }, { "epoch": 3.0635451505016724, "grad_norm": 3.560800075531006, "learning_rate": 6.936454849498329e-05, "loss": 0.0757, "step": 4580 }, { "epoch": 3.0702341137123748, "grad_norm": 5.4448065757751465, "learning_rate": 6.929765886287626e-05, "loss": 0.0695, "step": 4590 }, { "epoch": 3.076923076923077, "grad_norm": 3.402082920074463, "learning_rate": 6.923076923076924e-05, "loss": 0.0689, "step": 4600 }, { "epoch": 3.0836120401337794, "grad_norm": 1.682234764099121, "learning_rate": 6.91638795986622e-05, "loss": 0.069, "step": 4610 }, { "epoch": 3.0903010033444818, "grad_norm": 2.9638731479644775, "learning_rate": 6.909698996655519e-05, "loss": 0.0621, "step": 4620 }, { "epoch": 3.096989966555184, "grad_norm": 2.818448543548584, "learning_rate": 6.903010033444817e-05, "loss": 0.0723, "step": 4630 }, { "epoch": 3.1036789297658864, "grad_norm": 2.1265201568603516, "learning_rate": 6.896321070234114e-05, "loss": 0.1035, "step": 4640 }, { "epoch": 3.1103678929765888, "grad_norm": 2.025054931640625, "learning_rate": 6.889632107023412e-05, "loss": 0.0769, "step": 4650 }, { "epoch": 3.117056856187291, "grad_norm": 3.5865674018859863, "learning_rate": 6.882943143812709e-05, "loss": 0.0784, "step": 4660 }, { "epoch": 3.1237458193979935, "grad_norm": 4.239811420440674, "learning_rate": 6.876254180602007e-05, "loss": 0.0805, "step": 4670 }, { "epoch": 3.130434782608696, "grad_norm": 8.931589126586914, "learning_rate": 6.869565217391305e-05, "loss": 0.0751, "step": 4680 }, { "epoch": 3.137123745819398, "grad_norm": 6.283422470092773, "learning_rate": 6.862876254180602e-05, "loss": 0.0885, "step": 4690 }, { "epoch": 3.1438127090301005, "grad_norm": 6.964988708496094, "learning_rate": 6.8561872909699e-05, "loss": 0.0881, "step": 4700 }, { "epoch": 3.150501672240803, "grad_norm": 2.8225507736206055, "learning_rate": 6.849498327759197e-05, "loss": 0.0722, "step": 4710 }, { "epoch": 3.157190635451505, "grad_norm": 0.5449556708335876, "learning_rate": 6.842809364548496e-05, "loss": 0.0861, "step": 4720 }, { "epoch": 3.1638795986622075, "grad_norm": 4.718214511871338, "learning_rate": 6.836120401337793e-05, "loss": 0.085, "step": 4730 }, { "epoch": 3.17056856187291, "grad_norm": 3.2649168968200684, "learning_rate": 6.82943143812709e-05, "loss": 0.0658, "step": 4740 }, { "epoch": 3.177257525083612, "grad_norm": 1.1325840950012207, "learning_rate": 6.822742474916388e-05, "loss": 0.0711, "step": 4750 }, { "epoch": 3.1839464882943145, "grad_norm": 1.378689169883728, "learning_rate": 6.816053511705685e-05, "loss": 0.0538, "step": 4760 }, { "epoch": 3.190635451505017, "grad_norm": 2.0602452754974365, "learning_rate": 6.809364548494983e-05, "loss": 0.0746, "step": 4770 }, { "epoch": 3.197324414715719, "grad_norm": 5.678889274597168, "learning_rate": 6.802675585284281e-05, "loss": 0.0891, "step": 4780 }, { "epoch": 3.2040133779264215, "grad_norm": 1.952165126800537, "learning_rate": 6.79598662207358e-05, "loss": 0.0844, "step": 4790 }, { "epoch": 3.210702341137124, "grad_norm": 2.186636447906494, "learning_rate": 6.789297658862876e-05, "loss": 0.063, "step": 4800 }, { "epoch": 3.217391304347826, "grad_norm": 0.32168132066726685, "learning_rate": 6.782608695652173e-05, "loss": 0.0873, "step": 4810 }, { "epoch": 3.2240802675585285, "grad_norm": 4.04616641998291, "learning_rate": 6.775919732441471e-05, "loss": 0.0696, "step": 4820 }, { "epoch": 3.230769230769231, "grad_norm": 1.4718605279922485, "learning_rate": 6.76923076923077e-05, "loss": 0.0719, "step": 4830 }, { "epoch": 3.237458193979933, "grad_norm": 3.9757933616638184, "learning_rate": 6.762541806020068e-05, "loss": 0.0888, "step": 4840 }, { "epoch": 3.2441471571906355, "grad_norm": 0.3360963463783264, "learning_rate": 6.755852842809365e-05, "loss": 0.0789, "step": 4850 }, { "epoch": 3.250836120401338, "grad_norm": 3.2653157711029053, "learning_rate": 6.749163879598663e-05, "loss": 0.0757, "step": 4860 }, { "epoch": 3.25752508361204, "grad_norm": 1.1410871744155884, "learning_rate": 6.74247491638796e-05, "loss": 0.0675, "step": 4870 }, { "epoch": 3.2642140468227425, "grad_norm": 2.160099506378174, "learning_rate": 6.735785953177258e-05, "loss": 0.0696, "step": 4880 }, { "epoch": 3.270903010033445, "grad_norm": 3.953361749649048, "learning_rate": 6.729096989966556e-05, "loss": 0.0761, "step": 4890 }, { "epoch": 3.277591973244147, "grad_norm": 2.248314380645752, "learning_rate": 6.722408026755853e-05, "loss": 0.0819, "step": 4900 }, { "epoch": 3.2842809364548495, "grad_norm": 1.036260724067688, "learning_rate": 6.715719063545151e-05, "loss": 0.0853, "step": 4910 }, { "epoch": 3.290969899665552, "grad_norm": 1.3433880805969238, "learning_rate": 6.709030100334448e-05, "loss": 0.0715, "step": 4920 }, { "epoch": 3.297658862876254, "grad_norm": 2.211595296859741, "learning_rate": 6.702341137123746e-05, "loss": 0.0541, "step": 4930 }, { "epoch": 3.3043478260869565, "grad_norm": 6.310031414031982, "learning_rate": 6.695652173913044e-05, "loss": 0.0765, "step": 4940 }, { "epoch": 3.311036789297659, "grad_norm": 1.9298906326293945, "learning_rate": 6.688963210702341e-05, "loss": 0.0766, "step": 4950 }, { "epoch": 3.317725752508361, "grad_norm": 0.2267463505268097, "learning_rate": 6.682274247491639e-05, "loss": 0.0627, "step": 4960 }, { "epoch": 3.3244147157190636, "grad_norm": 0.6416743993759155, "learning_rate": 6.675585284280936e-05, "loss": 0.0775, "step": 4970 }, { "epoch": 3.331103678929766, "grad_norm": 3.208157539367676, "learning_rate": 6.668896321070234e-05, "loss": 0.0783, "step": 4980 }, { "epoch": 3.3377926421404682, "grad_norm": 1.661583423614502, "learning_rate": 6.662207357859532e-05, "loss": 0.1092, "step": 4990 }, { "epoch": 3.3444816053511706, "grad_norm": 0.8186647295951843, "learning_rate": 6.655518394648829e-05, "loss": 0.0831, "step": 5000 }, { "epoch": 3.351170568561873, "grad_norm": 7.43472957611084, "learning_rate": 6.648829431438127e-05, "loss": 0.0948, "step": 5010 }, { "epoch": 3.3578595317725752, "grad_norm": 1.8813217878341675, "learning_rate": 6.642140468227424e-05, "loss": 0.076, "step": 5020 }, { "epoch": 3.3645484949832776, "grad_norm": 3.208296060562134, "learning_rate": 6.635451505016724e-05, "loss": 0.0695, "step": 5030 }, { "epoch": 3.37123745819398, "grad_norm": 1.160740613937378, "learning_rate": 6.62876254180602e-05, "loss": 0.0713, "step": 5040 }, { "epoch": 3.3779264214046822, "grad_norm": 4.097474575042725, "learning_rate": 6.622073578595317e-05, "loss": 0.0714, "step": 5050 }, { "epoch": 3.3846153846153846, "grad_norm": 1.559029221534729, "learning_rate": 6.615384615384616e-05, "loss": 0.0682, "step": 5060 }, { "epoch": 3.391304347826087, "grad_norm": 0.24774087965488434, "learning_rate": 6.608695652173912e-05, "loss": 0.0797, "step": 5070 }, { "epoch": 3.3979933110367893, "grad_norm": 3.935304880142212, "learning_rate": 6.602006688963212e-05, "loss": 0.0659, "step": 5080 }, { "epoch": 3.4046822742474916, "grad_norm": 1.9889538288116455, "learning_rate": 6.595317725752509e-05, "loss": 0.0628, "step": 5090 }, { "epoch": 3.411371237458194, "grad_norm": 6.523388385772705, "learning_rate": 6.588628762541807e-05, "loss": 0.0844, "step": 5100 }, { "epoch": 3.4180602006688963, "grad_norm": 0.7219478487968445, "learning_rate": 6.581939799331104e-05, "loss": 0.057, "step": 5110 }, { "epoch": 3.4247491638795986, "grad_norm": 0.7636262774467468, "learning_rate": 6.5752508361204e-05, "loss": 0.0707, "step": 5120 }, { "epoch": 3.431438127090301, "grad_norm": 1.4830290079116821, "learning_rate": 6.5685618729097e-05, "loss": 0.0799, "step": 5130 }, { "epoch": 3.4381270903010033, "grad_norm": 3.003439426422119, "learning_rate": 6.561872909698997e-05, "loss": 0.086, "step": 5140 }, { "epoch": 3.4448160535117056, "grad_norm": 0.34735190868377686, "learning_rate": 6.555183946488295e-05, "loss": 0.0755, "step": 5150 }, { "epoch": 3.451505016722408, "grad_norm": 0.4009545147418976, "learning_rate": 6.548494983277592e-05, "loss": 0.0835, "step": 5160 }, { "epoch": 3.4581939799331103, "grad_norm": 0.18325521051883698, "learning_rate": 6.54180602006689e-05, "loss": 0.0741, "step": 5170 }, { "epoch": 3.4648829431438126, "grad_norm": 0.6062169671058655, "learning_rate": 6.535117056856188e-05, "loss": 0.0679, "step": 5180 }, { "epoch": 3.471571906354515, "grad_norm": 0.6367049217224121, "learning_rate": 6.528428093645485e-05, "loss": 0.0865, "step": 5190 }, { "epoch": 3.4782608695652173, "grad_norm": 3.2684335708618164, "learning_rate": 6.521739130434783e-05, "loss": 0.0671, "step": 5200 }, { "epoch": 3.4849498327759196, "grad_norm": 1.9086042642593384, "learning_rate": 6.51505016722408e-05, "loss": 0.0683, "step": 5210 }, { "epoch": 3.491638795986622, "grad_norm": 3.634255886077881, "learning_rate": 6.508361204013378e-05, "loss": 0.0746, "step": 5220 }, { "epoch": 3.4983277591973243, "grad_norm": 0.94216388463974, "learning_rate": 6.501672240802677e-05, "loss": 0.0774, "step": 5230 }, { "epoch": 3.5050167224080266, "grad_norm": 2.3354969024658203, "learning_rate": 6.494983277591973e-05, "loss": 0.069, "step": 5240 }, { "epoch": 3.511705685618729, "grad_norm": 1.3407578468322754, "learning_rate": 6.488294314381272e-05, "loss": 0.0785, "step": 5250 }, { "epoch": 3.5183946488294313, "grad_norm": 1.516933560371399, "learning_rate": 6.481605351170568e-05, "loss": 0.0837, "step": 5260 }, { "epoch": 3.5250836120401337, "grad_norm": 4.001212120056152, "learning_rate": 6.474916387959867e-05, "loss": 0.0774, "step": 5270 }, { "epoch": 3.531772575250836, "grad_norm": 6.833039283752441, "learning_rate": 6.468227424749163e-05, "loss": 0.0778, "step": 5280 }, { "epoch": 3.5384615384615383, "grad_norm": 0.9666281342506409, "learning_rate": 6.461538461538462e-05, "loss": 0.0628, "step": 5290 }, { "epoch": 3.5451505016722407, "grad_norm": 6.497124195098877, "learning_rate": 6.45484949832776e-05, "loss": 0.077, "step": 5300 }, { "epoch": 3.551839464882943, "grad_norm": 1.4599709510803223, "learning_rate": 6.448160535117057e-05, "loss": 0.0714, "step": 5310 }, { "epoch": 3.5585284280936453, "grad_norm": 3.878899097442627, "learning_rate": 6.441471571906355e-05, "loss": 0.0643, "step": 5320 }, { "epoch": 3.5652173913043477, "grad_norm": 0.6432303786277771, "learning_rate": 6.434782608695652e-05, "loss": 0.0701, "step": 5330 }, { "epoch": 3.57190635451505, "grad_norm": 1.6723068952560425, "learning_rate": 6.428093645484951e-05, "loss": 0.0794, "step": 5340 }, { "epoch": 3.5785953177257523, "grad_norm": 6.770298957824707, "learning_rate": 6.421404682274248e-05, "loss": 0.0909, "step": 5350 }, { "epoch": 3.585284280936455, "grad_norm": 1.5146554708480835, "learning_rate": 6.414715719063545e-05, "loss": 0.0828, "step": 5360 }, { "epoch": 3.591973244147157, "grad_norm": 0.2019585371017456, "learning_rate": 6.408026755852843e-05, "loss": 0.0681, "step": 5370 }, { "epoch": 3.59866220735786, "grad_norm": 0.3240724503993988, "learning_rate": 6.40133779264214e-05, "loss": 0.0846, "step": 5380 }, { "epoch": 3.6053511705685617, "grad_norm": 0.20901960134506226, "learning_rate": 6.39464882943144e-05, "loss": 0.0724, "step": 5390 }, { "epoch": 3.6120401337792645, "grad_norm": 2.741830587387085, "learning_rate": 6.387959866220736e-05, "loss": 0.0717, "step": 5400 }, { "epoch": 3.6187290969899664, "grad_norm": 3.1877925395965576, "learning_rate": 6.381270903010034e-05, "loss": 0.0693, "step": 5410 }, { "epoch": 3.625418060200669, "grad_norm": 1.9705578088760376, "learning_rate": 6.374581939799331e-05, "loss": 0.0648, "step": 5420 }, { "epoch": 3.632107023411371, "grad_norm": 0.9284788370132446, "learning_rate": 6.367892976588628e-05, "loss": 0.064, "step": 5430 }, { "epoch": 3.638795986622074, "grad_norm": 0.4431762993335724, "learning_rate": 6.361204013377928e-05, "loss": 0.0783, "step": 5440 }, { "epoch": 3.6454849498327757, "grad_norm": 2.4773430824279785, "learning_rate": 6.354515050167224e-05, "loss": 0.0885, "step": 5450 }, { "epoch": 3.6521739130434785, "grad_norm": 1.8923101425170898, "learning_rate": 6.347826086956523e-05, "loss": 0.082, "step": 5460 }, { "epoch": 3.6588628762541804, "grad_norm": 4.674037933349609, "learning_rate": 6.34113712374582e-05, "loss": 0.0631, "step": 5470 }, { "epoch": 3.665551839464883, "grad_norm": 0.2656029462814331, "learning_rate": 6.334448160535118e-05, "loss": 0.0736, "step": 5480 }, { "epoch": 3.672240802675585, "grad_norm": 4.043028354644775, "learning_rate": 6.327759197324416e-05, "loss": 0.0697, "step": 5490 }, { "epoch": 3.678929765886288, "grad_norm": 0.7581077218055725, "learning_rate": 6.321070234113713e-05, "loss": 0.0742, "step": 5500 }, { "epoch": 3.6856187290969897, "grad_norm": 0.6322870254516602, "learning_rate": 6.314381270903011e-05, "loss": 0.0704, "step": 5510 }, { "epoch": 3.6923076923076925, "grad_norm": 3.356288433074951, "learning_rate": 6.307692307692308e-05, "loss": 0.0842, "step": 5520 }, { "epoch": 3.6989966555183944, "grad_norm": 1.0668654441833496, "learning_rate": 6.301003344481606e-05, "loss": 0.0595, "step": 5530 }, { "epoch": 3.705685618729097, "grad_norm": 1.2336496114730835, "learning_rate": 6.294314381270904e-05, "loss": 0.0731, "step": 5540 }, { "epoch": 3.712374581939799, "grad_norm": 2.575582504272461, "learning_rate": 6.287625418060201e-05, "loss": 0.0688, "step": 5550 }, { "epoch": 3.719063545150502, "grad_norm": 2.5532429218292236, "learning_rate": 6.280936454849499e-05, "loss": 0.0786, "step": 5560 }, { "epoch": 3.7257525083612038, "grad_norm": 0.3744869828224182, "learning_rate": 6.274247491638796e-05, "loss": 0.081, "step": 5570 }, { "epoch": 3.7324414715719065, "grad_norm": 1.8331739902496338, "learning_rate": 6.267558528428094e-05, "loss": 0.0729, "step": 5580 }, { "epoch": 3.7391304347826084, "grad_norm": 3.997131586074829, "learning_rate": 6.260869565217392e-05, "loss": 0.0744, "step": 5590 }, { "epoch": 3.745819397993311, "grad_norm": 0.29028135538101196, "learning_rate": 6.254180602006689e-05, "loss": 0.0736, "step": 5600 }, { "epoch": 3.7525083612040135, "grad_norm": 1.71115243434906, "learning_rate": 6.247491638795987e-05, "loss": 0.0649, "step": 5610 }, { "epoch": 3.759197324414716, "grad_norm": 0.6625193953514099, "learning_rate": 6.240802675585284e-05, "loss": 0.0844, "step": 5620 }, { "epoch": 3.765886287625418, "grad_norm": 1.3801952600479126, "learning_rate": 6.234113712374582e-05, "loss": 0.0715, "step": 5630 }, { "epoch": 3.7725752508361206, "grad_norm": 0.7554713487625122, "learning_rate": 6.22742474916388e-05, "loss": 0.0667, "step": 5640 }, { "epoch": 3.779264214046823, "grad_norm": 1.6294810771942139, "learning_rate": 6.220735785953178e-05, "loss": 0.0836, "step": 5650 }, { "epoch": 3.7859531772575252, "grad_norm": 1.9828605651855469, "learning_rate": 6.214046822742475e-05, "loss": 0.096, "step": 5660 }, { "epoch": 3.7926421404682276, "grad_norm": 3.5717391967773438, "learning_rate": 6.207357859531772e-05, "loss": 0.0784, "step": 5670 }, { "epoch": 3.79933110367893, "grad_norm": 1.6683013439178467, "learning_rate": 6.20066889632107e-05, "loss": 0.0767, "step": 5680 }, { "epoch": 3.8060200668896322, "grad_norm": 0.3348728120326996, "learning_rate": 6.193979933110368e-05, "loss": 0.0639, "step": 5690 }, { "epoch": 3.8127090301003346, "grad_norm": 0.779590904712677, "learning_rate": 6.187290969899667e-05, "loss": 0.075, "step": 5700 }, { "epoch": 3.819397993311037, "grad_norm": 0.2324654906988144, "learning_rate": 6.180602006688964e-05, "loss": 0.0701, "step": 5710 }, { "epoch": 3.8260869565217392, "grad_norm": 0.3135102689266205, "learning_rate": 6.173913043478262e-05, "loss": 0.0659, "step": 5720 }, { "epoch": 3.8327759197324416, "grad_norm": 1.7761188745498657, "learning_rate": 6.167224080267559e-05, "loss": 0.0852, "step": 5730 }, { "epoch": 3.839464882943144, "grad_norm": 6.760908603668213, "learning_rate": 6.160535117056855e-05, "loss": 0.0892, "step": 5740 }, { "epoch": 3.8461538461538463, "grad_norm": 0.9731970429420471, "learning_rate": 6.153846153846155e-05, "loss": 0.079, "step": 5750 }, { "epoch": 3.8528428093645486, "grad_norm": 2.6022255420684814, "learning_rate": 6.147157190635452e-05, "loss": 0.0601, "step": 5760 }, { "epoch": 3.859531772575251, "grad_norm": 4.185039520263672, "learning_rate": 6.14046822742475e-05, "loss": 0.0726, "step": 5770 }, { "epoch": 3.8662207357859533, "grad_norm": 0.7964357137680054, "learning_rate": 6.133779264214047e-05, "loss": 0.0745, "step": 5780 }, { "epoch": 3.8729096989966556, "grad_norm": 0.5310606956481934, "learning_rate": 6.127090301003345e-05, "loss": 0.0725, "step": 5790 }, { "epoch": 3.879598662207358, "grad_norm": 1.5212914943695068, "learning_rate": 6.120401337792643e-05, "loss": 0.0598, "step": 5800 }, { "epoch": 3.8862876254180603, "grad_norm": 3.6158175468444824, "learning_rate": 6.11371237458194e-05, "loss": 0.0717, "step": 5810 }, { "epoch": 3.8929765886287626, "grad_norm": 0.4467984139919281, "learning_rate": 6.107023411371238e-05, "loss": 0.0625, "step": 5820 }, { "epoch": 3.899665551839465, "grad_norm": 3.0646955966949463, "learning_rate": 6.100334448160535e-05, "loss": 0.0607, "step": 5830 }, { "epoch": 3.9063545150501673, "grad_norm": 2.663433313369751, "learning_rate": 6.0936454849498324e-05, "loss": 0.0595, "step": 5840 }, { "epoch": 3.9130434782608696, "grad_norm": 3.1213862895965576, "learning_rate": 6.086956521739131e-05, "loss": 0.0649, "step": 5850 }, { "epoch": 3.919732441471572, "grad_norm": 2.0309112071990967, "learning_rate": 6.080267558528429e-05, "loss": 0.0699, "step": 5860 }, { "epoch": 3.9264214046822743, "grad_norm": 2.9740498065948486, "learning_rate": 6.073578595317726e-05, "loss": 0.063, "step": 5870 }, { "epoch": 3.9331103678929766, "grad_norm": 3.238435745239258, "learning_rate": 6.066889632107024e-05, "loss": 0.0731, "step": 5880 }, { "epoch": 3.939799331103679, "grad_norm": 1.1853530406951904, "learning_rate": 6.0602006688963206e-05, "loss": 0.0932, "step": 5890 }, { "epoch": 3.9464882943143813, "grad_norm": 0.3700926601886749, "learning_rate": 6.0535117056856194e-05, "loss": 0.0819, "step": 5900 }, { "epoch": 3.9531772575250836, "grad_norm": 2.7900564670562744, "learning_rate": 6.046822742474917e-05, "loss": 0.0664, "step": 5910 }, { "epoch": 3.959866220735786, "grad_norm": 0.8620715141296387, "learning_rate": 6.0401337792642145e-05, "loss": 0.0774, "step": 5920 }, { "epoch": 3.9665551839464883, "grad_norm": 1.8979065418243408, "learning_rate": 6.033444816053512e-05, "loss": 0.0733, "step": 5930 }, { "epoch": 3.9732441471571907, "grad_norm": 3.0854146480560303, "learning_rate": 6.0267558528428095e-05, "loss": 0.0993, "step": 5940 }, { "epoch": 3.979933110367893, "grad_norm": 0.19406002759933472, "learning_rate": 6.0200668896321076e-05, "loss": 0.0686, "step": 5950 }, { "epoch": 3.9866220735785953, "grad_norm": 2.1533548831939697, "learning_rate": 6.013377926421405e-05, "loss": 0.0719, "step": 5960 }, { "epoch": 3.9933110367892977, "grad_norm": 1.405422329902649, "learning_rate": 6.0066889632107026e-05, "loss": 0.0624, "step": 5970 }, { "epoch": 4.0, "grad_norm": 1.511916160583496, "learning_rate": 6e-05, "loss": 0.0779, "step": 5980 }, { "epoch": 4.0, "eval_loss": 0.07555365562438965, "eval_mse": 0.07555365562438965, "eval_runtime": 220.2714, "eval_samples_per_second": 13.574, "eval_steps_per_second": 1.698, "step": 5980 }, { "epoch": 4.006688963210703, "grad_norm": 0.7200626134872437, "learning_rate": 5.9933110367892977e-05, "loss": 0.0659, "step": 5990 }, { "epoch": 4.013377926421405, "grad_norm": 7.074632167816162, "learning_rate": 5.986622073578596e-05, "loss": 0.0949, "step": 6000 }, { "epoch": 4.0200668896321075, "grad_norm": 1.3575361967086792, "learning_rate": 5.9799331103678933e-05, "loss": 0.0656, "step": 6010 }, { "epoch": 4.026755852842809, "grad_norm": 3.287247657775879, "learning_rate": 5.973244147157191e-05, "loss": 0.0626, "step": 6020 }, { "epoch": 4.033444816053512, "grad_norm": 3.9007692337036133, "learning_rate": 5.9665551839464883e-05, "loss": 0.0531, "step": 6030 }, { "epoch": 4.040133779264214, "grad_norm": 0.33864256739616394, "learning_rate": 5.959866220735786e-05, "loss": 0.0533, "step": 6040 }, { "epoch": 4.046822742474917, "grad_norm": 1.0063756704330444, "learning_rate": 5.953177257525085e-05, "loss": 0.076, "step": 6050 }, { "epoch": 4.053511705685619, "grad_norm": 1.4410839080810547, "learning_rate": 5.9464882943143815e-05, "loss": 0.0621, "step": 6060 }, { "epoch": 4.0602006688963215, "grad_norm": 2.621447801589966, "learning_rate": 5.939799331103679e-05, "loss": 0.0666, "step": 6070 }, { "epoch": 4.066889632107023, "grad_norm": 2.981430768966675, "learning_rate": 5.9331103678929765e-05, "loss": 0.0608, "step": 6080 }, { "epoch": 4.073578595317726, "grad_norm": 1.6998625993728638, "learning_rate": 5.926421404682274e-05, "loss": 0.0733, "step": 6090 }, { "epoch": 4.080267558528428, "grad_norm": 0.7376275062561035, "learning_rate": 5.919732441471573e-05, "loss": 0.0707, "step": 6100 }, { "epoch": 4.086956521739131, "grad_norm": 0.9681304693222046, "learning_rate": 5.9130434782608704e-05, "loss": 0.0633, "step": 6110 }, { "epoch": 4.093645484949833, "grad_norm": 1.3118129968643188, "learning_rate": 5.906354515050168e-05, "loss": 0.0586, "step": 6120 }, { "epoch": 4.1003344481605355, "grad_norm": 4.774895191192627, "learning_rate": 5.899665551839465e-05, "loss": 0.067, "step": 6130 }, { "epoch": 4.107023411371237, "grad_norm": 0.9589107632637024, "learning_rate": 5.892976588628762e-05, "loss": 0.0631, "step": 6140 }, { "epoch": 4.11371237458194, "grad_norm": 2.461914300918579, "learning_rate": 5.886287625418061e-05, "loss": 0.0714, "step": 6150 }, { "epoch": 4.120401337792642, "grad_norm": 2.1283793449401855, "learning_rate": 5.8795986622073586e-05, "loss": 0.0841, "step": 6160 }, { "epoch": 4.127090301003345, "grad_norm": 0.8651756048202515, "learning_rate": 5.872909698996656e-05, "loss": 0.0668, "step": 6170 }, { "epoch": 4.133779264214047, "grad_norm": 2.440626859664917, "learning_rate": 5.8662207357859536e-05, "loss": 0.0763, "step": 6180 }, { "epoch": 4.1404682274247495, "grad_norm": 3.1031172275543213, "learning_rate": 5.859531772575251e-05, "loss": 0.0702, "step": 6190 }, { "epoch": 4.147157190635451, "grad_norm": 3.748271942138672, "learning_rate": 5.852842809364549e-05, "loss": 0.0716, "step": 6200 }, { "epoch": 4.153846153846154, "grad_norm": 1.2731395959854126, "learning_rate": 5.846153846153847e-05, "loss": 0.0847, "step": 6210 }, { "epoch": 4.160535117056856, "grad_norm": 0.48583781719207764, "learning_rate": 5.839464882943144e-05, "loss": 0.0604, "step": 6220 }, { "epoch": 4.167224080267559, "grad_norm": 0.9449681639671326, "learning_rate": 5.832775919732442e-05, "loss": 0.075, "step": 6230 }, { "epoch": 4.173913043478261, "grad_norm": 1.5034922361373901, "learning_rate": 5.826086956521739e-05, "loss": 0.0795, "step": 6240 }, { "epoch": 4.1806020066889635, "grad_norm": 6.206334114074707, "learning_rate": 5.819397993311037e-05, "loss": 0.0714, "step": 6250 }, { "epoch": 4.187290969899665, "grad_norm": 1.9744735956192017, "learning_rate": 5.812709030100335e-05, "loss": 0.0671, "step": 6260 }, { "epoch": 4.193979933110368, "grad_norm": 1.4443360567092896, "learning_rate": 5.8060200668896325e-05, "loss": 0.0628, "step": 6270 }, { "epoch": 4.20066889632107, "grad_norm": 3.5589351654052734, "learning_rate": 5.79933110367893e-05, "loss": 0.0724, "step": 6280 }, { "epoch": 4.207357859531773, "grad_norm": 1.0574309825897217, "learning_rate": 5.7926421404682275e-05, "loss": 0.0595, "step": 6290 }, { "epoch": 4.214046822742475, "grad_norm": 0.8373066186904907, "learning_rate": 5.785953177257525e-05, "loss": 0.0755, "step": 6300 }, { "epoch": 4.2207357859531776, "grad_norm": 0.5989928841590881, "learning_rate": 5.779264214046823e-05, "loss": 0.0708, "step": 6310 }, { "epoch": 4.2274247491638794, "grad_norm": 3.6913678646087646, "learning_rate": 5.772575250836121e-05, "loss": 0.0731, "step": 6320 }, { "epoch": 4.234113712374582, "grad_norm": 6.884210586547852, "learning_rate": 5.765886287625418e-05, "loss": 0.0782, "step": 6330 }, { "epoch": 4.240802675585284, "grad_norm": 2.3835830688476562, "learning_rate": 5.759197324414716e-05, "loss": 0.0708, "step": 6340 }, { "epoch": 4.247491638795987, "grad_norm": 1.463895559310913, "learning_rate": 5.752508361204013e-05, "loss": 0.0583, "step": 6350 }, { "epoch": 4.254180602006689, "grad_norm": 3.8699686527252197, "learning_rate": 5.7458193979933114e-05, "loss": 0.0588, "step": 6360 }, { "epoch": 4.260869565217392, "grad_norm": 0.9512161016464233, "learning_rate": 5.739130434782609e-05, "loss": 0.0647, "step": 6370 }, { "epoch": 4.2675585284280935, "grad_norm": 0.32455405592918396, "learning_rate": 5.7324414715719064e-05, "loss": 0.063, "step": 6380 }, { "epoch": 4.274247491638796, "grad_norm": 0.3892381191253662, "learning_rate": 5.725752508361204e-05, "loss": 0.0945, "step": 6390 }, { "epoch": 4.280936454849498, "grad_norm": 0.3295302093029022, "learning_rate": 5.7190635451505014e-05, "loss": 0.0554, "step": 6400 }, { "epoch": 4.287625418060201, "grad_norm": 0.6870085000991821, "learning_rate": 5.7123745819398e-05, "loss": 0.0813, "step": 6410 }, { "epoch": 4.294314381270903, "grad_norm": 0.5889727473258972, "learning_rate": 5.705685618729098e-05, "loss": 0.0788, "step": 6420 }, { "epoch": 4.301003344481606, "grad_norm": 2.1253087520599365, "learning_rate": 5.6989966555183946e-05, "loss": 0.071, "step": 6430 }, { "epoch": 4.3076923076923075, "grad_norm": 2.5222198963165283, "learning_rate": 5.692307692307692e-05, "loss": 0.0553, "step": 6440 }, { "epoch": 4.31438127090301, "grad_norm": 3.0310568809509277, "learning_rate": 5.6856187290969896e-05, "loss": 0.0863, "step": 6450 }, { "epoch": 4.321070234113712, "grad_norm": 5.671077251434326, "learning_rate": 5.6789297658862884e-05, "loss": 0.0625, "step": 6460 }, { "epoch": 4.327759197324415, "grad_norm": 2.766726493835449, "learning_rate": 5.672240802675586e-05, "loss": 0.0711, "step": 6470 }, { "epoch": 4.334448160535117, "grad_norm": 0.4096842110157013, "learning_rate": 5.6655518394648834e-05, "loss": 0.0671, "step": 6480 }, { "epoch": 4.34113712374582, "grad_norm": 8.858095169067383, "learning_rate": 5.658862876254181e-05, "loss": 0.0789, "step": 6490 }, { "epoch": 4.3478260869565215, "grad_norm": 0.9603244066238403, "learning_rate": 5.652173913043478e-05, "loss": 0.0638, "step": 6500 }, { "epoch": 4.354515050167224, "grad_norm": 0.19360791146755219, "learning_rate": 5.6454849498327766e-05, "loss": 0.0669, "step": 6510 }, { "epoch": 4.361204013377926, "grad_norm": 0.7423932552337646, "learning_rate": 5.638795986622074e-05, "loss": 0.0712, "step": 6520 }, { "epoch": 4.367892976588629, "grad_norm": 0.4607205092906952, "learning_rate": 5.6321070234113716e-05, "loss": 0.0818, "step": 6530 }, { "epoch": 4.374581939799331, "grad_norm": 0.6850534677505493, "learning_rate": 5.625418060200669e-05, "loss": 0.0709, "step": 6540 }, { "epoch": 4.381270903010034, "grad_norm": 1.2787178754806519, "learning_rate": 5.6187290969899666e-05, "loss": 0.09, "step": 6550 }, { "epoch": 4.3879598662207355, "grad_norm": 0.7124513387680054, "learning_rate": 5.612040133779265e-05, "loss": 0.0683, "step": 6560 }, { "epoch": 4.394648829431438, "grad_norm": 1.9447962045669556, "learning_rate": 5.605351170568562e-05, "loss": 0.0674, "step": 6570 }, { "epoch": 4.40133779264214, "grad_norm": 3.2783830165863037, "learning_rate": 5.59866220735786e-05, "loss": 0.0754, "step": 6580 }, { "epoch": 4.408026755852843, "grad_norm": 1.072465419769287, "learning_rate": 5.591973244147157e-05, "loss": 0.0757, "step": 6590 }, { "epoch": 4.414715719063545, "grad_norm": 4.134634971618652, "learning_rate": 5.585284280936455e-05, "loss": 0.0573, "step": 6600 }, { "epoch": 4.421404682274248, "grad_norm": 6.002267837524414, "learning_rate": 5.578595317725753e-05, "loss": 0.074, "step": 6610 }, { "epoch": 4.4280936454849495, "grad_norm": 1.9408398866653442, "learning_rate": 5.5719063545150505e-05, "loss": 0.0671, "step": 6620 }, { "epoch": 4.434782608695652, "grad_norm": 1.891671895980835, "learning_rate": 5.565217391304348e-05, "loss": 0.0655, "step": 6630 }, { "epoch": 4.441471571906354, "grad_norm": 5.870092391967773, "learning_rate": 5.5585284280936455e-05, "loss": 0.069, "step": 6640 }, { "epoch": 4.448160535117057, "grad_norm": 5.0089921951293945, "learning_rate": 5.551839464882943e-05, "loss": 0.0658, "step": 6650 }, { "epoch": 4.454849498327759, "grad_norm": 1.1499357223510742, "learning_rate": 5.545150501672242e-05, "loss": 0.0855, "step": 6660 }, { "epoch": 4.461538461538462, "grad_norm": 2.1092236042022705, "learning_rate": 5.538461538461539e-05, "loss": 0.0826, "step": 6670 }, { "epoch": 4.468227424749164, "grad_norm": 2.6977195739746094, "learning_rate": 5.531772575250836e-05, "loss": 0.0622, "step": 6680 }, { "epoch": 4.474916387959866, "grad_norm": 2.9836738109588623, "learning_rate": 5.525083612040134e-05, "loss": 0.0659, "step": 6690 }, { "epoch": 4.481605351170568, "grad_norm": 0.8441120982170105, "learning_rate": 5.518394648829431e-05, "loss": 0.0663, "step": 6700 }, { "epoch": 4.488294314381271, "grad_norm": 1.7105242013931274, "learning_rate": 5.51170568561873e-05, "loss": 0.0787, "step": 6710 }, { "epoch": 4.494983277591973, "grad_norm": 3.3104045391082764, "learning_rate": 5.5050167224080276e-05, "loss": 0.072, "step": 6720 }, { "epoch": 4.501672240802676, "grad_norm": 0.2517462372779846, "learning_rate": 5.498327759197325e-05, "loss": 0.0592, "step": 6730 }, { "epoch": 4.508361204013378, "grad_norm": 3.130553722381592, "learning_rate": 5.491638795986622e-05, "loss": 0.0597, "step": 6740 }, { "epoch": 4.51505016722408, "grad_norm": 2.540755033493042, "learning_rate": 5.4849498327759194e-05, "loss": 0.0527, "step": 6750 }, { "epoch": 4.521739130434782, "grad_norm": 4.94587516784668, "learning_rate": 5.478260869565217e-05, "loss": 0.0755, "step": 6760 }, { "epoch": 4.528428093645485, "grad_norm": 5.80070161819458, "learning_rate": 5.471571906354516e-05, "loss": 0.0653, "step": 6770 }, { "epoch": 4.535117056856187, "grad_norm": 1.3168888092041016, "learning_rate": 5.464882943143813e-05, "loss": 0.074, "step": 6780 }, { "epoch": 4.54180602006689, "grad_norm": 1.5611118078231812, "learning_rate": 5.458193979933111e-05, "loss": 0.0709, "step": 6790 }, { "epoch": 4.548494983277592, "grad_norm": 2.30637264251709, "learning_rate": 5.451505016722408e-05, "loss": 0.0769, "step": 6800 }, { "epoch": 4.555183946488294, "grad_norm": 2.6152946949005127, "learning_rate": 5.444816053511705e-05, "loss": 0.079, "step": 6810 }, { "epoch": 4.561872909698996, "grad_norm": 0.3139130473136902, "learning_rate": 5.438127090301004e-05, "loss": 0.055, "step": 6820 }, { "epoch": 4.568561872909699, "grad_norm": 0.623741626739502, "learning_rate": 5.4314381270903015e-05, "loss": 0.0619, "step": 6830 }, { "epoch": 4.575250836120401, "grad_norm": 1.389875054359436, "learning_rate": 5.424749163879599e-05, "loss": 0.0686, "step": 6840 }, { "epoch": 4.581939799331104, "grad_norm": 0.4257790744304657, "learning_rate": 5.4180602006688965e-05, "loss": 0.0872, "step": 6850 }, { "epoch": 4.588628762541806, "grad_norm": 0.7867361307144165, "learning_rate": 5.411371237458194e-05, "loss": 0.0746, "step": 6860 }, { "epoch": 4.595317725752508, "grad_norm": 1.4453585147857666, "learning_rate": 5.404682274247492e-05, "loss": 0.0605, "step": 6870 }, { "epoch": 4.602006688963211, "grad_norm": 1.342901349067688, "learning_rate": 5.3979933110367897e-05, "loss": 0.0675, "step": 6880 }, { "epoch": 4.608695652173913, "grad_norm": 2.2974514961242676, "learning_rate": 5.391304347826087e-05, "loss": 0.0748, "step": 6890 }, { "epoch": 4.615384615384615, "grad_norm": 6.246169567108154, "learning_rate": 5.384615384615385e-05, "loss": 0.0907, "step": 6900 }, { "epoch": 4.622073578595318, "grad_norm": 0.721906304359436, "learning_rate": 5.377926421404682e-05, "loss": 0.0707, "step": 6910 }, { "epoch": 4.6287625418060205, "grad_norm": 0.5669277310371399, "learning_rate": 5.3712374581939803e-05, "loss": 0.0728, "step": 6920 }, { "epoch": 4.635451505016722, "grad_norm": 6.285121917724609, "learning_rate": 5.364548494983278e-05, "loss": 0.0923, "step": 6930 }, { "epoch": 4.642140468227424, "grad_norm": 1.2872087955474854, "learning_rate": 5.3578595317725754e-05, "loss": 0.0735, "step": 6940 }, { "epoch": 4.648829431438127, "grad_norm": 3.112480878829956, "learning_rate": 5.351170568561873e-05, "loss": 0.0638, "step": 6950 }, { "epoch": 4.65551839464883, "grad_norm": 3.5378661155700684, "learning_rate": 5.3444816053511704e-05, "loss": 0.0825, "step": 6960 }, { "epoch": 4.662207357859532, "grad_norm": 0.5296181440353394, "learning_rate": 5.337792642140469e-05, "loss": 0.0756, "step": 6970 }, { "epoch": 4.668896321070234, "grad_norm": 5.031908988952637, "learning_rate": 5.331103678929766e-05, "loss": 0.077, "step": 6980 }, { "epoch": 4.6755852842809364, "grad_norm": 0.9596044421195984, "learning_rate": 5.3244147157190635e-05, "loss": 0.0684, "step": 6990 }, { "epoch": 4.682274247491639, "grad_norm": 1.9678120613098145, "learning_rate": 5.317725752508361e-05, "loss": 0.0816, "step": 7000 }, { "epoch": 4.688963210702341, "grad_norm": 2.694352149963379, "learning_rate": 5.3110367892976586e-05, "loss": 0.0808, "step": 7010 }, { "epoch": 4.695652173913043, "grad_norm": 2.666954517364502, "learning_rate": 5.3043478260869574e-05, "loss": 0.0788, "step": 7020 }, { "epoch": 4.702341137123746, "grad_norm": 0.9346604347229004, "learning_rate": 5.297658862876255e-05, "loss": 0.0742, "step": 7030 }, { "epoch": 4.709030100334449, "grad_norm": 1.9348862171173096, "learning_rate": 5.2909698996655524e-05, "loss": 0.0748, "step": 7040 }, { "epoch": 4.7157190635451505, "grad_norm": 0.41695094108581543, "learning_rate": 5.284280936454849e-05, "loss": 0.0708, "step": 7050 }, { "epoch": 4.722408026755852, "grad_norm": 1.0469141006469727, "learning_rate": 5.277591973244147e-05, "loss": 0.0875, "step": 7060 }, { "epoch": 4.729096989966555, "grad_norm": 1.3788455724716187, "learning_rate": 5.2709030100334456e-05, "loss": 0.0542, "step": 7070 }, { "epoch": 4.735785953177258, "grad_norm": 1.5151710510253906, "learning_rate": 5.264214046822743e-05, "loss": 0.0576, "step": 7080 }, { "epoch": 4.74247491638796, "grad_norm": 5.557274341583252, "learning_rate": 5.2575250836120406e-05, "loss": 0.0848, "step": 7090 }, { "epoch": 4.749163879598662, "grad_norm": 3.972078561782837, "learning_rate": 5.250836120401338e-05, "loss": 0.0757, "step": 7100 }, { "epoch": 4.7558528428093645, "grad_norm": 1.610681176185608, "learning_rate": 5.2441471571906356e-05, "loss": 0.0678, "step": 7110 }, { "epoch": 4.762541806020067, "grad_norm": 3.7423255443573, "learning_rate": 5.237458193979934e-05, "loss": 0.0767, "step": 7120 }, { "epoch": 4.769230769230769, "grad_norm": 1.9917365312576294, "learning_rate": 5.230769230769231e-05, "loss": 0.0757, "step": 7130 }, { "epoch": 4.775919732441472, "grad_norm": 0.4224523901939392, "learning_rate": 5.224080267558529e-05, "loss": 0.0613, "step": 7140 }, { "epoch": 4.782608695652174, "grad_norm": 2.9698522090911865, "learning_rate": 5.217391304347826e-05, "loss": 0.0731, "step": 7150 }, { "epoch": 4.789297658862877, "grad_norm": 1.6401015520095825, "learning_rate": 5.210702341137124e-05, "loss": 0.0685, "step": 7160 }, { "epoch": 4.7959866220735785, "grad_norm": 1.937436580657959, "learning_rate": 5.204013377926422e-05, "loss": 0.0662, "step": 7170 }, { "epoch": 4.802675585284281, "grad_norm": 2.4356794357299805, "learning_rate": 5.1973244147157195e-05, "loss": 0.0585, "step": 7180 }, { "epoch": 4.809364548494983, "grad_norm": 1.4742059707641602, "learning_rate": 5.190635451505017e-05, "loss": 0.0684, "step": 7190 }, { "epoch": 4.816053511705686, "grad_norm": 4.7270965576171875, "learning_rate": 5.1839464882943145e-05, "loss": 0.0754, "step": 7200 }, { "epoch": 4.822742474916388, "grad_norm": 2.557396173477173, "learning_rate": 5.177257525083612e-05, "loss": 0.0626, "step": 7210 }, { "epoch": 4.829431438127091, "grad_norm": 3.0992376804351807, "learning_rate": 5.17056856187291e-05, "loss": 0.0718, "step": 7220 }, { "epoch": 4.8361204013377925, "grad_norm": 3.5227317810058594, "learning_rate": 5.163879598662208e-05, "loss": 0.0696, "step": 7230 }, { "epoch": 4.842809364548495, "grad_norm": 0.42812949419021606, "learning_rate": 5.157190635451505e-05, "loss": 0.0622, "step": 7240 }, { "epoch": 4.849498327759197, "grad_norm": 0.9504969120025635, "learning_rate": 5.150501672240803e-05, "loss": 0.0587, "step": 7250 }, { "epoch": 4.8561872909699, "grad_norm": 1.9355140924453735, "learning_rate": 5.1438127090301e-05, "loss": 0.0721, "step": 7260 }, { "epoch": 4.862876254180602, "grad_norm": 1.7390964031219482, "learning_rate": 5.137123745819398e-05, "loss": 0.0733, "step": 7270 }, { "epoch": 4.869565217391305, "grad_norm": 0.4878229796886444, "learning_rate": 5.1304347826086966e-05, "loss": 0.0678, "step": 7280 }, { "epoch": 4.8762541806020065, "grad_norm": 0.5825105905532837, "learning_rate": 5.1237458193979934e-05, "loss": 0.0633, "step": 7290 }, { "epoch": 4.882943143812709, "grad_norm": 1.2219446897506714, "learning_rate": 5.117056856187291e-05, "loss": 0.074, "step": 7300 }, { "epoch": 4.889632107023411, "grad_norm": 7.823890209197998, "learning_rate": 5.1103678929765884e-05, "loss": 0.0663, "step": 7310 }, { "epoch": 4.896321070234114, "grad_norm": 1.276883602142334, "learning_rate": 5.103678929765886e-05, "loss": 0.068, "step": 7320 }, { "epoch": 4.903010033444816, "grad_norm": 0.39954355359077454, "learning_rate": 5.096989966555185e-05, "loss": 0.0672, "step": 7330 }, { "epoch": 4.909698996655519, "grad_norm": 1.9998856782913208, "learning_rate": 5.090301003344482e-05, "loss": 0.0663, "step": 7340 }, { "epoch": 4.916387959866221, "grad_norm": 3.3592422008514404, "learning_rate": 5.08361204013378e-05, "loss": 0.0717, "step": 7350 }, { "epoch": 4.923076923076923, "grad_norm": 2.421261787414551, "learning_rate": 5.0769230769230766e-05, "loss": 0.0778, "step": 7360 }, { "epoch": 4.929765886287625, "grad_norm": 3.719966411590576, "learning_rate": 5.070234113712374e-05, "loss": 0.067, "step": 7370 }, { "epoch": 4.936454849498328, "grad_norm": 0.6203343868255615, "learning_rate": 5.063545150501673e-05, "loss": 0.0643, "step": 7380 }, { "epoch": 4.94314381270903, "grad_norm": 0.692283570766449, "learning_rate": 5.0568561872909704e-05, "loss": 0.0684, "step": 7390 }, { "epoch": 4.949832775919733, "grad_norm": 1.4749492406845093, "learning_rate": 5.050167224080268e-05, "loss": 0.0746, "step": 7400 }, { "epoch": 4.956521739130435, "grad_norm": 2.1798877716064453, "learning_rate": 5.0434782608695655e-05, "loss": 0.0655, "step": 7410 }, { "epoch": 4.963210702341137, "grad_norm": 2.3222920894622803, "learning_rate": 5.036789297658863e-05, "loss": 0.0658, "step": 7420 }, { "epoch": 4.969899665551839, "grad_norm": 3.0843615531921387, "learning_rate": 5.030100334448161e-05, "loss": 0.0573, "step": 7430 }, { "epoch": 4.976588628762542, "grad_norm": 2.0864250659942627, "learning_rate": 5.0234113712374586e-05, "loss": 0.0607, "step": 7440 }, { "epoch": 4.983277591973244, "grad_norm": 2.6795477867126465, "learning_rate": 5.016722408026756e-05, "loss": 0.0631, "step": 7450 }, { "epoch": 4.989966555183947, "grad_norm": 0.6230664849281311, "learning_rate": 5.0100334448160536e-05, "loss": 0.066, "step": 7460 }, { "epoch": 4.996655518394649, "grad_norm": 2.3801419734954834, "learning_rate": 5.003344481605351e-05, "loss": 0.0841, "step": 7470 }, { "epoch": 5.0, "eval_loss": 0.07865732908248901, "eval_mse": 0.07865732908248901, "eval_runtime": 221.7747, "eval_samples_per_second": 13.482, "eval_steps_per_second": 1.686, "step": 7475 }, { "epoch": 5.003344481605351, "grad_norm": 3.762542486190796, "learning_rate": 4.9966555183946487e-05, "loss": 0.062, "step": 7480 }, { "epoch": 5.010033444816053, "grad_norm": 1.8806613683700562, "learning_rate": 4.989966555183947e-05, "loss": 0.0603, "step": 7490 }, { "epoch": 5.016722408026756, "grad_norm": 5.983439922332764, "learning_rate": 4.983277591973244e-05, "loss": 0.0627, "step": 7500 }, { "epoch": 5.023411371237458, "grad_norm": 1.454197645187378, "learning_rate": 4.976588628762542e-05, "loss": 0.0464, "step": 7510 }, { "epoch": 5.030100334448161, "grad_norm": 3.129765510559082, "learning_rate": 4.96989966555184e-05, "loss": 0.0504, "step": 7520 }, { "epoch": 5.036789297658863, "grad_norm": 1.3404700756072998, "learning_rate": 4.9632107023411375e-05, "loss": 0.0671, "step": 7530 }, { "epoch": 5.043478260869565, "grad_norm": 4.357332706451416, "learning_rate": 4.956521739130435e-05, "loss": 0.0741, "step": 7540 }, { "epoch": 5.050167224080267, "grad_norm": 1.8956897258758545, "learning_rate": 4.9498327759197325e-05, "loss": 0.0676, "step": 7550 }, { "epoch": 5.05685618729097, "grad_norm": 5.321130275726318, "learning_rate": 4.94314381270903e-05, "loss": 0.0732, "step": 7560 }, { "epoch": 5.063545150501672, "grad_norm": 0.6879299283027649, "learning_rate": 4.936454849498328e-05, "loss": 0.0569, "step": 7570 }, { "epoch": 5.070234113712375, "grad_norm": 2.887319326400757, "learning_rate": 4.929765886287626e-05, "loss": 0.0625, "step": 7580 }, { "epoch": 5.076923076923077, "grad_norm": 5.584492206573486, "learning_rate": 4.923076923076924e-05, "loss": 0.0664, "step": 7590 }, { "epoch": 5.083612040133779, "grad_norm": 7.167271137237549, "learning_rate": 4.916387959866221e-05, "loss": 0.0637, "step": 7600 }, { "epoch": 5.090301003344481, "grad_norm": 1.2168267965316772, "learning_rate": 4.909698996655518e-05, "loss": 0.0751, "step": 7610 }, { "epoch": 5.096989966555184, "grad_norm": 3.569265127182007, "learning_rate": 4.9030100334448164e-05, "loss": 0.0605, "step": 7620 }, { "epoch": 5.103678929765886, "grad_norm": 1.1153556108474731, "learning_rate": 4.896321070234114e-05, "loss": 0.0662, "step": 7630 }, { "epoch": 5.110367892976589, "grad_norm": 0.45676007866859436, "learning_rate": 4.8896321070234114e-05, "loss": 0.0531, "step": 7640 }, { "epoch": 5.117056856187291, "grad_norm": 3.829061269760132, "learning_rate": 4.8829431438127096e-05, "loss": 0.0575, "step": 7650 }, { "epoch": 5.1237458193979935, "grad_norm": 1.676592469215393, "learning_rate": 4.876254180602007e-05, "loss": 0.06, "step": 7660 }, { "epoch": 5.130434782608695, "grad_norm": 0.6828649640083313, "learning_rate": 4.8695652173913046e-05, "loss": 0.0578, "step": 7670 }, { "epoch": 5.137123745819398, "grad_norm": 3.1590874195098877, "learning_rate": 4.862876254180602e-05, "loss": 0.0775, "step": 7680 }, { "epoch": 5.1438127090301, "grad_norm": 0.5831063985824585, "learning_rate": 4.8561872909698996e-05, "loss": 0.0808, "step": 7690 }, { "epoch": 5.150501672240803, "grad_norm": 0.17127808928489685, "learning_rate": 4.849498327759198e-05, "loss": 0.0548, "step": 7700 }, { "epoch": 5.157190635451505, "grad_norm": 3.0591585636138916, "learning_rate": 4.842809364548495e-05, "loss": 0.0789, "step": 7710 }, { "epoch": 5.1638795986622075, "grad_norm": 0.9182135462760925, "learning_rate": 4.836120401337793e-05, "loss": 0.0649, "step": 7720 }, { "epoch": 5.170568561872909, "grad_norm": 0.8472630977630615, "learning_rate": 4.82943143812709e-05, "loss": 0.0585, "step": 7730 }, { "epoch": 5.177257525083612, "grad_norm": 2.960695266723633, "learning_rate": 4.822742474916388e-05, "loss": 0.0596, "step": 7740 }, { "epoch": 5.183946488294314, "grad_norm": 1.086195468902588, "learning_rate": 4.816053511705686e-05, "loss": 0.0599, "step": 7750 }, { "epoch": 5.190635451505017, "grad_norm": 3.7133307456970215, "learning_rate": 4.8093645484949835e-05, "loss": 0.05, "step": 7760 }, { "epoch": 5.197324414715719, "grad_norm": 1.9042307138442993, "learning_rate": 4.802675585284282e-05, "loss": 0.0653, "step": 7770 }, { "epoch": 5.2040133779264215, "grad_norm": 2.424346685409546, "learning_rate": 4.795986622073579e-05, "loss": 0.0531, "step": 7780 }, { "epoch": 5.210702341137123, "grad_norm": 0.8446156978607178, "learning_rate": 4.789297658862876e-05, "loss": 0.0542, "step": 7790 }, { "epoch": 5.217391304347826, "grad_norm": 5.944769859313965, "learning_rate": 4.782608695652174e-05, "loss": 0.0634, "step": 7800 }, { "epoch": 5.224080267558528, "grad_norm": 0.392313688993454, "learning_rate": 4.775919732441472e-05, "loss": 0.0522, "step": 7810 }, { "epoch": 5.230769230769231, "grad_norm": 3.464770793914795, "learning_rate": 4.76923076923077e-05, "loss": 0.0695, "step": 7820 }, { "epoch": 5.237458193979933, "grad_norm": 5.194563865661621, "learning_rate": 4.7625418060200674e-05, "loss": 0.047, "step": 7830 }, { "epoch": 5.2441471571906355, "grad_norm": 3.7293553352355957, "learning_rate": 4.755852842809365e-05, "loss": 0.0679, "step": 7840 }, { "epoch": 5.250836120401337, "grad_norm": 1.6341503858566284, "learning_rate": 4.7491638795986624e-05, "loss": 0.069, "step": 7850 }, { "epoch": 5.25752508361204, "grad_norm": 2.9764022827148438, "learning_rate": 4.74247491638796e-05, "loss": 0.056, "step": 7860 }, { "epoch": 5.264214046822742, "grad_norm": 0.9984621405601501, "learning_rate": 4.7357859531772574e-05, "loss": 0.0619, "step": 7870 }, { "epoch": 5.270903010033445, "grad_norm": 1.5726069211959839, "learning_rate": 4.7290969899665556e-05, "loss": 0.0554, "step": 7880 }, { "epoch": 5.277591973244147, "grad_norm": 2.6059932708740234, "learning_rate": 4.722408026755853e-05, "loss": 0.0486, "step": 7890 }, { "epoch": 5.2842809364548495, "grad_norm": 1.7960805892944336, "learning_rate": 4.715719063545151e-05, "loss": 0.0602, "step": 7900 }, { "epoch": 5.290969899665551, "grad_norm": 1.2459561824798584, "learning_rate": 4.709030100334448e-05, "loss": 0.0685, "step": 7910 }, { "epoch": 5.297658862876254, "grad_norm": 0.4798709750175476, "learning_rate": 4.7023411371237456e-05, "loss": 0.0456, "step": 7920 }, { "epoch": 5.304347826086957, "grad_norm": 1.6899269819259644, "learning_rate": 4.695652173913044e-05, "loss": 0.0856, "step": 7930 }, { "epoch": 5.311036789297659, "grad_norm": 4.1698899269104, "learning_rate": 4.688963210702341e-05, "loss": 0.071, "step": 7940 }, { "epoch": 5.317725752508361, "grad_norm": 3.4093782901763916, "learning_rate": 4.6822742474916394e-05, "loss": 0.0696, "step": 7950 }, { "epoch": 5.3244147157190636, "grad_norm": 5.086987018585205, "learning_rate": 4.675585284280937e-05, "loss": 0.0815, "step": 7960 }, { "epoch": 5.331103678929766, "grad_norm": 0.9084048867225647, "learning_rate": 4.6688963210702344e-05, "loss": 0.0613, "step": 7970 }, { "epoch": 5.337792642140468, "grad_norm": 1.4398747682571411, "learning_rate": 4.662207357859532e-05, "loss": 0.0737, "step": 7980 }, { "epoch": 5.34448160535117, "grad_norm": 1.1616789102554321, "learning_rate": 4.6555183946488294e-05, "loss": 0.0661, "step": 7990 }, { "epoch": 5.351170568561873, "grad_norm": 1.2757660150527954, "learning_rate": 4.6488294314381276e-05, "loss": 0.0581, "step": 8000 }, { "epoch": 5.357859531772576, "grad_norm": 0.2481812834739685, "learning_rate": 4.642140468227425e-05, "loss": 0.0695, "step": 8010 }, { "epoch": 5.364548494983278, "grad_norm": 0.8452910780906677, "learning_rate": 4.6354515050167226e-05, "loss": 0.058, "step": 8020 }, { "epoch": 5.3712374581939795, "grad_norm": 5.494908332824707, "learning_rate": 4.62876254180602e-05, "loss": 0.0755, "step": 8030 }, { "epoch": 5.377926421404682, "grad_norm": 3.610271692276001, "learning_rate": 4.6220735785953176e-05, "loss": 0.0704, "step": 8040 }, { "epoch": 5.384615384615385, "grad_norm": 0.33079513907432556, "learning_rate": 4.615384615384616e-05, "loss": 0.059, "step": 8050 }, { "epoch": 5.391304347826087, "grad_norm": 3.8127245903015137, "learning_rate": 4.608695652173913e-05, "loss": 0.0665, "step": 8060 }, { "epoch": 5.39799331103679, "grad_norm": 1.6623046398162842, "learning_rate": 4.602006688963211e-05, "loss": 0.0913, "step": 8070 }, { "epoch": 5.404682274247492, "grad_norm": 2.1431431770324707, "learning_rate": 4.595317725752509e-05, "loss": 0.0615, "step": 8080 }, { "epoch": 5.411371237458194, "grad_norm": 3.0918350219726562, "learning_rate": 4.5886287625418065e-05, "loss": 0.0647, "step": 8090 }, { "epoch": 5.418060200668896, "grad_norm": 0.3633984625339508, "learning_rate": 4.581939799331103e-05, "loss": 0.0587, "step": 8100 }, { "epoch": 5.424749163879599, "grad_norm": 0.8349478244781494, "learning_rate": 4.5752508361204015e-05, "loss": 0.0743, "step": 8110 }, { "epoch": 5.431438127090301, "grad_norm": 0.17242014408111572, "learning_rate": 4.568561872909699e-05, "loss": 0.0713, "step": 8120 }, { "epoch": 5.438127090301004, "grad_norm": 1.0146863460540771, "learning_rate": 4.561872909698997e-05, "loss": 0.0519, "step": 8130 }, { "epoch": 5.444816053511706, "grad_norm": 1.4058053493499756, "learning_rate": 4.555183946488295e-05, "loss": 0.057, "step": 8140 }, { "epoch": 5.451505016722408, "grad_norm": 2.4681694507598877, "learning_rate": 4.548494983277592e-05, "loss": 0.0603, "step": 8150 }, { "epoch": 5.45819397993311, "grad_norm": 0.7661647796630859, "learning_rate": 4.54180602006689e-05, "loss": 0.055, "step": 8160 }, { "epoch": 5.464882943143813, "grad_norm": 1.6556464433670044, "learning_rate": 4.535117056856187e-05, "loss": 0.061, "step": 8170 }, { "epoch": 5.471571906354515, "grad_norm": 1.1254479885101318, "learning_rate": 4.5284280936454854e-05, "loss": 0.0895, "step": 8180 }, { "epoch": 5.478260869565218, "grad_norm": 3.0038845539093018, "learning_rate": 4.521739130434783e-05, "loss": 0.0593, "step": 8190 }, { "epoch": 5.48494983277592, "grad_norm": 3.134263753890991, "learning_rate": 4.5150501672240804e-05, "loss": 0.0626, "step": 8200 }, { "epoch": 5.491638795986622, "grad_norm": 0.7873281836509705, "learning_rate": 4.5083612040133786e-05, "loss": 0.0601, "step": 8210 }, { "epoch": 5.498327759197324, "grad_norm": 2.841003894805908, "learning_rate": 4.5016722408026754e-05, "loss": 0.0562, "step": 8220 }, { "epoch": 5.505016722408027, "grad_norm": 3.942023992538452, "learning_rate": 4.4949832775919736e-05, "loss": 0.0816, "step": 8230 }, { "epoch": 5.511705685618729, "grad_norm": 0.525076687335968, "learning_rate": 4.488294314381271e-05, "loss": 0.0645, "step": 8240 }, { "epoch": 5.518394648829432, "grad_norm": 0.6077231764793396, "learning_rate": 4.4816053511705686e-05, "loss": 0.069, "step": 8250 }, { "epoch": 5.525083612040134, "grad_norm": 2.8950376510620117, "learning_rate": 4.474916387959867e-05, "loss": 0.0577, "step": 8260 }, { "epoch": 5.531772575250836, "grad_norm": 1.7112760543823242, "learning_rate": 4.468227424749164e-05, "loss": 0.0787, "step": 8270 }, { "epoch": 5.538461538461538, "grad_norm": 0.3955395221710205, "learning_rate": 4.461538461538462e-05, "loss": 0.0578, "step": 8280 }, { "epoch": 5.545150501672241, "grad_norm": 1.8074870109558105, "learning_rate": 4.454849498327759e-05, "loss": 0.0642, "step": 8290 }, { "epoch": 5.551839464882943, "grad_norm": 1.9293107986450195, "learning_rate": 4.448160535117057e-05, "loss": 0.0624, "step": 8300 }, { "epoch": 5.558528428093646, "grad_norm": 1.021624207496643, "learning_rate": 4.441471571906355e-05, "loss": 0.078, "step": 8310 }, { "epoch": 5.565217391304348, "grad_norm": 1.299368977546692, "learning_rate": 4.4347826086956525e-05, "loss": 0.07, "step": 8320 }, { "epoch": 5.5719063545150505, "grad_norm": 0.6169732809066772, "learning_rate": 4.4280936454849506e-05, "loss": 0.0629, "step": 8330 }, { "epoch": 5.578595317725752, "grad_norm": 1.1304540634155273, "learning_rate": 4.4214046822742475e-05, "loss": 0.052, "step": 8340 }, { "epoch": 5.585284280936455, "grad_norm": 2.0306649208068848, "learning_rate": 4.414715719063545e-05, "loss": 0.0772, "step": 8350 }, { "epoch": 5.591973244147157, "grad_norm": 2.0321860313415527, "learning_rate": 4.408026755852843e-05, "loss": 0.0586, "step": 8360 }, { "epoch": 5.59866220735786, "grad_norm": 2.1765122413635254, "learning_rate": 4.4013377926421407e-05, "loss": 0.0619, "step": 8370 }, { "epoch": 5.605351170568562, "grad_norm": 2.641141414642334, "learning_rate": 4.394648829431438e-05, "loss": 0.0693, "step": 8380 }, { "epoch": 5.6120401337792645, "grad_norm": 4.06684684753418, "learning_rate": 4.3879598662207363e-05, "loss": 0.076, "step": 8390 }, { "epoch": 5.618729096989966, "grad_norm": 1.448151707649231, "learning_rate": 4.381270903010034e-05, "loss": 0.0795, "step": 8400 }, { "epoch": 5.625418060200669, "grad_norm": 2.3511595726013184, "learning_rate": 4.3745819397993313e-05, "loss": 0.05, "step": 8410 }, { "epoch": 5.632107023411371, "grad_norm": 3.952510356903076, "learning_rate": 4.367892976588629e-05, "loss": 0.0676, "step": 8420 }, { "epoch": 5.638795986622074, "grad_norm": 2.635272979736328, "learning_rate": 4.3612040133779264e-05, "loss": 0.0663, "step": 8430 }, { "epoch": 5.645484949832776, "grad_norm": 2.334019184112549, "learning_rate": 4.3545150501672245e-05, "loss": 0.0704, "step": 8440 }, { "epoch": 5.6521739130434785, "grad_norm": 3.036574602127075, "learning_rate": 4.347826086956522e-05, "loss": 0.0623, "step": 8450 }, { "epoch": 5.65886287625418, "grad_norm": 3.2615692615509033, "learning_rate": 4.3411371237458195e-05, "loss": 0.0603, "step": 8460 }, { "epoch": 5.665551839464883, "grad_norm": 0.9925899505615234, "learning_rate": 4.334448160535117e-05, "loss": 0.0697, "step": 8470 }, { "epoch": 5.672240802675585, "grad_norm": 5.593775272369385, "learning_rate": 4.3277591973244145e-05, "loss": 0.056, "step": 8480 }, { "epoch": 5.678929765886288, "grad_norm": 2.722613573074341, "learning_rate": 4.321070234113713e-05, "loss": 0.0671, "step": 8490 }, { "epoch": 5.68561872909699, "grad_norm": 6.139028072357178, "learning_rate": 4.31438127090301e-05, "loss": 0.0705, "step": 8500 }, { "epoch": 5.6923076923076925, "grad_norm": 1.3252087831497192, "learning_rate": 4.3076923076923084e-05, "loss": 0.0599, "step": 8510 }, { "epoch": 5.698996655518394, "grad_norm": 5.210198402404785, "learning_rate": 4.301003344481606e-05, "loss": 0.0635, "step": 8520 }, { "epoch": 5.705685618729097, "grad_norm": 0.547988772392273, "learning_rate": 4.294314381270903e-05, "loss": 0.0511, "step": 8530 }, { "epoch": 5.712374581939799, "grad_norm": 2.0190672874450684, "learning_rate": 4.287625418060201e-05, "loss": 0.0677, "step": 8540 }, { "epoch": 5.719063545150502, "grad_norm": 1.9402360916137695, "learning_rate": 4.2809364548494984e-05, "loss": 0.0714, "step": 8550 }, { "epoch": 5.725752508361204, "grad_norm": 2.5025672912597656, "learning_rate": 4.2742474916387966e-05, "loss": 0.0701, "step": 8560 }, { "epoch": 5.7324414715719065, "grad_norm": 5.094533443450928, "learning_rate": 4.267558528428094e-05, "loss": 0.0584, "step": 8570 }, { "epoch": 5.739130434782608, "grad_norm": 3.5709147453308105, "learning_rate": 4.2608695652173916e-05, "loss": 0.0646, "step": 8580 }, { "epoch": 5.745819397993311, "grad_norm": 0.7281627655029297, "learning_rate": 4.254180602006689e-05, "loss": 0.0684, "step": 8590 }, { "epoch": 5.752508361204013, "grad_norm": 0.7778987288475037, "learning_rate": 4.2474916387959866e-05, "loss": 0.059, "step": 8600 }, { "epoch": 5.759197324414716, "grad_norm": 0.7435514330863953, "learning_rate": 4.240802675585284e-05, "loss": 0.0686, "step": 8610 }, { "epoch": 5.765886287625418, "grad_norm": 5.178036212921143, "learning_rate": 4.234113712374582e-05, "loss": 0.0583, "step": 8620 }, { "epoch": 5.7725752508361206, "grad_norm": 1.6079000234603882, "learning_rate": 4.22742474916388e-05, "loss": 0.0469, "step": 8630 }, { "epoch": 5.7792642140468224, "grad_norm": 0.29372864961624146, "learning_rate": 4.220735785953178e-05, "loss": 0.0573, "step": 8640 }, { "epoch": 5.785953177257525, "grad_norm": 1.2265444993972778, "learning_rate": 4.214046822742475e-05, "loss": 0.0613, "step": 8650 }, { "epoch": 5.792642140468227, "grad_norm": 4.454895496368408, "learning_rate": 4.207357859531772e-05, "loss": 0.0676, "step": 8660 }, { "epoch": 5.79933110367893, "grad_norm": 1.5004006624221802, "learning_rate": 4.2006688963210705e-05, "loss": 0.0631, "step": 8670 }, { "epoch": 5.806020066889632, "grad_norm": 3.0780856609344482, "learning_rate": 4.193979933110368e-05, "loss": 0.0586, "step": 8680 }, { "epoch": 5.812709030100335, "grad_norm": 0.17715483903884888, "learning_rate": 4.187290969899666e-05, "loss": 0.0616, "step": 8690 }, { "epoch": 5.8193979933110365, "grad_norm": 0.9322307109832764, "learning_rate": 4.180602006688964e-05, "loss": 0.0733, "step": 8700 }, { "epoch": 5.826086956521739, "grad_norm": 1.79049551486969, "learning_rate": 4.1739130434782605e-05, "loss": 0.0733, "step": 8710 }, { "epoch": 5.832775919732441, "grad_norm": 3.6763522624969482, "learning_rate": 4.167224080267559e-05, "loss": 0.0568, "step": 8720 }, { "epoch": 5.839464882943144, "grad_norm": 0.3415290117263794, "learning_rate": 4.160535117056856e-05, "loss": 0.0544, "step": 8730 }, { "epoch": 5.846153846153846, "grad_norm": 0.7277694940567017, "learning_rate": 4.1538461538461544e-05, "loss": 0.0623, "step": 8740 }, { "epoch": 5.852842809364549, "grad_norm": 2.3001549243927, "learning_rate": 4.147157190635452e-05, "loss": 0.0552, "step": 8750 }, { "epoch": 5.8595317725752505, "grad_norm": 1.6906051635742188, "learning_rate": 4.1404682274247494e-05, "loss": 0.0733, "step": 8760 }, { "epoch": 5.866220735785953, "grad_norm": 1.930910348892212, "learning_rate": 4.133779264214047e-05, "loss": 0.0523, "step": 8770 }, { "epoch": 5.872909698996655, "grad_norm": 0.4897262454032898, "learning_rate": 4.1270903010033444e-05, "loss": 0.0646, "step": 8780 }, { "epoch": 5.879598662207358, "grad_norm": 1.7537932395935059, "learning_rate": 4.1204013377926426e-05, "loss": 0.0591, "step": 8790 }, { "epoch": 5.88628762541806, "grad_norm": 0.8843321800231934, "learning_rate": 4.11371237458194e-05, "loss": 0.0763, "step": 8800 }, { "epoch": 5.892976588628763, "grad_norm": 3.215991735458374, "learning_rate": 4.1070234113712376e-05, "loss": 0.048, "step": 8810 }, { "epoch": 5.8996655518394645, "grad_norm": 1.1658564805984497, "learning_rate": 4.100334448160536e-05, "loss": 0.0551, "step": 8820 }, { "epoch": 5.906354515050167, "grad_norm": 7.578488826751709, "learning_rate": 4.0936454849498326e-05, "loss": 0.0715, "step": 8830 }, { "epoch": 5.913043478260869, "grad_norm": 1.8251421451568604, "learning_rate": 4.086956521739131e-05, "loss": 0.0519, "step": 8840 }, { "epoch": 5.919732441471572, "grad_norm": 1.7481938600540161, "learning_rate": 4.080267558528428e-05, "loss": 0.0492, "step": 8850 }, { "epoch": 5.926421404682275, "grad_norm": 0.3837336003780365, "learning_rate": 4.073578595317726e-05, "loss": 0.0636, "step": 8860 }, { "epoch": 5.933110367892977, "grad_norm": 1.3640086650848389, "learning_rate": 4.066889632107024e-05, "loss": 0.0443, "step": 8870 }, { "epoch": 5.9397993311036785, "grad_norm": 1.6527479887008667, "learning_rate": 4.0602006688963214e-05, "loss": 0.064, "step": 8880 }, { "epoch": 5.946488294314381, "grad_norm": 0.7127817869186401, "learning_rate": 4.053511705685619e-05, "loss": 0.079, "step": 8890 }, { "epoch": 5.953177257525084, "grad_norm": 0.5994347929954529, "learning_rate": 4.0468227424749165e-05, "loss": 0.0585, "step": 8900 }, { "epoch": 5.959866220735786, "grad_norm": 2.5335919857025146, "learning_rate": 4.040133779264214e-05, "loss": 0.0588, "step": 8910 }, { "epoch": 5.966555183946488, "grad_norm": 2.706871747970581, "learning_rate": 4.033444816053512e-05, "loss": 0.0672, "step": 8920 }, { "epoch": 5.973244147157191, "grad_norm": 5.2530107498168945, "learning_rate": 4.0267558528428096e-05, "loss": 0.0659, "step": 8930 }, { "epoch": 5.979933110367893, "grad_norm": 1.829155445098877, "learning_rate": 4.020066889632107e-05, "loss": 0.07, "step": 8940 }, { "epoch": 5.986622073578595, "grad_norm": 1.090369462966919, "learning_rate": 4.0133779264214046e-05, "loss": 0.0542, "step": 8950 }, { "epoch": 5.993311036789297, "grad_norm": 2.2025110721588135, "learning_rate": 4.006688963210702e-05, "loss": 0.0644, "step": 8960 }, { "epoch": 6.0, "grad_norm": 1.740294337272644, "learning_rate": 4e-05, "loss": 0.0677, "step": 8970 }, { "epoch": 6.0, "eval_loss": 0.07782479375600815, "eval_mse": 0.07782479375600815, "eval_runtime": 221.7684, "eval_samples_per_second": 13.483, "eval_steps_per_second": 1.686, "step": 8970 }, { "epoch": 6.006688963210703, "grad_norm": 1.5802152156829834, "learning_rate": 3.993311036789298e-05, "loss": 0.0508, "step": 8980 }, { "epoch": 6.013377926421405, "grad_norm": 3.263096332550049, "learning_rate": 3.986622073578595e-05, "loss": 0.0465, "step": 8990 }, { "epoch": 6.0200668896321075, "grad_norm": 3.2240211963653564, "learning_rate": 3.9799331103678935e-05, "loss": 0.0614, "step": 9000 }, { "epoch": 6.026755852842809, "grad_norm": 1.102103352546692, "learning_rate": 3.973244147157191e-05, "loss": 0.0621, "step": 9010 }, { "epoch": 6.033444816053512, "grad_norm": 2.7606148719787598, "learning_rate": 3.9665551839464885e-05, "loss": 0.0633, "step": 9020 }, { "epoch": 6.040133779264214, "grad_norm": 0.45631539821624756, "learning_rate": 3.959866220735786e-05, "loss": 0.0574, "step": 9030 }, { "epoch": 6.046822742474917, "grad_norm": 6.115387439727783, "learning_rate": 3.9531772575250835e-05, "loss": 0.0597, "step": 9040 }, { "epoch": 6.053511705685619, "grad_norm": 4.481198787689209, "learning_rate": 3.946488294314382e-05, "loss": 0.0632, "step": 9050 }, { "epoch": 6.0602006688963215, "grad_norm": 0.34394732117652893, "learning_rate": 3.939799331103679e-05, "loss": 0.0436, "step": 9060 }, { "epoch": 6.066889632107023, "grad_norm": 2.672247886657715, "learning_rate": 3.933110367892977e-05, "loss": 0.0429, "step": 9070 }, { "epoch": 6.073578595317726, "grad_norm": 5.2561869621276855, "learning_rate": 3.926421404682274e-05, "loss": 0.0498, "step": 9080 }, { "epoch": 6.080267558528428, "grad_norm": 1.9382920265197754, "learning_rate": 3.919732441471572e-05, "loss": 0.0578, "step": 9090 }, { "epoch": 6.086956521739131, "grad_norm": 1.8460969924926758, "learning_rate": 3.91304347826087e-05, "loss": 0.0486, "step": 9100 }, { "epoch": 6.093645484949833, "grad_norm": 1.0311009883880615, "learning_rate": 3.9063545150501674e-05, "loss": 0.0343, "step": 9110 }, { "epoch": 6.1003344481605355, "grad_norm": 0.8744246959686279, "learning_rate": 3.899665551839465e-05, "loss": 0.0577, "step": 9120 }, { "epoch": 6.107023411371237, "grad_norm": 1.2633397579193115, "learning_rate": 3.892976588628763e-05, "loss": 0.0464, "step": 9130 }, { "epoch": 6.11371237458194, "grad_norm": 0.67624431848526, "learning_rate": 3.88628762541806e-05, "loss": 0.0373, "step": 9140 }, { "epoch": 6.120401337792642, "grad_norm": 3.1662580966949463, "learning_rate": 3.879598662207358e-05, "loss": 0.0545, "step": 9150 }, { "epoch": 6.127090301003345, "grad_norm": 0.6137005686759949, "learning_rate": 3.8729096989966556e-05, "loss": 0.0501, "step": 9160 }, { "epoch": 6.133779264214047, "grad_norm": 1.7040530443191528, "learning_rate": 3.866220735785953e-05, "loss": 0.0545, "step": 9170 }, { "epoch": 6.1404682274247495, "grad_norm": 2.090524196624756, "learning_rate": 3.859531772575251e-05, "loss": 0.0441, "step": 9180 }, { "epoch": 6.147157190635451, "grad_norm": 2.0296332836151123, "learning_rate": 3.852842809364549e-05, "loss": 0.0662, "step": 9190 }, { "epoch": 6.153846153846154, "grad_norm": 0.7337088584899902, "learning_rate": 3.846153846153846e-05, "loss": 0.0434, "step": 9200 }, { "epoch": 6.160535117056856, "grad_norm": 0.595910370349884, "learning_rate": 3.839464882943144e-05, "loss": 0.0655, "step": 9210 }, { "epoch": 6.167224080267559, "grad_norm": 1.5148868560791016, "learning_rate": 3.832775919732441e-05, "loss": 0.0555, "step": 9220 }, { "epoch": 6.173913043478261, "grad_norm": 1.5558432340621948, "learning_rate": 3.8260869565217395e-05, "loss": 0.0531, "step": 9230 }, { "epoch": 6.1806020066889635, "grad_norm": 3.5125138759613037, "learning_rate": 3.819397993311037e-05, "loss": 0.046, "step": 9240 }, { "epoch": 6.187290969899665, "grad_norm": 2.2275428771972656, "learning_rate": 3.812709030100335e-05, "loss": 0.0516, "step": 9250 }, { "epoch": 6.193979933110368, "grad_norm": 2.8403425216674805, "learning_rate": 3.806020066889632e-05, "loss": 0.066, "step": 9260 }, { "epoch": 6.20066889632107, "grad_norm": 2.0408856868743896, "learning_rate": 3.7993311036789295e-05, "loss": 0.053, "step": 9270 }, { "epoch": 6.207357859531773, "grad_norm": 1.795935869216919, "learning_rate": 3.792642140468228e-05, "loss": 0.0492, "step": 9280 }, { "epoch": 6.214046822742475, "grad_norm": 2.151693105697632, "learning_rate": 3.785953177257525e-05, "loss": 0.043, "step": 9290 }, { "epoch": 6.2207357859531776, "grad_norm": 2.5936758518218994, "learning_rate": 3.7792642140468233e-05, "loss": 0.0528, "step": 9300 }, { "epoch": 6.2274247491638794, "grad_norm": 1.5297855138778687, "learning_rate": 3.772575250836121e-05, "loss": 0.0465, "step": 9310 }, { "epoch": 6.234113712374582, "grad_norm": 3.3159210681915283, "learning_rate": 3.7658862876254184e-05, "loss": 0.0476, "step": 9320 }, { "epoch": 6.240802675585284, "grad_norm": 2.997654676437378, "learning_rate": 3.759197324414716e-05, "loss": 0.0573, "step": 9330 }, { "epoch": 6.247491638795987, "grad_norm": 0.6075377464294434, "learning_rate": 3.7525083612040134e-05, "loss": 0.0588, "step": 9340 }, { "epoch": 6.254180602006689, "grad_norm": 2.5842342376708984, "learning_rate": 3.745819397993311e-05, "loss": 0.0637, "step": 9350 }, { "epoch": 6.260869565217392, "grad_norm": 4.031673431396484, "learning_rate": 3.739130434782609e-05, "loss": 0.0525, "step": 9360 }, { "epoch": 6.2675585284280935, "grad_norm": 2.174769401550293, "learning_rate": 3.7324414715719065e-05, "loss": 0.0368, "step": 9370 }, { "epoch": 6.274247491638796, "grad_norm": 3.260265588760376, "learning_rate": 3.725752508361204e-05, "loss": 0.0483, "step": 9380 }, { "epoch": 6.280936454849498, "grad_norm": 1.3505406379699707, "learning_rate": 3.7190635451505016e-05, "loss": 0.0521, "step": 9390 }, { "epoch": 6.287625418060201, "grad_norm": 1.0265694856643677, "learning_rate": 3.712374581939799e-05, "loss": 0.0726, "step": 9400 }, { "epoch": 6.294314381270903, "grad_norm": 0.7887771129608154, "learning_rate": 3.705685618729097e-05, "loss": 0.0679, "step": 9410 }, { "epoch": 6.301003344481606, "grad_norm": 1.9342752695083618, "learning_rate": 3.698996655518395e-05, "loss": 0.0562, "step": 9420 }, { "epoch": 6.3076923076923075, "grad_norm": 0.5005589723587036, "learning_rate": 3.692307692307693e-05, "loss": 0.0629, "step": 9430 }, { "epoch": 6.31438127090301, "grad_norm": 0.8035380244255066, "learning_rate": 3.6856187290969904e-05, "loss": 0.0565, "step": 9440 }, { "epoch": 6.321070234113712, "grad_norm": 2.018810272216797, "learning_rate": 3.678929765886287e-05, "loss": 0.064, "step": 9450 }, { "epoch": 6.327759197324415, "grad_norm": 0.6776765584945679, "learning_rate": 3.6722408026755854e-05, "loss": 0.0479, "step": 9460 }, { "epoch": 6.334448160535117, "grad_norm": 2.903470754623413, "learning_rate": 3.665551839464883e-05, "loss": 0.0626, "step": 9470 }, { "epoch": 6.34113712374582, "grad_norm": 1.5163041353225708, "learning_rate": 3.658862876254181e-05, "loss": 0.0557, "step": 9480 }, { "epoch": 6.3478260869565215, "grad_norm": 2.5668625831604004, "learning_rate": 3.6521739130434786e-05, "loss": 0.0504, "step": 9490 }, { "epoch": 6.354515050167224, "grad_norm": 1.1242213249206543, "learning_rate": 3.645484949832776e-05, "loss": 0.0471, "step": 9500 }, { "epoch": 6.361204013377926, "grad_norm": 0.8804264068603516, "learning_rate": 3.6387959866220736e-05, "loss": 0.0535, "step": 9510 }, { "epoch": 6.367892976588629, "grad_norm": 0.8456764221191406, "learning_rate": 3.632107023411371e-05, "loss": 0.0598, "step": 9520 }, { "epoch": 6.374581939799331, "grad_norm": 1.6663057804107666, "learning_rate": 3.625418060200669e-05, "loss": 0.0537, "step": 9530 }, { "epoch": 6.381270903010034, "grad_norm": 0.485980361700058, "learning_rate": 3.618729096989967e-05, "loss": 0.0498, "step": 9540 }, { "epoch": 6.3879598662207355, "grad_norm": 2.3799917697906494, "learning_rate": 3.612040133779264e-05, "loss": 0.0568, "step": 9550 }, { "epoch": 6.394648829431438, "grad_norm": 1.0425069332122803, "learning_rate": 3.6053511705685625e-05, "loss": 0.0441, "step": 9560 }, { "epoch": 6.40133779264214, "grad_norm": 3.5270988941192627, "learning_rate": 3.598662207357859e-05, "loss": 0.0502, "step": 9570 }, { "epoch": 6.408026755852843, "grad_norm": 6.216033458709717, "learning_rate": 3.5919732441471575e-05, "loss": 0.052, "step": 9580 }, { "epoch": 6.414715719063545, "grad_norm": 2.594477653503418, "learning_rate": 3.585284280936455e-05, "loss": 0.0553, "step": 9590 }, { "epoch": 6.421404682274248, "grad_norm": 3.850069999694824, "learning_rate": 3.5785953177257525e-05, "loss": 0.0531, "step": 9600 }, { "epoch": 6.4280936454849495, "grad_norm": 2.2683510780334473, "learning_rate": 3.571906354515051e-05, "loss": 0.0514, "step": 9610 }, { "epoch": 6.434782608695652, "grad_norm": 2.9711992740631104, "learning_rate": 3.565217391304348e-05, "loss": 0.0505, "step": 9620 }, { "epoch": 6.441471571906354, "grad_norm": 3.0900185108184814, "learning_rate": 3.558528428093646e-05, "loss": 0.0473, "step": 9630 }, { "epoch": 6.448160535117057, "grad_norm": 2.725559711456299, "learning_rate": 3.551839464882943e-05, "loss": 0.0599, "step": 9640 }, { "epoch": 6.454849498327759, "grad_norm": 0.8586891293525696, "learning_rate": 3.545150501672241e-05, "loss": 0.0574, "step": 9650 }, { "epoch": 6.461538461538462, "grad_norm": 0.4257071912288666, "learning_rate": 3.538461538461539e-05, "loss": 0.0521, "step": 9660 }, { "epoch": 6.468227424749164, "grad_norm": 2.654646635055542, "learning_rate": 3.5317725752508364e-05, "loss": 0.0488, "step": 9670 }, { "epoch": 6.474916387959866, "grad_norm": 1.2186239957809448, "learning_rate": 3.525083612040134e-05, "loss": 0.0558, "step": 9680 }, { "epoch": 6.481605351170568, "grad_norm": 0.5296560525894165, "learning_rate": 3.5183946488294314e-05, "loss": 0.0508, "step": 9690 }, { "epoch": 6.488294314381271, "grad_norm": 4.062180995941162, "learning_rate": 3.511705685618729e-05, "loss": 0.0634, "step": 9700 }, { "epoch": 6.494983277591973, "grad_norm": 3.340278148651123, "learning_rate": 3.505016722408027e-05, "loss": 0.0442, "step": 9710 }, { "epoch": 6.501672240802676, "grad_norm": 1.239185094833374, "learning_rate": 3.4983277591973246e-05, "loss": 0.0546, "step": 9720 }, { "epoch": 6.508361204013378, "grad_norm": 0.5998998880386353, "learning_rate": 3.491638795986622e-05, "loss": 0.0544, "step": 9730 }, { "epoch": 6.51505016722408, "grad_norm": 1.277424931526184, "learning_rate": 3.48494983277592e-05, "loss": 0.0596, "step": 9740 }, { "epoch": 6.521739130434782, "grad_norm": 0.9304616451263428, "learning_rate": 3.478260869565218e-05, "loss": 0.0641, "step": 9750 }, { "epoch": 6.528428093645485, "grad_norm": 0.8859513998031616, "learning_rate": 3.471571906354515e-05, "loss": 0.0566, "step": 9760 }, { "epoch": 6.535117056856187, "grad_norm": 2.800158977508545, "learning_rate": 3.464882943143813e-05, "loss": 0.0525, "step": 9770 }, { "epoch": 6.54180602006689, "grad_norm": 1.4394855499267578, "learning_rate": 3.45819397993311e-05, "loss": 0.0488, "step": 9780 }, { "epoch": 6.548494983277592, "grad_norm": 0.5787505507469177, "learning_rate": 3.4515050167224085e-05, "loss": 0.0487, "step": 9790 }, { "epoch": 6.555183946488294, "grad_norm": 1.568294644355774, "learning_rate": 3.444816053511706e-05, "loss": 0.06, "step": 9800 }, { "epoch": 6.561872909698996, "grad_norm": 2.2750236988067627, "learning_rate": 3.4381270903010035e-05, "loss": 0.0546, "step": 9810 }, { "epoch": 6.568561872909699, "grad_norm": 6.165664196014404, "learning_rate": 3.431438127090301e-05, "loss": 0.0459, "step": 9820 }, { "epoch": 6.575250836120401, "grad_norm": 0.3739396631717682, "learning_rate": 3.4247491638795985e-05, "loss": 0.0477, "step": 9830 }, { "epoch": 6.581939799331104, "grad_norm": 1.5687717199325562, "learning_rate": 3.4180602006688966e-05, "loss": 0.0552, "step": 9840 }, { "epoch": 6.588628762541806, "grad_norm": 4.11144495010376, "learning_rate": 3.411371237458194e-05, "loss": 0.0594, "step": 9850 }, { "epoch": 6.595317725752508, "grad_norm": 0.7815970182418823, "learning_rate": 3.4046822742474917e-05, "loss": 0.0615, "step": 9860 }, { "epoch": 6.602006688963211, "grad_norm": 1.0173360109329224, "learning_rate": 3.39799331103679e-05, "loss": 0.0693, "step": 9870 }, { "epoch": 6.608695652173913, "grad_norm": 2.378054618835449, "learning_rate": 3.3913043478260867e-05, "loss": 0.0643, "step": 9880 }, { "epoch": 6.615384615384615, "grad_norm": 2.3293285369873047, "learning_rate": 3.384615384615385e-05, "loss": 0.0445, "step": 9890 }, { "epoch": 6.622073578595318, "grad_norm": 4.405140399932861, "learning_rate": 3.3779264214046823e-05, "loss": 0.0604, "step": 9900 }, { "epoch": 6.6287625418060205, "grad_norm": 0.9924785494804382, "learning_rate": 3.37123745819398e-05, "loss": 0.0399, "step": 9910 }, { "epoch": 6.635451505016722, "grad_norm": 0.6336735486984253, "learning_rate": 3.364548494983278e-05, "loss": 0.0485, "step": 9920 }, { "epoch": 6.642140468227424, "grad_norm": 3.0844953060150146, "learning_rate": 3.3578595317725755e-05, "loss": 0.0421, "step": 9930 }, { "epoch": 6.648829431438127, "grad_norm": 0.759816586971283, "learning_rate": 3.351170568561873e-05, "loss": 0.0551, "step": 9940 }, { "epoch": 6.65551839464883, "grad_norm": 2.6838886737823486, "learning_rate": 3.3444816053511705e-05, "loss": 0.051, "step": 9950 }, { "epoch": 6.662207357859532, "grad_norm": 1.9430067539215088, "learning_rate": 3.337792642140468e-05, "loss": 0.0404, "step": 9960 }, { "epoch": 6.668896321070234, "grad_norm": 0.7252715229988098, "learning_rate": 3.331103678929766e-05, "loss": 0.0556, "step": 9970 }, { "epoch": 6.6755852842809364, "grad_norm": 1.3465676307678223, "learning_rate": 3.324414715719064e-05, "loss": 0.0617, "step": 9980 }, { "epoch": 6.682274247491639, "grad_norm": 1.4243336915969849, "learning_rate": 3.317725752508362e-05, "loss": 0.0561, "step": 9990 }, { "epoch": 6.688963210702341, "grad_norm": 0.5687856078147888, "learning_rate": 3.311036789297659e-05, "loss": 0.0505, "step": 10000 }, { "epoch": 6.695652173913043, "grad_norm": 1.7072571516036987, "learning_rate": 3.304347826086956e-05, "loss": 0.0536, "step": 10010 }, { "epoch": 6.702341137123746, "grad_norm": 1.2271828651428223, "learning_rate": 3.2976588628762544e-05, "loss": 0.0457, "step": 10020 }, { "epoch": 6.709030100334449, "grad_norm": 2.639836311340332, "learning_rate": 3.290969899665552e-05, "loss": 0.0657, "step": 10030 }, { "epoch": 6.7157190635451505, "grad_norm": 0.9130740761756897, "learning_rate": 3.28428093645485e-05, "loss": 0.0394, "step": 10040 }, { "epoch": 6.722408026755852, "grad_norm": 2.828885555267334, "learning_rate": 3.2775919732441476e-05, "loss": 0.052, "step": 10050 }, { "epoch": 6.729096989966555, "grad_norm": 6.914234638214111, "learning_rate": 3.270903010033445e-05, "loss": 0.0664, "step": 10060 }, { "epoch": 6.735785953177258, "grad_norm": 3.1282074451446533, "learning_rate": 3.2642140468227426e-05, "loss": 0.0642, "step": 10070 }, { "epoch": 6.74247491638796, "grad_norm": 1.06096351146698, "learning_rate": 3.25752508361204e-05, "loss": 0.0625, "step": 10080 }, { "epoch": 6.749163879598662, "grad_norm": 0.637898325920105, "learning_rate": 3.250836120401338e-05, "loss": 0.0521, "step": 10090 }, { "epoch": 6.7558528428093645, "grad_norm": 1.6803759336471558, "learning_rate": 3.244147157190636e-05, "loss": 0.0593, "step": 10100 }, { "epoch": 6.762541806020067, "grad_norm": 1.0126163959503174, "learning_rate": 3.237458193979933e-05, "loss": 0.0509, "step": 10110 }, { "epoch": 6.769230769230769, "grad_norm": 1.9007771015167236, "learning_rate": 3.230769230769231e-05, "loss": 0.051, "step": 10120 }, { "epoch": 6.775919732441472, "grad_norm": 1.0006183385849, "learning_rate": 3.224080267558528e-05, "loss": 0.0452, "step": 10130 }, { "epoch": 6.782608695652174, "grad_norm": 1.4270817041397095, "learning_rate": 3.217391304347826e-05, "loss": 0.0585, "step": 10140 }, { "epoch": 6.789297658862877, "grad_norm": 1.6937892436981201, "learning_rate": 3.210702341137124e-05, "loss": 0.0662, "step": 10150 }, { "epoch": 6.7959866220735785, "grad_norm": 1.8574788570404053, "learning_rate": 3.2040133779264215e-05, "loss": 0.0686, "step": 10160 }, { "epoch": 6.802675585284281, "grad_norm": 3.8167994022369385, "learning_rate": 3.19732441471572e-05, "loss": 0.0611, "step": 10170 }, { "epoch": 6.809364548494983, "grad_norm": 0.6884555816650391, "learning_rate": 3.190635451505017e-05, "loss": 0.0609, "step": 10180 }, { "epoch": 6.816053511705686, "grad_norm": 2.10984206199646, "learning_rate": 3.183946488294314e-05, "loss": 0.0542, "step": 10190 }, { "epoch": 6.822742474916388, "grad_norm": 1.2238664627075195, "learning_rate": 3.177257525083612e-05, "loss": 0.0501, "step": 10200 }, { "epoch": 6.829431438127091, "grad_norm": 1.3676847219467163, "learning_rate": 3.17056856187291e-05, "loss": 0.0494, "step": 10210 }, { "epoch": 6.8361204013377925, "grad_norm": 1.1930272579193115, "learning_rate": 3.163879598662208e-05, "loss": 0.0693, "step": 10220 }, { "epoch": 6.842809364548495, "grad_norm": 2.3715999126434326, "learning_rate": 3.1571906354515054e-05, "loss": 0.0476, "step": 10230 }, { "epoch": 6.849498327759197, "grad_norm": 4.478176593780518, "learning_rate": 3.150501672240803e-05, "loss": 0.0526, "step": 10240 }, { "epoch": 6.8561872909699, "grad_norm": 3.742927074432373, "learning_rate": 3.1438127090301004e-05, "loss": 0.0522, "step": 10250 }, { "epoch": 6.862876254180602, "grad_norm": 0.9782775044441223, "learning_rate": 3.137123745819398e-05, "loss": 0.0553, "step": 10260 }, { "epoch": 6.869565217391305, "grad_norm": 1.5323350429534912, "learning_rate": 3.130434782608696e-05, "loss": 0.0704, "step": 10270 }, { "epoch": 6.8762541806020065, "grad_norm": 3.9432942867279053, "learning_rate": 3.1237458193979936e-05, "loss": 0.0506, "step": 10280 }, { "epoch": 6.882943143812709, "grad_norm": 5.13473653793335, "learning_rate": 3.117056856187291e-05, "loss": 0.0399, "step": 10290 }, { "epoch": 6.889632107023411, "grad_norm": 0.9223735928535461, "learning_rate": 3.110367892976589e-05, "loss": 0.0368, "step": 10300 }, { "epoch": 6.896321070234114, "grad_norm": 2.1676201820373535, "learning_rate": 3.103678929765886e-05, "loss": 0.0462, "step": 10310 }, { "epoch": 6.903010033444816, "grad_norm": 2.290289878845215, "learning_rate": 3.096989966555184e-05, "loss": 0.0502, "step": 10320 }, { "epoch": 6.909698996655519, "grad_norm": 2.9174978733062744, "learning_rate": 3.090301003344482e-05, "loss": 0.0475, "step": 10330 }, { "epoch": 6.916387959866221, "grad_norm": 2.0384767055511475, "learning_rate": 3.083612040133779e-05, "loss": 0.0651, "step": 10340 }, { "epoch": 6.923076923076923, "grad_norm": 2.338029384613037, "learning_rate": 3.0769230769230774e-05, "loss": 0.053, "step": 10350 }, { "epoch": 6.929765886287625, "grad_norm": 1.1559053659439087, "learning_rate": 3.070234113712375e-05, "loss": 0.0635, "step": 10360 }, { "epoch": 6.936454849498328, "grad_norm": 5.035431861877441, "learning_rate": 3.0635451505016724e-05, "loss": 0.0644, "step": 10370 }, { "epoch": 6.94314381270903, "grad_norm": 1.137031078338623, "learning_rate": 3.05685618729097e-05, "loss": 0.0476, "step": 10380 }, { "epoch": 6.949832775919733, "grad_norm": 2.5256316661834717, "learning_rate": 3.0501672240802674e-05, "loss": 0.0671, "step": 10390 }, { "epoch": 6.956521739130435, "grad_norm": 1.1378544569015503, "learning_rate": 3.0434782608695656e-05, "loss": 0.0548, "step": 10400 }, { "epoch": 6.963210702341137, "grad_norm": 0.49568507075309753, "learning_rate": 3.036789297658863e-05, "loss": 0.0563, "step": 10410 }, { "epoch": 6.969899665551839, "grad_norm": 2.079551935195923, "learning_rate": 3.0301003344481603e-05, "loss": 0.0547, "step": 10420 }, { "epoch": 6.976588628762542, "grad_norm": 1.71834135055542, "learning_rate": 3.0234113712374585e-05, "loss": 0.0644, "step": 10430 }, { "epoch": 6.983277591973244, "grad_norm": 1.868596076965332, "learning_rate": 3.016722408026756e-05, "loss": 0.0585, "step": 10440 }, { "epoch": 6.989966555183947, "grad_norm": 2.3574764728546143, "learning_rate": 3.0100334448160538e-05, "loss": 0.051, "step": 10450 }, { "epoch": 6.996655518394649, "grad_norm": 3.6523358821868896, "learning_rate": 3.0033444816053513e-05, "loss": 0.0553, "step": 10460 }, { "epoch": 7.0, "eval_loss": 0.08115968108177185, "eval_mse": 0.08115968108177185, "eval_runtime": 226.6126, "eval_samples_per_second": 13.194, "eval_steps_per_second": 1.65, "step": 10465 }, { "epoch": 7.003344481605351, "grad_norm": 1.6401000022888184, "learning_rate": 2.9966555183946488e-05, "loss": 0.0587, "step": 10470 }, { "epoch": 7.010033444816053, "grad_norm": 0.4730532467365265, "learning_rate": 2.9899665551839467e-05, "loss": 0.0344, "step": 10480 }, { "epoch": 7.016722408026756, "grad_norm": 3.996577501296997, "learning_rate": 2.9832775919732442e-05, "loss": 0.0429, "step": 10490 }, { "epoch": 7.023411371237458, "grad_norm": 2.086496591567993, "learning_rate": 2.9765886287625424e-05, "loss": 0.0382, "step": 10500 }, { "epoch": 7.030100334448161, "grad_norm": 0.7093859910964966, "learning_rate": 2.9698996655518395e-05, "loss": 0.0539, "step": 10510 }, { "epoch": 7.036789297658863, "grad_norm": 6.086569786071777, "learning_rate": 2.963210702341137e-05, "loss": 0.0519, "step": 10520 }, { "epoch": 7.043478260869565, "grad_norm": 2.9343221187591553, "learning_rate": 2.9565217391304352e-05, "loss": 0.0402, "step": 10530 }, { "epoch": 7.050167224080267, "grad_norm": 1.3587912321090698, "learning_rate": 2.9498327759197324e-05, "loss": 0.0328, "step": 10540 }, { "epoch": 7.05685618729097, "grad_norm": 0.7912111878395081, "learning_rate": 2.9431438127090305e-05, "loss": 0.0397, "step": 10550 }, { "epoch": 7.063545150501672, "grad_norm": 1.5741112232208252, "learning_rate": 2.936454849498328e-05, "loss": 0.0342, "step": 10560 }, { "epoch": 7.070234113712375, "grad_norm": 0.40685224533081055, "learning_rate": 2.9297658862876256e-05, "loss": 0.0286, "step": 10570 }, { "epoch": 7.076923076923077, "grad_norm": 0.5212937593460083, "learning_rate": 2.9230769230769234e-05, "loss": 0.0405, "step": 10580 }, { "epoch": 7.083612040133779, "grad_norm": 1.1463117599487305, "learning_rate": 2.916387959866221e-05, "loss": 0.0367, "step": 10590 }, { "epoch": 7.090301003344481, "grad_norm": 0.6308630108833313, "learning_rate": 2.9096989966555184e-05, "loss": 0.034, "step": 10600 }, { "epoch": 7.096989966555184, "grad_norm": 0.5892065763473511, "learning_rate": 2.9030100334448162e-05, "loss": 0.0349, "step": 10610 }, { "epoch": 7.103678929765886, "grad_norm": 2.7921645641326904, "learning_rate": 2.8963210702341137e-05, "loss": 0.0467, "step": 10620 }, { "epoch": 7.110367892976589, "grad_norm": 0.6055333018302917, "learning_rate": 2.8896321070234116e-05, "loss": 0.0439, "step": 10630 }, { "epoch": 7.117056856187291, "grad_norm": 0.6409103274345398, "learning_rate": 2.882943143812709e-05, "loss": 0.0423, "step": 10640 }, { "epoch": 7.1237458193979935, "grad_norm": 2.2501044273376465, "learning_rate": 2.8762541806020066e-05, "loss": 0.0344, "step": 10650 }, { "epoch": 7.130434782608695, "grad_norm": 0.6633714437484741, "learning_rate": 2.8695652173913044e-05, "loss": 0.0447, "step": 10660 }, { "epoch": 7.137123745819398, "grad_norm": 4.393712043762207, "learning_rate": 2.862876254180602e-05, "loss": 0.0338, "step": 10670 }, { "epoch": 7.1438127090301, "grad_norm": 2.0332045555114746, "learning_rate": 2.8561872909699e-05, "loss": 0.0417, "step": 10680 }, { "epoch": 7.150501672240803, "grad_norm": 2.332609176635742, "learning_rate": 2.8494983277591973e-05, "loss": 0.0423, "step": 10690 }, { "epoch": 7.157190635451505, "grad_norm": 1.8318698406219482, "learning_rate": 2.8428093645484948e-05, "loss": 0.0381, "step": 10700 }, { "epoch": 7.1638795986622075, "grad_norm": 1.953809380531311, "learning_rate": 2.836120401337793e-05, "loss": 0.0498, "step": 10710 }, { "epoch": 7.170568561872909, "grad_norm": 3.4478530883789062, "learning_rate": 2.8294314381270905e-05, "loss": 0.0338, "step": 10720 }, { "epoch": 7.177257525083612, "grad_norm": 1.5791703462600708, "learning_rate": 2.8227424749163883e-05, "loss": 0.0459, "step": 10730 }, { "epoch": 7.183946488294314, "grad_norm": 1.9636425971984863, "learning_rate": 2.8160535117056858e-05, "loss": 0.0445, "step": 10740 }, { "epoch": 7.190635451505017, "grad_norm": 2.8458187580108643, "learning_rate": 2.8093645484949833e-05, "loss": 0.0386, "step": 10750 }, { "epoch": 7.197324414715719, "grad_norm": 1.9556761980056763, "learning_rate": 2.802675585284281e-05, "loss": 0.0501, "step": 10760 }, { "epoch": 7.2040133779264215, "grad_norm": 0.8367358446121216, "learning_rate": 2.7959866220735787e-05, "loss": 0.0484, "step": 10770 }, { "epoch": 7.210702341137123, "grad_norm": 0.5826350450515747, "learning_rate": 2.7892976588628765e-05, "loss": 0.0354, "step": 10780 }, { "epoch": 7.217391304347826, "grad_norm": 1.0834014415740967, "learning_rate": 2.782608695652174e-05, "loss": 0.0401, "step": 10790 }, { "epoch": 7.224080267558528, "grad_norm": 2.8577046394348145, "learning_rate": 2.7759197324414715e-05, "loss": 0.0389, "step": 10800 }, { "epoch": 7.230769230769231, "grad_norm": 1.558532476425171, "learning_rate": 2.7692307692307694e-05, "loss": 0.0465, "step": 10810 }, { "epoch": 7.237458193979933, "grad_norm": 1.4853328466415405, "learning_rate": 2.762541806020067e-05, "loss": 0.0425, "step": 10820 }, { "epoch": 7.2441471571906355, "grad_norm": 2.165236473083496, "learning_rate": 2.755852842809365e-05, "loss": 0.0415, "step": 10830 }, { "epoch": 7.250836120401337, "grad_norm": 0.9280331134796143, "learning_rate": 2.7491638795986625e-05, "loss": 0.0412, "step": 10840 }, { "epoch": 7.25752508361204, "grad_norm": 3.1878480911254883, "learning_rate": 2.7424749163879597e-05, "loss": 0.047, "step": 10850 }, { "epoch": 7.264214046822742, "grad_norm": 5.7657623291015625, "learning_rate": 2.735785953177258e-05, "loss": 0.0489, "step": 10860 }, { "epoch": 7.270903010033445, "grad_norm": 2.0762507915496826, "learning_rate": 2.7290969899665554e-05, "loss": 0.041, "step": 10870 }, { "epoch": 7.277591973244147, "grad_norm": 2.477379560470581, "learning_rate": 2.7224080267558526e-05, "loss": 0.0513, "step": 10880 }, { "epoch": 7.2842809364548495, "grad_norm": 1.3350303173065186, "learning_rate": 2.7157190635451507e-05, "loss": 0.048, "step": 10890 }, { "epoch": 7.290969899665551, "grad_norm": 2.863292694091797, "learning_rate": 2.7090301003344482e-05, "loss": 0.0492, "step": 10900 }, { "epoch": 7.297658862876254, "grad_norm": 4.157398700714111, "learning_rate": 2.702341137123746e-05, "loss": 0.0367, "step": 10910 }, { "epoch": 7.304347826086957, "grad_norm": 0.9202874302864075, "learning_rate": 2.6956521739130436e-05, "loss": 0.0305, "step": 10920 }, { "epoch": 7.311036789297659, "grad_norm": 0.9513516426086426, "learning_rate": 2.688963210702341e-05, "loss": 0.0409, "step": 10930 }, { "epoch": 7.317725752508361, "grad_norm": 2.04758620262146, "learning_rate": 2.682274247491639e-05, "loss": 0.0405, "step": 10940 }, { "epoch": 7.3244147157190636, "grad_norm": 4.638932228088379, "learning_rate": 2.6755852842809364e-05, "loss": 0.0473, "step": 10950 }, { "epoch": 7.331103678929766, "grad_norm": 1.4520858526229858, "learning_rate": 2.6688963210702346e-05, "loss": 0.0381, "step": 10960 }, { "epoch": 7.337792642140468, "grad_norm": 2.646759510040283, "learning_rate": 2.6622073578595318e-05, "loss": 0.0427, "step": 10970 }, { "epoch": 7.34448160535117, "grad_norm": 2.3960533142089844, "learning_rate": 2.6555183946488293e-05, "loss": 0.0464, "step": 10980 }, { "epoch": 7.351170568561873, "grad_norm": 1.6093047857284546, "learning_rate": 2.6488294314381275e-05, "loss": 0.0469, "step": 10990 }, { "epoch": 7.357859531772576, "grad_norm": 0.6260418891906738, "learning_rate": 2.6421404682274246e-05, "loss": 0.0397, "step": 11000 }, { "epoch": 7.364548494983278, "grad_norm": 4.12513542175293, "learning_rate": 2.6354515050167228e-05, "loss": 0.0452, "step": 11010 }, { "epoch": 7.3712374581939795, "grad_norm": 2.9965901374816895, "learning_rate": 2.6287625418060203e-05, "loss": 0.0348, "step": 11020 }, { "epoch": 7.377926421404682, "grad_norm": 1.5032777786254883, "learning_rate": 2.6220735785953178e-05, "loss": 0.0425, "step": 11030 }, { "epoch": 7.384615384615385, "grad_norm": 0.8793359398841858, "learning_rate": 2.6153846153846157e-05, "loss": 0.0469, "step": 11040 }, { "epoch": 7.391304347826087, "grad_norm": 2.3625988960266113, "learning_rate": 2.608695652173913e-05, "loss": 0.0365, "step": 11050 }, { "epoch": 7.39799331103679, "grad_norm": 2.840656280517578, "learning_rate": 2.602006688963211e-05, "loss": 0.0482, "step": 11060 }, { "epoch": 7.404682274247492, "grad_norm": 1.521253228187561, "learning_rate": 2.5953177257525085e-05, "loss": 0.0488, "step": 11070 }, { "epoch": 7.411371237458194, "grad_norm": 0.7154616117477417, "learning_rate": 2.588628762541806e-05, "loss": 0.0462, "step": 11080 }, { "epoch": 7.418060200668896, "grad_norm": 3.5693202018737793, "learning_rate": 2.581939799331104e-05, "loss": 0.0355, "step": 11090 }, { "epoch": 7.424749163879599, "grad_norm": 1.9669532775878906, "learning_rate": 2.5752508361204013e-05, "loss": 0.0358, "step": 11100 }, { "epoch": 7.431438127090301, "grad_norm": 0.8268588185310364, "learning_rate": 2.568561872909699e-05, "loss": 0.042, "step": 11110 }, { "epoch": 7.438127090301004, "grad_norm": 2.0618767738342285, "learning_rate": 2.5618729096989967e-05, "loss": 0.0458, "step": 11120 }, { "epoch": 7.444816053511706, "grad_norm": 3.76814341545105, "learning_rate": 2.5551839464882942e-05, "loss": 0.0428, "step": 11130 }, { "epoch": 7.451505016722408, "grad_norm": 1.0186822414398193, "learning_rate": 2.5484949832775924e-05, "loss": 0.0393, "step": 11140 }, { "epoch": 7.45819397993311, "grad_norm": 2.9750678539276123, "learning_rate": 2.54180602006689e-05, "loss": 0.0386, "step": 11150 }, { "epoch": 7.464882943143813, "grad_norm": 4.203641891479492, "learning_rate": 2.535117056856187e-05, "loss": 0.0434, "step": 11160 }, { "epoch": 7.471571906354515, "grad_norm": 1.066295862197876, "learning_rate": 2.5284280936454852e-05, "loss": 0.0439, "step": 11170 }, { "epoch": 7.478260869565218, "grad_norm": 1.0334880352020264, "learning_rate": 2.5217391304347827e-05, "loss": 0.0429, "step": 11180 }, { "epoch": 7.48494983277592, "grad_norm": 3.2917025089263916, "learning_rate": 2.5150501672240806e-05, "loss": 0.0475, "step": 11190 }, { "epoch": 7.491638795986622, "grad_norm": 3.197359800338745, "learning_rate": 2.508361204013378e-05, "loss": 0.0458, "step": 11200 }, { "epoch": 7.498327759197324, "grad_norm": 1.312531590461731, "learning_rate": 2.5016722408026756e-05, "loss": 0.0425, "step": 11210 }, { "epoch": 7.505016722408027, "grad_norm": 1.0706628561019897, "learning_rate": 2.4949832775919734e-05, "loss": 0.0351, "step": 11220 }, { "epoch": 7.511705685618729, "grad_norm": 2.260401725769043, "learning_rate": 2.488294314381271e-05, "loss": 0.0469, "step": 11230 }, { "epoch": 7.518394648829432, "grad_norm": 2.654482841491699, "learning_rate": 2.4816053511705688e-05, "loss": 0.0431, "step": 11240 }, { "epoch": 7.525083612040134, "grad_norm": 2.9351720809936523, "learning_rate": 2.4749163879598663e-05, "loss": 0.0343, "step": 11250 }, { "epoch": 7.531772575250836, "grad_norm": 3.859576463699341, "learning_rate": 2.468227424749164e-05, "loss": 0.0336, "step": 11260 }, { "epoch": 7.538461538461538, "grad_norm": 1.0542091131210327, "learning_rate": 2.461538461538462e-05, "loss": 0.0326, "step": 11270 }, { "epoch": 7.545150501672241, "grad_norm": 1.7328006029129028, "learning_rate": 2.454849498327759e-05, "loss": 0.0413, "step": 11280 }, { "epoch": 7.551839464882943, "grad_norm": 1.8054059743881226, "learning_rate": 2.448160535117057e-05, "loss": 0.0407, "step": 11290 }, { "epoch": 7.558528428093646, "grad_norm": 1.1257389783859253, "learning_rate": 2.4414715719063548e-05, "loss": 0.0427, "step": 11300 }, { "epoch": 7.565217391304348, "grad_norm": 1.9701489210128784, "learning_rate": 2.4347826086956523e-05, "loss": 0.0439, "step": 11310 }, { "epoch": 7.5719063545150505, "grad_norm": 2.4522111415863037, "learning_rate": 2.4280936454849498e-05, "loss": 0.0284, "step": 11320 }, { "epoch": 7.578595317725752, "grad_norm": 1.3907763957977295, "learning_rate": 2.4214046822742476e-05, "loss": 0.0301, "step": 11330 }, { "epoch": 7.585284280936455, "grad_norm": 0.5545759201049805, "learning_rate": 2.414715719063545e-05, "loss": 0.0344, "step": 11340 }, { "epoch": 7.591973244147157, "grad_norm": 1.364746332168579, "learning_rate": 2.408026755852843e-05, "loss": 0.0485, "step": 11350 }, { "epoch": 7.59866220735786, "grad_norm": 1.384533405303955, "learning_rate": 2.401337792642141e-05, "loss": 0.0527, "step": 11360 }, { "epoch": 7.605351170568562, "grad_norm": 1.4667154550552368, "learning_rate": 2.394648829431438e-05, "loss": 0.0462, "step": 11370 }, { "epoch": 7.6120401337792645, "grad_norm": 2.1494758129119873, "learning_rate": 2.387959866220736e-05, "loss": 0.0363, "step": 11380 }, { "epoch": 7.618729096989966, "grad_norm": 1.8320368528366089, "learning_rate": 2.3812709030100337e-05, "loss": 0.0415, "step": 11390 }, { "epoch": 7.625418060200669, "grad_norm": 1.5719565153121948, "learning_rate": 2.3745819397993312e-05, "loss": 0.0454, "step": 11400 }, { "epoch": 7.632107023411371, "grad_norm": 0.6953816413879395, "learning_rate": 2.3678929765886287e-05, "loss": 0.0366, "step": 11410 }, { "epoch": 7.638795986622074, "grad_norm": 0.7792525887489319, "learning_rate": 2.3612040133779265e-05, "loss": 0.0385, "step": 11420 }, { "epoch": 7.645484949832776, "grad_norm": 1.783232569694519, "learning_rate": 2.354515050167224e-05, "loss": 0.0362, "step": 11430 }, { "epoch": 7.6521739130434785, "grad_norm": 0.43922555446624756, "learning_rate": 2.347826086956522e-05, "loss": 0.0529, "step": 11440 }, { "epoch": 7.65886287625418, "grad_norm": 2.1042943000793457, "learning_rate": 2.3411371237458197e-05, "loss": 0.0408, "step": 11450 }, { "epoch": 7.665551839464883, "grad_norm": 1.4735808372497559, "learning_rate": 2.3344481605351172e-05, "loss": 0.0519, "step": 11460 }, { "epoch": 7.672240802675585, "grad_norm": 0.5197001099586487, "learning_rate": 2.3277591973244147e-05, "loss": 0.0519, "step": 11470 }, { "epoch": 7.678929765886288, "grad_norm": 1.7215490341186523, "learning_rate": 2.3210702341137126e-05, "loss": 0.0451, "step": 11480 }, { "epoch": 7.68561872909699, "grad_norm": 0.8570510745048523, "learning_rate": 2.31438127090301e-05, "loss": 0.033, "step": 11490 }, { "epoch": 7.6923076923076925, "grad_norm": 1.0296355485916138, "learning_rate": 2.307692307692308e-05, "loss": 0.0561, "step": 11500 }, { "epoch": 7.698996655518394, "grad_norm": 3.8507938385009766, "learning_rate": 2.3010033444816054e-05, "loss": 0.0453, "step": 11510 }, { "epoch": 7.705685618729097, "grad_norm": 1.8961751461029053, "learning_rate": 2.2943143812709033e-05, "loss": 0.0492, "step": 11520 }, { "epoch": 7.712374581939799, "grad_norm": 2.693507671356201, "learning_rate": 2.2876254180602008e-05, "loss": 0.0497, "step": 11530 }, { "epoch": 7.719063545150502, "grad_norm": 2.789548635482788, "learning_rate": 2.2809364548494986e-05, "loss": 0.0295, "step": 11540 }, { "epoch": 7.725752508361204, "grad_norm": 3.2721359729766846, "learning_rate": 2.274247491638796e-05, "loss": 0.0397, "step": 11550 }, { "epoch": 7.7324414715719065, "grad_norm": 0.6751932501792908, "learning_rate": 2.2675585284280936e-05, "loss": 0.0418, "step": 11560 }, { "epoch": 7.739130434782608, "grad_norm": 2.357602596282959, "learning_rate": 2.2608695652173914e-05, "loss": 0.0502, "step": 11570 }, { "epoch": 7.745819397993311, "grad_norm": 0.9620494246482849, "learning_rate": 2.2541806020066893e-05, "loss": 0.0487, "step": 11580 }, { "epoch": 7.752508361204013, "grad_norm": 2.187662363052368, "learning_rate": 2.2474916387959868e-05, "loss": 0.0435, "step": 11590 }, { "epoch": 7.759197324414716, "grad_norm": 1.5170892477035522, "learning_rate": 2.2408026755852843e-05, "loss": 0.0431, "step": 11600 }, { "epoch": 7.765886287625418, "grad_norm": 1.1562601327896118, "learning_rate": 2.234113712374582e-05, "loss": 0.0323, "step": 11610 }, { "epoch": 7.7725752508361206, "grad_norm": 3.6104776859283447, "learning_rate": 2.2274247491638796e-05, "loss": 0.042, "step": 11620 }, { "epoch": 7.7792642140468224, "grad_norm": 0.7847824692726135, "learning_rate": 2.2207357859531775e-05, "loss": 0.0399, "step": 11630 }, { "epoch": 7.785953177257525, "grad_norm": 1.0568705797195435, "learning_rate": 2.2140468227424753e-05, "loss": 0.0446, "step": 11640 }, { "epoch": 7.792642140468227, "grad_norm": 0.699948787689209, "learning_rate": 2.2073578595317725e-05, "loss": 0.0347, "step": 11650 }, { "epoch": 7.79933110367893, "grad_norm": 1.0860364437103271, "learning_rate": 2.2006688963210703e-05, "loss": 0.0403, "step": 11660 }, { "epoch": 7.806020066889632, "grad_norm": 0.90751713514328, "learning_rate": 2.1939799331103682e-05, "loss": 0.0413, "step": 11670 }, { "epoch": 7.812709030100335, "grad_norm": 2.4008915424346924, "learning_rate": 2.1872909698996657e-05, "loss": 0.0463, "step": 11680 }, { "epoch": 7.8193979933110365, "grad_norm": 2.8335297107696533, "learning_rate": 2.1806020066889632e-05, "loss": 0.048, "step": 11690 }, { "epoch": 7.826086956521739, "grad_norm": 0.7693464756011963, "learning_rate": 2.173913043478261e-05, "loss": 0.047, "step": 11700 }, { "epoch": 7.832775919732441, "grad_norm": 4.794638633728027, "learning_rate": 2.1672240802675585e-05, "loss": 0.0384, "step": 11710 }, { "epoch": 7.839464882943144, "grad_norm": 3.020655632019043, "learning_rate": 2.1605351170568564e-05, "loss": 0.0468, "step": 11720 }, { "epoch": 7.846153846153846, "grad_norm": 1.1203726530075073, "learning_rate": 2.1538461538461542e-05, "loss": 0.0489, "step": 11730 }, { "epoch": 7.852842809364549, "grad_norm": 0.991735577583313, "learning_rate": 2.1471571906354514e-05, "loss": 0.0393, "step": 11740 }, { "epoch": 7.8595317725752505, "grad_norm": 2.7917697429656982, "learning_rate": 2.1404682274247492e-05, "loss": 0.0433, "step": 11750 }, { "epoch": 7.866220735785953, "grad_norm": 1.8894798755645752, "learning_rate": 2.133779264214047e-05, "loss": 0.0299, "step": 11760 }, { "epoch": 7.872909698996655, "grad_norm": 2.0870378017425537, "learning_rate": 2.1270903010033446e-05, "loss": 0.0344, "step": 11770 }, { "epoch": 7.879598662207358, "grad_norm": 0.9423313140869141, "learning_rate": 2.120401337792642e-05, "loss": 0.0449, "step": 11780 }, { "epoch": 7.88628762541806, "grad_norm": 0.9477387070655823, "learning_rate": 2.11371237458194e-05, "loss": 0.0361, "step": 11790 }, { "epoch": 7.892976588628763, "grad_norm": 0.6451069712638855, "learning_rate": 2.1070234113712374e-05, "loss": 0.053, "step": 11800 }, { "epoch": 7.8996655518394645, "grad_norm": 5.747945308685303, "learning_rate": 2.1003344481605352e-05, "loss": 0.0429, "step": 11810 }, { "epoch": 7.906354515050167, "grad_norm": 0.995347797870636, "learning_rate": 2.093645484949833e-05, "loss": 0.0343, "step": 11820 }, { "epoch": 7.913043478260869, "grad_norm": 2.1813809871673584, "learning_rate": 2.0869565217391303e-05, "loss": 0.0487, "step": 11830 }, { "epoch": 7.919732441471572, "grad_norm": 1.4194324016571045, "learning_rate": 2.080267558528428e-05, "loss": 0.0435, "step": 11840 }, { "epoch": 7.926421404682275, "grad_norm": 0.39779338240623474, "learning_rate": 2.073578595317726e-05, "loss": 0.0369, "step": 11850 }, { "epoch": 7.933110367892977, "grad_norm": 4.078008651733398, "learning_rate": 2.0668896321070234e-05, "loss": 0.0424, "step": 11860 }, { "epoch": 7.9397993311036785, "grad_norm": 1.5877165794372559, "learning_rate": 2.0602006688963213e-05, "loss": 0.0333, "step": 11870 }, { "epoch": 7.946488294314381, "grad_norm": 2.7322449684143066, "learning_rate": 2.0535117056856188e-05, "loss": 0.0421, "step": 11880 }, { "epoch": 7.953177257525084, "grad_norm": 2.233501434326172, "learning_rate": 2.0468227424749163e-05, "loss": 0.0452, "step": 11890 }, { "epoch": 7.959866220735786, "grad_norm": 1.474515676498413, "learning_rate": 2.040133779264214e-05, "loss": 0.0375, "step": 11900 }, { "epoch": 7.966555183946488, "grad_norm": 3.1244900226593018, "learning_rate": 2.033444816053512e-05, "loss": 0.0429, "step": 11910 }, { "epoch": 7.973244147157191, "grad_norm": 2.144104480743408, "learning_rate": 2.0267558528428095e-05, "loss": 0.0346, "step": 11920 }, { "epoch": 7.979933110367893, "grad_norm": 0.5135546922683716, "learning_rate": 2.020066889632107e-05, "loss": 0.0266, "step": 11930 }, { "epoch": 7.986622073578595, "grad_norm": 2.0495548248291016, "learning_rate": 2.0133779264214048e-05, "loss": 0.038, "step": 11940 }, { "epoch": 7.993311036789297, "grad_norm": 0.7567475438117981, "learning_rate": 2.0066889632107023e-05, "loss": 0.0334, "step": 11950 }, { "epoch": 8.0, "grad_norm": 0.2615234851837158, "learning_rate": 2e-05, "loss": 0.0326, "step": 11960 }, { "epoch": 8.0, "eval_loss": 0.08834142982959747, "eval_mse": 0.08834142982959747, "eval_runtime": 229.8467, "eval_samples_per_second": 13.009, "eval_steps_per_second": 1.627, "step": 11960 }, { "epoch": 8.006688963210703, "grad_norm": 3.9783406257629395, "learning_rate": 1.9933110367892977e-05, "loss": 0.0337, "step": 11970 }, { "epoch": 8.013377926421406, "grad_norm": 1.0547821521759033, "learning_rate": 1.9866220735785955e-05, "loss": 0.0265, "step": 11980 }, { "epoch": 8.020066889632107, "grad_norm": 3.261388063430786, "learning_rate": 1.979933110367893e-05, "loss": 0.028, "step": 11990 }, { "epoch": 8.02675585284281, "grad_norm": 1.504028558731079, "learning_rate": 1.973244147157191e-05, "loss": 0.029, "step": 12000 }, { "epoch": 8.033444816053512, "grad_norm": 3.6684327125549316, "learning_rate": 1.9665551839464884e-05, "loss": 0.0311, "step": 12010 }, { "epoch": 8.040133779264215, "grad_norm": 2.4501540660858154, "learning_rate": 1.959866220735786e-05, "loss": 0.0313, "step": 12020 }, { "epoch": 8.046822742474916, "grad_norm": 3.4440808296203613, "learning_rate": 1.9531772575250837e-05, "loss": 0.0382, "step": 12030 }, { "epoch": 8.053511705685619, "grad_norm": 0.9953986406326294, "learning_rate": 1.9464882943143815e-05, "loss": 0.0313, "step": 12040 }, { "epoch": 8.060200668896321, "grad_norm": 0.80042564868927, "learning_rate": 1.939799331103679e-05, "loss": 0.0269, "step": 12050 }, { "epoch": 8.066889632107024, "grad_norm": 1.4230451583862305, "learning_rate": 1.9331103678929765e-05, "loss": 0.0236, "step": 12060 }, { "epoch": 8.073578595317725, "grad_norm": 2.201822519302368, "learning_rate": 1.9264214046822744e-05, "loss": 0.0294, "step": 12070 }, { "epoch": 8.080267558528428, "grad_norm": 0.8343908786773682, "learning_rate": 1.919732441471572e-05, "loss": 0.0302, "step": 12080 }, { "epoch": 8.08695652173913, "grad_norm": 1.7819195985794067, "learning_rate": 1.9130434782608697e-05, "loss": 0.0152, "step": 12090 }, { "epoch": 8.093645484949834, "grad_norm": 0.8807457685470581, "learning_rate": 1.9063545150501676e-05, "loss": 0.0176, "step": 12100 }, { "epoch": 8.100334448160535, "grad_norm": 0.37105417251586914, "learning_rate": 1.8996655518394647e-05, "loss": 0.0311, "step": 12110 }, { "epoch": 8.107023411371237, "grad_norm": 3.2414398193359375, "learning_rate": 1.8929765886287626e-05, "loss": 0.0311, "step": 12120 }, { "epoch": 8.11371237458194, "grad_norm": 3.936710834503174, "learning_rate": 1.8862876254180604e-05, "loss": 0.0267, "step": 12130 }, { "epoch": 8.120401337792643, "grad_norm": 1.1247128248214722, "learning_rate": 1.879598662207358e-05, "loss": 0.033, "step": 12140 }, { "epoch": 8.127090301003344, "grad_norm": 3.2280757427215576, "learning_rate": 1.8729096989966554e-05, "loss": 0.0286, "step": 12150 }, { "epoch": 8.133779264214047, "grad_norm": 2.556563138961792, "learning_rate": 1.8662207357859533e-05, "loss": 0.0294, "step": 12160 }, { "epoch": 8.14046822742475, "grad_norm": 0.8224531412124634, "learning_rate": 1.8595317725752508e-05, "loss": 0.0309, "step": 12170 }, { "epoch": 8.147157190635452, "grad_norm": 0.8672958016395569, "learning_rate": 1.8528428093645486e-05, "loss": 0.0211, "step": 12180 }, { "epoch": 8.153846153846153, "grad_norm": 1.4439815282821655, "learning_rate": 1.8461538461538465e-05, "loss": 0.0296, "step": 12190 }, { "epoch": 8.160535117056856, "grad_norm": 0.7979484796524048, "learning_rate": 1.8394648829431436e-05, "loss": 0.0373, "step": 12200 }, { "epoch": 8.167224080267559, "grad_norm": 5.962237358093262, "learning_rate": 1.8327759197324415e-05, "loss": 0.0333, "step": 12210 }, { "epoch": 8.173913043478262, "grad_norm": 1.737164855003357, "learning_rate": 1.8260869565217393e-05, "loss": 0.0225, "step": 12220 }, { "epoch": 8.180602006688963, "grad_norm": 2.052433729171753, "learning_rate": 1.8193979933110368e-05, "loss": 0.0321, "step": 12230 }, { "epoch": 8.187290969899665, "grad_norm": 1.0267506837844849, "learning_rate": 1.8127090301003347e-05, "loss": 0.0341, "step": 12240 }, { "epoch": 8.193979933110368, "grad_norm": 0.891290545463562, "learning_rate": 1.806020066889632e-05, "loss": 0.0251, "step": 12250 }, { "epoch": 8.200668896321071, "grad_norm": 1.1818610429763794, "learning_rate": 1.7993311036789297e-05, "loss": 0.0355, "step": 12260 }, { "epoch": 8.207357859531772, "grad_norm": 2.8808891773223877, "learning_rate": 1.7926421404682275e-05, "loss": 0.0363, "step": 12270 }, { "epoch": 8.214046822742475, "grad_norm": 1.9035134315490723, "learning_rate": 1.7859531772575253e-05, "loss": 0.0243, "step": 12280 }, { "epoch": 8.220735785953178, "grad_norm": 1.533340573310852, "learning_rate": 1.779264214046823e-05, "loss": 0.0434, "step": 12290 }, { "epoch": 8.22742474916388, "grad_norm": 1.0126433372497559, "learning_rate": 1.7725752508361204e-05, "loss": 0.0266, "step": 12300 }, { "epoch": 8.234113712374581, "grad_norm": 1.2792842388153076, "learning_rate": 1.7658862876254182e-05, "loss": 0.0268, "step": 12310 }, { "epoch": 8.240802675585284, "grad_norm": 0.51844722032547, "learning_rate": 1.7591973244147157e-05, "loss": 0.0324, "step": 12320 }, { "epoch": 8.247491638795987, "grad_norm": 1.5483566522598267, "learning_rate": 1.7525083612040135e-05, "loss": 0.0356, "step": 12330 }, { "epoch": 8.25418060200669, "grad_norm": 2.015435218811035, "learning_rate": 1.745819397993311e-05, "loss": 0.0305, "step": 12340 }, { "epoch": 8.26086956521739, "grad_norm": 1.5077990293502808, "learning_rate": 1.739130434782609e-05, "loss": 0.0258, "step": 12350 }, { "epoch": 8.267558528428093, "grad_norm": 2.064847946166992, "learning_rate": 1.7324414715719064e-05, "loss": 0.022, "step": 12360 }, { "epoch": 8.274247491638796, "grad_norm": 2.8898634910583496, "learning_rate": 1.7257525083612042e-05, "loss": 0.0386, "step": 12370 }, { "epoch": 8.280936454849499, "grad_norm": 1.5126811265945435, "learning_rate": 1.7190635451505017e-05, "loss": 0.0348, "step": 12380 }, { "epoch": 8.2876254180602, "grad_norm": 1.4772700071334839, "learning_rate": 1.7123745819397992e-05, "loss": 0.0414, "step": 12390 }, { "epoch": 8.294314381270903, "grad_norm": 1.3124395608901978, "learning_rate": 1.705685618729097e-05, "loss": 0.0363, "step": 12400 }, { "epoch": 8.301003344481606, "grad_norm": 0.8630233407020569, "learning_rate": 1.698996655518395e-05, "loss": 0.0281, "step": 12410 }, { "epoch": 8.307692307692308, "grad_norm": 1.1018539667129517, "learning_rate": 1.6923076923076924e-05, "loss": 0.0248, "step": 12420 }, { "epoch": 8.31438127090301, "grad_norm": 1.0768871307373047, "learning_rate": 1.68561872909699e-05, "loss": 0.0292, "step": 12430 }, { "epoch": 8.321070234113712, "grad_norm": 1.1558284759521484, "learning_rate": 1.6789297658862878e-05, "loss": 0.0289, "step": 12440 }, { "epoch": 8.327759197324415, "grad_norm": 2.288069248199463, "learning_rate": 1.6722408026755853e-05, "loss": 0.0237, "step": 12450 }, { "epoch": 8.334448160535118, "grad_norm": 0.7263951897621155, "learning_rate": 1.665551839464883e-05, "loss": 0.033, "step": 12460 }, { "epoch": 8.341137123745819, "grad_norm": 4.321417331695557, "learning_rate": 1.658862876254181e-05, "loss": 0.0406, "step": 12470 }, { "epoch": 8.347826086956522, "grad_norm": 0.8176807761192322, "learning_rate": 1.652173913043478e-05, "loss": 0.0172, "step": 12480 }, { "epoch": 8.354515050167224, "grad_norm": 0.8886902928352356, "learning_rate": 1.645484949832776e-05, "loss": 0.0246, "step": 12490 }, { "epoch": 8.361204013377927, "grad_norm": 0.4712463915348053, "learning_rate": 1.6387959866220738e-05, "loss": 0.0249, "step": 12500 }, { "epoch": 8.367892976588628, "grad_norm": 1.5403294563293457, "learning_rate": 1.6321070234113713e-05, "loss": 0.0295, "step": 12510 }, { "epoch": 8.37458193979933, "grad_norm": 0.9423514604568481, "learning_rate": 1.625418060200669e-05, "loss": 0.0314, "step": 12520 }, { "epoch": 8.381270903010034, "grad_norm": 1.1313178539276123, "learning_rate": 1.6187290969899666e-05, "loss": 0.0281, "step": 12530 }, { "epoch": 8.387959866220736, "grad_norm": 2.3116352558135986, "learning_rate": 1.612040133779264e-05, "loss": 0.027, "step": 12540 }, { "epoch": 8.394648829431437, "grad_norm": 0.6153112649917603, "learning_rate": 1.605351170568562e-05, "loss": 0.0295, "step": 12550 }, { "epoch": 8.40133779264214, "grad_norm": 0.900283932685852, "learning_rate": 1.59866220735786e-05, "loss": 0.0204, "step": 12560 }, { "epoch": 8.408026755852843, "grad_norm": 0.7899017333984375, "learning_rate": 1.591973244147157e-05, "loss": 0.0317, "step": 12570 }, { "epoch": 8.414715719063546, "grad_norm": 1.4934643507003784, "learning_rate": 1.585284280936455e-05, "loss": 0.0272, "step": 12580 }, { "epoch": 8.421404682274247, "grad_norm": 3.2169320583343506, "learning_rate": 1.5785953177257527e-05, "loss": 0.0331, "step": 12590 }, { "epoch": 8.42809364548495, "grad_norm": 0.30572786927223206, "learning_rate": 1.5719063545150502e-05, "loss": 0.0195, "step": 12600 }, { "epoch": 8.434782608695652, "grad_norm": 3.049567222595215, "learning_rate": 1.565217391304348e-05, "loss": 0.0368, "step": 12610 }, { "epoch": 8.441471571906355, "grad_norm": 0.841004490852356, "learning_rate": 1.5585284280936455e-05, "loss": 0.0302, "step": 12620 }, { "epoch": 8.448160535117056, "grad_norm": 1.2091842889785767, "learning_rate": 1.551839464882943e-05, "loss": 0.0294, "step": 12630 }, { "epoch": 8.454849498327759, "grad_norm": 0.7122257947921753, "learning_rate": 1.545150501672241e-05, "loss": 0.0175, "step": 12640 }, { "epoch": 8.461538461538462, "grad_norm": 2.2872982025146484, "learning_rate": 1.5384615384615387e-05, "loss": 0.0301, "step": 12650 }, { "epoch": 8.468227424749164, "grad_norm": 2.8640732765197754, "learning_rate": 1.5317725752508362e-05, "loss": 0.0418, "step": 12660 }, { "epoch": 8.474916387959865, "grad_norm": 1.7518962621688843, "learning_rate": 1.5250836120401337e-05, "loss": 0.0354, "step": 12670 }, { "epoch": 8.481605351170568, "grad_norm": 1.138287901878357, "learning_rate": 1.5183946488294316e-05, "loss": 0.0299, "step": 12680 }, { "epoch": 8.488294314381271, "grad_norm": 3.845442533493042, "learning_rate": 1.5117056856187292e-05, "loss": 0.0221, "step": 12690 }, { "epoch": 8.494983277591974, "grad_norm": 0.7240458726882935, "learning_rate": 1.5050167224080269e-05, "loss": 0.0307, "step": 12700 }, { "epoch": 8.501672240802675, "grad_norm": 2.00895094871521, "learning_rate": 1.4983277591973244e-05, "loss": 0.0242, "step": 12710 }, { "epoch": 8.508361204013378, "grad_norm": 0.557636559009552, "learning_rate": 1.4916387959866221e-05, "loss": 0.0207, "step": 12720 }, { "epoch": 8.51505016722408, "grad_norm": 0.8012935519218445, "learning_rate": 1.4849498327759198e-05, "loss": 0.0241, "step": 12730 }, { "epoch": 8.521739130434783, "grad_norm": 0.6977702975273132, "learning_rate": 1.4782608695652176e-05, "loss": 0.0275, "step": 12740 }, { "epoch": 8.528428093645484, "grad_norm": 1.9219177961349487, "learning_rate": 1.4715719063545153e-05, "loss": 0.024, "step": 12750 }, { "epoch": 8.535117056856187, "grad_norm": 1.120877981185913, "learning_rate": 1.4648829431438128e-05, "loss": 0.0268, "step": 12760 }, { "epoch": 8.54180602006689, "grad_norm": 2.783856153488159, "learning_rate": 1.4581939799331104e-05, "loss": 0.0362, "step": 12770 }, { "epoch": 8.548494983277592, "grad_norm": 1.8937408924102783, "learning_rate": 1.4515050167224081e-05, "loss": 0.0329, "step": 12780 }, { "epoch": 8.555183946488294, "grad_norm": 1.783046007156372, "learning_rate": 1.4448160535117058e-05, "loss": 0.0372, "step": 12790 }, { "epoch": 8.561872909698996, "grad_norm": 1.2558856010437012, "learning_rate": 1.4381270903010033e-05, "loss": 0.0229, "step": 12800 }, { "epoch": 8.568561872909699, "grad_norm": 1.831705927848816, "learning_rate": 1.431438127090301e-05, "loss": 0.0224, "step": 12810 }, { "epoch": 8.575250836120402, "grad_norm": 2.1247737407684326, "learning_rate": 1.4247491638795986e-05, "loss": 0.0274, "step": 12820 }, { "epoch": 8.581939799331103, "grad_norm": 1.0554836988449097, "learning_rate": 1.4180602006688965e-05, "loss": 0.0335, "step": 12830 }, { "epoch": 8.588628762541806, "grad_norm": 1.7146085500717163, "learning_rate": 1.4113712374581942e-05, "loss": 0.0323, "step": 12840 }, { "epoch": 8.595317725752508, "grad_norm": 1.0601108074188232, "learning_rate": 1.4046822742474917e-05, "loss": 0.0253, "step": 12850 }, { "epoch": 8.602006688963211, "grad_norm": 0.7944843173027039, "learning_rate": 1.3979933110367893e-05, "loss": 0.0309, "step": 12860 }, { "epoch": 8.608695652173914, "grad_norm": 1.291366457939148, "learning_rate": 1.391304347826087e-05, "loss": 0.0222, "step": 12870 }, { "epoch": 8.615384615384615, "grad_norm": 2.837893009185791, "learning_rate": 1.3846153846153847e-05, "loss": 0.0344, "step": 12880 }, { "epoch": 8.622073578595318, "grad_norm": 2.289451837539673, "learning_rate": 1.3779264214046825e-05, "loss": 0.0201, "step": 12890 }, { "epoch": 8.62876254180602, "grad_norm": 1.1213974952697754, "learning_rate": 1.3712374581939799e-05, "loss": 0.0193, "step": 12900 }, { "epoch": 8.635451505016722, "grad_norm": 2.0777931213378906, "learning_rate": 1.3645484949832777e-05, "loss": 0.0217, "step": 12910 }, { "epoch": 8.642140468227424, "grad_norm": 1.627186894416809, "learning_rate": 1.3578595317725754e-05, "loss": 0.0298, "step": 12920 }, { "epoch": 8.648829431438127, "grad_norm": 1.691390037536621, "learning_rate": 1.351170568561873e-05, "loss": 0.0371, "step": 12930 }, { "epoch": 8.65551839464883, "grad_norm": 4.63961935043335, "learning_rate": 1.3444816053511705e-05, "loss": 0.0411, "step": 12940 }, { "epoch": 8.662207357859533, "grad_norm": 0.9765155911445618, "learning_rate": 1.3377926421404682e-05, "loss": 0.0354, "step": 12950 }, { "epoch": 8.668896321070234, "grad_norm": 1.394568920135498, "learning_rate": 1.3311036789297659e-05, "loss": 0.0327, "step": 12960 }, { "epoch": 8.675585284280936, "grad_norm": 0.6211327314376831, "learning_rate": 1.3244147157190637e-05, "loss": 0.0254, "step": 12970 }, { "epoch": 8.68227424749164, "grad_norm": 3.3042550086975098, "learning_rate": 1.3177257525083614e-05, "loss": 0.029, "step": 12980 }, { "epoch": 8.68896321070234, "grad_norm": 1.2794384956359863, "learning_rate": 1.3110367892976589e-05, "loss": 0.0408, "step": 12990 }, { "epoch": 8.695652173913043, "grad_norm": 1.3925954103469849, "learning_rate": 1.3043478260869566e-05, "loss": 0.0363, "step": 13000 }, { "epoch": 8.702341137123746, "grad_norm": 2.9687554836273193, "learning_rate": 1.2976588628762542e-05, "loss": 0.0245, "step": 13010 }, { "epoch": 8.709030100334449, "grad_norm": 2.5190224647521973, "learning_rate": 1.290969899665552e-05, "loss": 0.0344, "step": 13020 }, { "epoch": 8.715719063545151, "grad_norm": 0.9365471005439758, "learning_rate": 1.2842809364548494e-05, "loss": 0.0263, "step": 13030 }, { "epoch": 8.722408026755852, "grad_norm": 1.4775091409683228, "learning_rate": 1.2775919732441471e-05, "loss": 0.0256, "step": 13040 }, { "epoch": 8.729096989966555, "grad_norm": 3.659803867340088, "learning_rate": 1.270903010033445e-05, "loss": 0.0328, "step": 13050 }, { "epoch": 8.735785953177258, "grad_norm": 1.5067414045333862, "learning_rate": 1.2642140468227426e-05, "loss": 0.0303, "step": 13060 }, { "epoch": 8.742474916387959, "grad_norm": 1.6895188093185425, "learning_rate": 1.2575250836120403e-05, "loss": 0.0248, "step": 13070 }, { "epoch": 8.749163879598662, "grad_norm": 2.7330615520477295, "learning_rate": 1.2508361204013378e-05, "loss": 0.0314, "step": 13080 }, { "epoch": 8.755852842809364, "grad_norm": 2.122676134109497, "learning_rate": 1.2441471571906355e-05, "loss": 0.0284, "step": 13090 }, { "epoch": 8.762541806020067, "grad_norm": 1.499622106552124, "learning_rate": 1.2374581939799331e-05, "loss": 0.0288, "step": 13100 }, { "epoch": 8.76923076923077, "grad_norm": 3.3748297691345215, "learning_rate": 1.230769230769231e-05, "loss": 0.0284, "step": 13110 }, { "epoch": 8.775919732441471, "grad_norm": 0.9968297481536865, "learning_rate": 1.2240802675585285e-05, "loss": 0.0279, "step": 13120 }, { "epoch": 8.782608695652174, "grad_norm": 0.6592386960983276, "learning_rate": 1.2173913043478261e-05, "loss": 0.0316, "step": 13130 }, { "epoch": 8.789297658862877, "grad_norm": 1.2029937505722046, "learning_rate": 1.2107023411371238e-05, "loss": 0.0244, "step": 13140 }, { "epoch": 8.79598662207358, "grad_norm": 0.8343057632446289, "learning_rate": 1.2040133779264215e-05, "loss": 0.0242, "step": 13150 }, { "epoch": 8.80267558528428, "grad_norm": 1.8353803157806396, "learning_rate": 1.197324414715719e-05, "loss": 0.0219, "step": 13160 }, { "epoch": 8.809364548494983, "grad_norm": 1.6544325351715088, "learning_rate": 1.1906354515050168e-05, "loss": 0.0272, "step": 13170 }, { "epoch": 8.816053511705686, "grad_norm": 0.7823971509933472, "learning_rate": 1.1839464882943143e-05, "loss": 0.0275, "step": 13180 }, { "epoch": 8.822742474916389, "grad_norm": 0.49826306104660034, "learning_rate": 1.177257525083612e-05, "loss": 0.0217, "step": 13190 }, { "epoch": 8.82943143812709, "grad_norm": 1.0228309631347656, "learning_rate": 1.1705685618729099e-05, "loss": 0.0241, "step": 13200 }, { "epoch": 8.836120401337793, "grad_norm": 2.224672317504883, "learning_rate": 1.1638795986622074e-05, "loss": 0.0317, "step": 13210 }, { "epoch": 8.842809364548495, "grad_norm": 2.9379260540008545, "learning_rate": 1.157190635451505e-05, "loss": 0.0318, "step": 13220 }, { "epoch": 8.849498327759198, "grad_norm": 2.134108066558838, "learning_rate": 1.1505016722408027e-05, "loss": 0.0195, "step": 13230 }, { "epoch": 8.856187290969899, "grad_norm": 3.203746795654297, "learning_rate": 1.1438127090301004e-05, "loss": 0.0285, "step": 13240 }, { "epoch": 8.862876254180602, "grad_norm": 0.8799943923950195, "learning_rate": 1.137123745819398e-05, "loss": 0.0369, "step": 13250 }, { "epoch": 8.869565217391305, "grad_norm": 0.89390629529953, "learning_rate": 1.1304347826086957e-05, "loss": 0.0339, "step": 13260 }, { "epoch": 8.876254180602007, "grad_norm": 2.9978513717651367, "learning_rate": 1.1237458193979934e-05, "loss": 0.0361, "step": 13270 }, { "epoch": 8.882943143812708, "grad_norm": 1.2196823358535767, "learning_rate": 1.117056856187291e-05, "loss": 0.0295, "step": 13280 }, { "epoch": 8.889632107023411, "grad_norm": 0.9094672799110413, "learning_rate": 1.1103678929765887e-05, "loss": 0.0254, "step": 13290 }, { "epoch": 8.896321070234114, "grad_norm": 2.8375725746154785, "learning_rate": 1.1036789297658862e-05, "loss": 0.0246, "step": 13300 }, { "epoch": 8.903010033444817, "grad_norm": 2.510275363922119, "learning_rate": 1.0969899665551841e-05, "loss": 0.0271, "step": 13310 }, { "epoch": 8.909698996655518, "grad_norm": 1.341292381286621, "learning_rate": 1.0903010033444816e-05, "loss": 0.02, "step": 13320 }, { "epoch": 8.91638795986622, "grad_norm": 0.7900250554084778, "learning_rate": 1.0836120401337793e-05, "loss": 0.0255, "step": 13330 }, { "epoch": 8.923076923076923, "grad_norm": 2.9481964111328125, "learning_rate": 1.0769230769230771e-05, "loss": 0.0329, "step": 13340 }, { "epoch": 8.929765886287626, "grad_norm": 1.1457481384277344, "learning_rate": 1.0702341137123746e-05, "loss": 0.0283, "step": 13350 }, { "epoch": 8.936454849498327, "grad_norm": 1.039831280708313, "learning_rate": 1.0635451505016723e-05, "loss": 0.0258, "step": 13360 }, { "epoch": 8.94314381270903, "grad_norm": 0.9117240309715271, "learning_rate": 1.05685618729097e-05, "loss": 0.0353, "step": 13370 }, { "epoch": 8.949832775919733, "grad_norm": 0.45773544907569885, "learning_rate": 1.0501672240802676e-05, "loss": 0.0285, "step": 13380 }, { "epoch": 8.956521739130435, "grad_norm": 0.676830530166626, "learning_rate": 1.0434782608695651e-05, "loss": 0.0208, "step": 13390 }, { "epoch": 8.963210702341136, "grad_norm": 1.4347857236862183, "learning_rate": 1.036789297658863e-05, "loss": 0.0281, "step": 13400 }, { "epoch": 8.96989966555184, "grad_norm": 0.6630375981330872, "learning_rate": 1.0301003344481606e-05, "loss": 0.0158, "step": 13410 }, { "epoch": 8.976588628762542, "grad_norm": 0.9883447289466858, "learning_rate": 1.0234113712374581e-05, "loss": 0.0208, "step": 13420 }, { "epoch": 8.983277591973245, "grad_norm": 1.1196333169937134, "learning_rate": 1.016722408026756e-05, "loss": 0.0248, "step": 13430 }, { "epoch": 8.989966555183946, "grad_norm": 1.1069833040237427, "learning_rate": 1.0100334448160535e-05, "loss": 0.0318, "step": 13440 }, { "epoch": 8.996655518394649, "grad_norm": 1.9332762956619263, "learning_rate": 1.0033444816053512e-05, "loss": 0.0311, "step": 13450 }, { "epoch": 9.0, "eval_loss": 0.0904405489563942, "eval_mse": 0.09044055640697479, "eval_runtime": 224.6401, "eval_samples_per_second": 13.31, "eval_steps_per_second": 1.665, "step": 13455 }, { "epoch": 9.003344481605351, "grad_norm": 0.9701605439186096, "learning_rate": 9.966555183946488e-06, "loss": 0.0237, "step": 13460 }, { "epoch": 9.010033444816054, "grad_norm": 2.653738260269165, "learning_rate": 9.899665551839465e-06, "loss": 0.0257, "step": 13470 }, { "epoch": 9.016722408026755, "grad_norm": 0.45054054260253906, "learning_rate": 9.832775919732442e-06, "loss": 0.0152, "step": 13480 }, { "epoch": 9.023411371237458, "grad_norm": 2.150256395339966, "learning_rate": 9.765886287625419e-06, "loss": 0.0194, "step": 13490 }, { "epoch": 9.03010033444816, "grad_norm": 1.6995000839233398, "learning_rate": 9.698996655518395e-06, "loss": 0.0217, "step": 13500 }, { "epoch": 9.036789297658864, "grad_norm": 0.4809478223323822, "learning_rate": 9.632107023411372e-06, "loss": 0.0187, "step": 13510 }, { "epoch": 9.043478260869565, "grad_norm": 1.1911168098449707, "learning_rate": 9.565217391304349e-06, "loss": 0.0142, "step": 13520 }, { "epoch": 9.050167224080267, "grad_norm": 2.6061320304870605, "learning_rate": 9.498327759197324e-06, "loss": 0.0227, "step": 13530 }, { "epoch": 9.05685618729097, "grad_norm": 0.42797496914863586, "learning_rate": 9.431438127090302e-06, "loss": 0.0195, "step": 13540 }, { "epoch": 9.063545150501673, "grad_norm": 1.5085166692733765, "learning_rate": 9.364548494983277e-06, "loss": 0.0269, "step": 13550 }, { "epoch": 9.070234113712374, "grad_norm": 0.48043566942214966, "learning_rate": 9.297658862876254e-06, "loss": 0.0165, "step": 13560 }, { "epoch": 9.076923076923077, "grad_norm": 1.4516146183013916, "learning_rate": 9.230769230769232e-06, "loss": 0.0162, "step": 13570 }, { "epoch": 9.08361204013378, "grad_norm": 0.9354628920555115, "learning_rate": 9.163879598662207e-06, "loss": 0.0185, "step": 13580 }, { "epoch": 9.090301003344482, "grad_norm": 0.7186703681945801, "learning_rate": 9.096989966555184e-06, "loss": 0.0164, "step": 13590 }, { "epoch": 9.096989966555183, "grad_norm": 0.677481472492218, "learning_rate": 9.03010033444816e-06, "loss": 0.0128, "step": 13600 }, { "epoch": 9.103678929765886, "grad_norm": 1.1981468200683594, "learning_rate": 8.963210702341138e-06, "loss": 0.0168, "step": 13610 }, { "epoch": 9.110367892976589, "grad_norm": 1.0445246696472168, "learning_rate": 8.896321070234114e-06, "loss": 0.0154, "step": 13620 }, { "epoch": 9.117056856187292, "grad_norm": 2.4534506797790527, "learning_rate": 8.829431438127091e-06, "loss": 0.0179, "step": 13630 }, { "epoch": 9.123745819397993, "grad_norm": 1.3992637395858765, "learning_rate": 8.762541806020068e-06, "loss": 0.0122, "step": 13640 }, { "epoch": 9.130434782608695, "grad_norm": 0.882490873336792, "learning_rate": 8.695652173913044e-06, "loss": 0.0178, "step": 13650 }, { "epoch": 9.137123745819398, "grad_norm": 2.6310842037200928, "learning_rate": 8.628762541806021e-06, "loss": 0.0206, "step": 13660 }, { "epoch": 9.143812709030101, "grad_norm": 2.426238536834717, "learning_rate": 8.561872909698996e-06, "loss": 0.0215, "step": 13670 }, { "epoch": 9.150501672240802, "grad_norm": 1.248909831047058, "learning_rate": 8.494983277591975e-06, "loss": 0.017, "step": 13680 }, { "epoch": 9.157190635451505, "grad_norm": 0.7255083322525024, "learning_rate": 8.42809364548495e-06, "loss": 0.0225, "step": 13690 }, { "epoch": 9.163879598662207, "grad_norm": 1.6974523067474365, "learning_rate": 8.361204013377926e-06, "loss": 0.0176, "step": 13700 }, { "epoch": 9.17056856187291, "grad_norm": 1.3242517709732056, "learning_rate": 8.294314381270905e-06, "loss": 0.0248, "step": 13710 }, { "epoch": 9.177257525083611, "grad_norm": 1.7846802473068237, "learning_rate": 8.22742474916388e-06, "loss": 0.0199, "step": 13720 }, { "epoch": 9.183946488294314, "grad_norm": 1.2862492799758911, "learning_rate": 8.160535117056857e-06, "loss": 0.0229, "step": 13730 }, { "epoch": 9.190635451505017, "grad_norm": 2.379707098007202, "learning_rate": 8.093645484949833e-06, "loss": 0.0218, "step": 13740 }, { "epoch": 9.19732441471572, "grad_norm": 1.0562094449996948, "learning_rate": 8.02675585284281e-06, "loss": 0.0169, "step": 13750 }, { "epoch": 9.20401337792642, "grad_norm": 2.632425308227539, "learning_rate": 7.959866220735785e-06, "loss": 0.0288, "step": 13760 }, { "epoch": 9.210702341137123, "grad_norm": 0.6736680865287781, "learning_rate": 7.892976588628763e-06, "loss": 0.0157, "step": 13770 }, { "epoch": 9.217391304347826, "grad_norm": 2.769725799560547, "learning_rate": 7.82608695652174e-06, "loss": 0.0193, "step": 13780 }, { "epoch": 9.224080267558529, "grad_norm": 1.3798658847808838, "learning_rate": 7.759197324414715e-06, "loss": 0.0335, "step": 13790 }, { "epoch": 9.23076923076923, "grad_norm": 0.8057318329811096, "learning_rate": 7.692307692307694e-06, "loss": 0.0226, "step": 13800 }, { "epoch": 9.237458193979933, "grad_norm": 1.60343337059021, "learning_rate": 7.625418060200669e-06, "loss": 0.0213, "step": 13810 }, { "epoch": 9.244147157190636, "grad_norm": 0.9643651247024536, "learning_rate": 7.558528428093646e-06, "loss": 0.0201, "step": 13820 }, { "epoch": 9.250836120401338, "grad_norm": 1.777146577835083, "learning_rate": 7.491638795986622e-06, "loss": 0.0198, "step": 13830 }, { "epoch": 9.25752508361204, "grad_norm": 0.708709180355072, "learning_rate": 7.424749163879599e-06, "loss": 0.0134, "step": 13840 }, { "epoch": 9.264214046822742, "grad_norm": 0.22371377050876617, "learning_rate": 7.357859531772576e-06, "loss": 0.0186, "step": 13850 }, { "epoch": 9.270903010033445, "grad_norm": 3.0788733959198, "learning_rate": 7.290969899665552e-06, "loss": 0.0178, "step": 13860 }, { "epoch": 9.277591973244148, "grad_norm": 1.888277530670166, "learning_rate": 7.224080267558529e-06, "loss": 0.0184, "step": 13870 }, { "epoch": 9.284280936454849, "grad_norm": 1.3582139015197754, "learning_rate": 7.157190635451505e-06, "loss": 0.026, "step": 13880 }, { "epoch": 9.290969899665551, "grad_norm": 1.5523234605789185, "learning_rate": 7.090301003344482e-06, "loss": 0.0173, "step": 13890 }, { "epoch": 9.297658862876254, "grad_norm": 0.42221876978874207, "learning_rate": 7.023411371237458e-06, "loss": 0.023, "step": 13900 }, { "epoch": 9.304347826086957, "grad_norm": 0.5811980962753296, "learning_rate": 6.956521739130435e-06, "loss": 0.0175, "step": 13910 }, { "epoch": 9.31103678929766, "grad_norm": 1.5026706457138062, "learning_rate": 6.889632107023413e-06, "loss": 0.0126, "step": 13920 }, { "epoch": 9.31772575250836, "grad_norm": 0.8601313233375549, "learning_rate": 6.8227424749163885e-06, "loss": 0.0266, "step": 13930 }, { "epoch": 9.324414715719064, "grad_norm": 0.26971110701560974, "learning_rate": 6.755852842809365e-06, "loss": 0.0122, "step": 13940 }, { "epoch": 9.331103678929766, "grad_norm": 0.6851016879081726, "learning_rate": 6.688963210702341e-06, "loss": 0.0228, "step": 13950 }, { "epoch": 9.337792642140467, "grad_norm": 1.128944993019104, "learning_rate": 6.622073578595319e-06, "loss": 0.0166, "step": 13960 }, { "epoch": 9.34448160535117, "grad_norm": 0.764406144618988, "learning_rate": 6.5551839464882945e-06, "loss": 0.0134, "step": 13970 }, { "epoch": 9.351170568561873, "grad_norm": 1.1070597171783447, "learning_rate": 6.488294314381271e-06, "loss": 0.0223, "step": 13980 }, { "epoch": 9.357859531772576, "grad_norm": 2.363032102584839, "learning_rate": 6.421404682274247e-06, "loss": 0.0163, "step": 13990 }, { "epoch": 9.364548494983278, "grad_norm": 2.318466901779175, "learning_rate": 6.354515050167225e-06, "loss": 0.0213, "step": 14000 }, { "epoch": 9.37123745819398, "grad_norm": 2.3102550506591797, "learning_rate": 6.287625418060201e-06, "loss": 0.0194, "step": 14010 }, { "epoch": 9.377926421404682, "grad_norm": 0.9690243601799011, "learning_rate": 6.220735785953177e-06, "loss": 0.0186, "step": 14020 }, { "epoch": 9.384615384615385, "grad_norm": 4.312711238861084, "learning_rate": 6.153846153846155e-06, "loss": 0.021, "step": 14030 }, { "epoch": 9.391304347826088, "grad_norm": 1.6641044616699219, "learning_rate": 6.086956521739131e-06, "loss": 0.0266, "step": 14040 }, { "epoch": 9.397993311036789, "grad_norm": 0.8408416509628296, "learning_rate": 6.0200668896321075e-06, "loss": 0.0162, "step": 14050 }, { "epoch": 9.404682274247492, "grad_norm": 0.6300095319747925, "learning_rate": 5.953177257525084e-06, "loss": 0.0135, "step": 14060 }, { "epoch": 9.411371237458194, "grad_norm": 1.318575143814087, "learning_rate": 5.88628762541806e-06, "loss": 0.0159, "step": 14070 }, { "epoch": 9.418060200668897, "grad_norm": 2.5081329345703125, "learning_rate": 5.819397993311037e-06, "loss": 0.0137, "step": 14080 }, { "epoch": 9.424749163879598, "grad_norm": 2.2743217945098877, "learning_rate": 5.7525083612040135e-06, "loss": 0.0195, "step": 14090 }, { "epoch": 9.431438127090301, "grad_norm": 0.530133068561554, "learning_rate": 5.68561872909699e-06, "loss": 0.0239, "step": 14100 }, { "epoch": 9.438127090301004, "grad_norm": 0.6645513772964478, "learning_rate": 5.618729096989967e-06, "loss": 0.0149, "step": 14110 }, { "epoch": 9.444816053511706, "grad_norm": 0.9330313801765442, "learning_rate": 5.551839464882944e-06, "loss": 0.0256, "step": 14120 }, { "epoch": 9.451505016722408, "grad_norm": 0.29505959153175354, "learning_rate": 5.4849498327759204e-06, "loss": 0.0155, "step": 14130 }, { "epoch": 9.45819397993311, "grad_norm": 0.9493227601051331, "learning_rate": 5.418060200668896e-06, "loss": 0.0184, "step": 14140 }, { "epoch": 9.464882943143813, "grad_norm": 2.8090169429779053, "learning_rate": 5.351170568561873e-06, "loss": 0.0114, "step": 14150 }, { "epoch": 9.471571906354516, "grad_norm": 0.5139400362968445, "learning_rate": 5.28428093645485e-06, "loss": 0.0172, "step": 14160 }, { "epoch": 9.478260869565217, "grad_norm": 0.5248110294342041, "learning_rate": 5.217391304347826e-06, "loss": 0.0149, "step": 14170 }, { "epoch": 9.48494983277592, "grad_norm": 1.211573600769043, "learning_rate": 5.150501672240803e-06, "loss": 0.0204, "step": 14180 }, { "epoch": 9.491638795986622, "grad_norm": 1.8907419443130493, "learning_rate": 5.08361204013378e-06, "loss": 0.0155, "step": 14190 }, { "epoch": 9.498327759197325, "grad_norm": 1.7588986158370972, "learning_rate": 5.016722408026756e-06, "loss": 0.0156, "step": 14200 }, { "epoch": 9.505016722408026, "grad_norm": 0.8350905776023865, "learning_rate": 4.9498327759197325e-06, "loss": 0.0203, "step": 14210 }, { "epoch": 9.511705685618729, "grad_norm": 0.8791153430938721, "learning_rate": 4.882943143812709e-06, "loss": 0.0239, "step": 14220 }, { "epoch": 9.518394648829432, "grad_norm": 3.2824668884277344, "learning_rate": 4.816053511705686e-06, "loss": 0.0278, "step": 14230 }, { "epoch": 9.525083612040135, "grad_norm": 0.2794869542121887, "learning_rate": 4.749163879598662e-06, "loss": 0.0236, "step": 14240 }, { "epoch": 9.531772575250836, "grad_norm": 0.37374019622802734, "learning_rate": 4.682274247491639e-06, "loss": 0.0223, "step": 14250 }, { "epoch": 9.538461538461538, "grad_norm": 0.9982376098632812, "learning_rate": 4.615384615384616e-06, "loss": 0.018, "step": 14260 }, { "epoch": 9.545150501672241, "grad_norm": 0.356210321187973, "learning_rate": 4.548494983277592e-06, "loss": 0.0162, "step": 14270 }, { "epoch": 9.551839464882944, "grad_norm": 0.5105440616607666, "learning_rate": 4.481605351170569e-06, "loss": 0.0252, "step": 14280 }, { "epoch": 9.558528428093645, "grad_norm": 2.1598117351531982, "learning_rate": 4.4147157190635455e-06, "loss": 0.0177, "step": 14290 }, { "epoch": 9.565217391304348, "grad_norm": 1.0519219636917114, "learning_rate": 4.347826086956522e-06, "loss": 0.016, "step": 14300 }, { "epoch": 9.57190635451505, "grad_norm": 0.4185671806335449, "learning_rate": 4.280936454849498e-06, "loss": 0.0271, "step": 14310 }, { "epoch": 9.578595317725753, "grad_norm": 1.2913966178894043, "learning_rate": 4.214046822742475e-06, "loss": 0.0227, "step": 14320 }, { "epoch": 9.585284280936454, "grad_norm": 2.225336790084839, "learning_rate": 4.147157190635452e-06, "loss": 0.0174, "step": 14330 }, { "epoch": 9.591973244147157, "grad_norm": 1.5562801361083984, "learning_rate": 4.080267558528428e-06, "loss": 0.0201, "step": 14340 }, { "epoch": 9.59866220735786, "grad_norm": 1.298086404800415, "learning_rate": 4.013377926421405e-06, "loss": 0.0189, "step": 14350 }, { "epoch": 9.605351170568563, "grad_norm": 0.8389520049095154, "learning_rate": 3.946488294314382e-06, "loss": 0.0206, "step": 14360 }, { "epoch": 9.612040133779264, "grad_norm": 1.0966200828552246, "learning_rate": 3.879598662207358e-06, "loss": 0.0198, "step": 14370 }, { "epoch": 9.618729096989966, "grad_norm": 1.133457899093628, "learning_rate": 3.8127090301003343e-06, "loss": 0.0109, "step": 14380 }, { "epoch": 9.62541806020067, "grad_norm": 1.0652483701705933, "learning_rate": 3.745819397993311e-06, "loss": 0.019, "step": 14390 }, { "epoch": 9.632107023411372, "grad_norm": 1.9861187934875488, "learning_rate": 3.678929765886288e-06, "loss": 0.0271, "step": 14400 }, { "epoch": 9.638795986622073, "grad_norm": 2.377269744873047, "learning_rate": 3.6120401337792645e-06, "loss": 0.021, "step": 14410 }, { "epoch": 9.645484949832776, "grad_norm": 1.7457759380340576, "learning_rate": 3.545150501672241e-06, "loss": 0.0234, "step": 14420 }, { "epoch": 9.652173913043478, "grad_norm": 4.8636884689331055, "learning_rate": 3.4782608695652175e-06, "loss": 0.0197, "step": 14430 }, { "epoch": 9.658862876254181, "grad_norm": 0.41581761837005615, "learning_rate": 3.4113712374581942e-06, "loss": 0.0149, "step": 14440 }, { "epoch": 9.665551839464882, "grad_norm": 0.6414255499839783, "learning_rate": 3.3444816053511705e-06, "loss": 0.0232, "step": 14450 }, { "epoch": 9.672240802675585, "grad_norm": 0.9758875966072083, "learning_rate": 3.2775919732441473e-06, "loss": 0.0246, "step": 14460 }, { "epoch": 9.678929765886288, "grad_norm": 1.265362024307251, "learning_rate": 3.2107023411371236e-06, "loss": 0.0137, "step": 14470 }, { "epoch": 9.68561872909699, "grad_norm": 1.263174295425415, "learning_rate": 3.1438127090301007e-06, "loss": 0.0156, "step": 14480 }, { "epoch": 9.692307692307692, "grad_norm": 2.886913776397705, "learning_rate": 3.0769230769230774e-06, "loss": 0.0175, "step": 14490 }, { "epoch": 9.698996655518394, "grad_norm": 0.6035112142562866, "learning_rate": 3.0100334448160537e-06, "loss": 0.0172, "step": 14500 }, { "epoch": 9.705685618729097, "grad_norm": 1.9714527130126953, "learning_rate": 2.94314381270903e-06, "loss": 0.0191, "step": 14510 }, { "epoch": 9.7123745819398, "grad_norm": 0.6139861345291138, "learning_rate": 2.8762541806020068e-06, "loss": 0.011, "step": 14520 }, { "epoch": 9.719063545150501, "grad_norm": 1.1653517484664917, "learning_rate": 2.8093645484949835e-06, "loss": 0.0269, "step": 14530 }, { "epoch": 9.725752508361204, "grad_norm": 1.1893435716629028, "learning_rate": 2.7424749163879602e-06, "loss": 0.0121, "step": 14540 }, { "epoch": 9.732441471571907, "grad_norm": 0.5057249665260315, "learning_rate": 2.6755852842809365e-06, "loss": 0.03, "step": 14550 }, { "epoch": 9.73913043478261, "grad_norm": 0.7675619125366211, "learning_rate": 2.608695652173913e-06, "loss": 0.0228, "step": 14560 }, { "epoch": 9.74581939799331, "grad_norm": 0.9619132876396179, "learning_rate": 2.54180602006689e-06, "loss": 0.0199, "step": 14570 }, { "epoch": 9.752508361204013, "grad_norm": 1.727118968963623, "learning_rate": 2.4749163879598663e-06, "loss": 0.0255, "step": 14580 }, { "epoch": 9.759197324414716, "grad_norm": 0.8463283777236938, "learning_rate": 2.408026755852843e-06, "loss": 0.0178, "step": 14590 }, { "epoch": 9.765886287625419, "grad_norm": 1.2253174781799316, "learning_rate": 2.3411371237458193e-06, "loss": 0.0201, "step": 14600 }, { "epoch": 9.77257525083612, "grad_norm": 0.49370265007019043, "learning_rate": 2.274247491638796e-06, "loss": 0.0218, "step": 14610 }, { "epoch": 9.779264214046822, "grad_norm": 1.9811979532241821, "learning_rate": 2.2073578595317727e-06, "loss": 0.0242, "step": 14620 }, { "epoch": 9.785953177257525, "grad_norm": 0.901732325553894, "learning_rate": 2.140468227424749e-06, "loss": 0.0163, "step": 14630 }, { "epoch": 9.792642140468228, "grad_norm": 1.5021089315414429, "learning_rate": 2.073578595317726e-06, "loss": 0.0171, "step": 14640 }, { "epoch": 9.799331103678929, "grad_norm": 0.9714933037757874, "learning_rate": 2.0066889632107025e-06, "loss": 0.0175, "step": 14650 }, { "epoch": 9.806020066889632, "grad_norm": 2.403262138366699, "learning_rate": 1.939799331103679e-06, "loss": 0.0261, "step": 14660 }, { "epoch": 9.812709030100335, "grad_norm": 0.4679270386695862, "learning_rate": 1.8729096989966555e-06, "loss": 0.0148, "step": 14670 }, { "epoch": 9.819397993311037, "grad_norm": 0.800066351890564, "learning_rate": 1.8060200668896322e-06, "loss": 0.0122, "step": 14680 }, { "epoch": 9.826086956521738, "grad_norm": 1.5774024724960327, "learning_rate": 1.7391304347826088e-06, "loss": 0.013, "step": 14690 }, { "epoch": 9.832775919732441, "grad_norm": 0.4182083308696747, "learning_rate": 1.6722408026755853e-06, "loss": 0.017, "step": 14700 }, { "epoch": 9.839464882943144, "grad_norm": 2.029318332672119, "learning_rate": 1.6053511705685618e-06, "loss": 0.0318, "step": 14710 }, { "epoch": 9.846153846153847, "grad_norm": 0.7166652083396912, "learning_rate": 1.5384615384615387e-06, "loss": 0.015, "step": 14720 }, { "epoch": 9.852842809364548, "grad_norm": 1.4269553422927856, "learning_rate": 1.471571906354515e-06, "loss": 0.0156, "step": 14730 }, { "epoch": 9.85953177257525, "grad_norm": 2.2864603996276855, "learning_rate": 1.4046822742474917e-06, "loss": 0.0179, "step": 14740 }, { "epoch": 9.866220735785953, "grad_norm": 0.5288352370262146, "learning_rate": 1.3377926421404683e-06, "loss": 0.0147, "step": 14750 }, { "epoch": 9.872909698996656, "grad_norm": 2.4506185054779053, "learning_rate": 1.270903010033445e-06, "loss": 0.0146, "step": 14760 }, { "epoch": 9.879598662207357, "grad_norm": 0.7271729111671448, "learning_rate": 1.2040133779264215e-06, "loss": 0.0187, "step": 14770 }, { "epoch": 9.88628762541806, "grad_norm": 2.37717342376709, "learning_rate": 1.137123745819398e-06, "loss": 0.0194, "step": 14780 }, { "epoch": 9.892976588628763, "grad_norm": 1.2071961164474487, "learning_rate": 1.0702341137123745e-06, "loss": 0.0124, "step": 14790 }, { "epoch": 9.899665551839465, "grad_norm": 0.8646990656852722, "learning_rate": 1.0033444816053512e-06, "loss": 0.026, "step": 14800 }, { "epoch": 9.906354515050166, "grad_norm": 1.959161400794983, "learning_rate": 9.364548494983278e-07, "loss": 0.0171, "step": 14810 }, { "epoch": 9.91304347826087, "grad_norm": 0.8739883303642273, "learning_rate": 8.695652173913044e-07, "loss": 0.0146, "step": 14820 }, { "epoch": 9.919732441471572, "grad_norm": 1.5968456268310547, "learning_rate": 8.026755852842809e-07, "loss": 0.0176, "step": 14830 }, { "epoch": 9.926421404682275, "grad_norm": 0.7556308507919312, "learning_rate": 7.357859531772575e-07, "loss": 0.0199, "step": 14840 }, { "epoch": 9.933110367892976, "grad_norm": 1.8411972522735596, "learning_rate": 6.688963210702341e-07, "loss": 0.0185, "step": 14850 }, { "epoch": 9.939799331103679, "grad_norm": 2.0228002071380615, "learning_rate": 6.020066889632107e-07, "loss": 0.0174, "step": 14860 }, { "epoch": 9.946488294314381, "grad_norm": 0.7559150457382202, "learning_rate": 5.351170568561873e-07, "loss": 0.0239, "step": 14870 }, { "epoch": 9.953177257525084, "grad_norm": 1.5705734491348267, "learning_rate": 4.682274247491639e-07, "loss": 0.0191, "step": 14880 }, { "epoch": 9.959866220735787, "grad_norm": 0.48421409726142883, "learning_rate": 4.0133779264214045e-07, "loss": 0.0183, "step": 14890 }, { "epoch": 9.966555183946488, "grad_norm": 1.3886953592300415, "learning_rate": 3.3444816053511706e-07, "loss": 0.0226, "step": 14900 }, { "epoch": 9.97324414715719, "grad_norm": 0.9066689610481262, "learning_rate": 2.6755852842809363e-07, "loss": 0.0131, "step": 14910 }, { "epoch": 9.979933110367893, "grad_norm": 2.4503183364868164, "learning_rate": 2.0066889632107022e-07, "loss": 0.0182, "step": 14920 }, { "epoch": 9.986622073578594, "grad_norm": 1.642620325088501, "learning_rate": 1.3377926421404682e-07, "loss": 0.0119, "step": 14930 }, { "epoch": 9.993311036789297, "grad_norm": 1.9654911756515503, "learning_rate": 6.688963210702341e-08, "loss": 0.0209, "step": 14940 }, { "epoch": 10.0, "grad_norm": 0.5067110061645508, "learning_rate": 0.0, "loss": 0.0161, "step": 14950 } ], "logging_steps": 10, "max_steps": 14950, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }