{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200, "global_step": 2666, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007501875468867217, "grad_norm": 0.13885293900966644, "learning_rate": 3.7453183520599253e-07, "loss": 0.1628, "step": 1 }, { "epoch": 0.0015003750937734434, "grad_norm": 0.18549223244190216, "learning_rate": 7.490636704119851e-07, "loss": 0.2412, "step": 2 }, { "epoch": 0.002250562640660165, "grad_norm": 0.1155385822057724, "learning_rate": 1.1235955056179775e-06, "loss": 0.1319, "step": 3 }, { "epoch": 0.003000750187546887, "grad_norm": 0.15342453122138977, "learning_rate": 1.4981273408239701e-06, "loss": 0.2292, "step": 4 }, { "epoch": 0.0037509377344336083, "grad_norm": 0.12306933850049973, "learning_rate": 1.8726591760299627e-06, "loss": 0.1508, "step": 5 }, { "epoch": 0.00450112528132033, "grad_norm": 0.191707581281662, "learning_rate": 2.247191011235955e-06, "loss": 0.2169, "step": 6 }, { "epoch": 0.005251312828207052, "grad_norm": 0.13260212540626526, "learning_rate": 2.621722846441948e-06, "loss": 0.1464, "step": 7 }, { "epoch": 0.006001500375093774, "grad_norm": 0.10345903784036636, "learning_rate": 2.9962546816479402e-06, "loss": 0.1349, "step": 8 }, { "epoch": 0.006751687921980495, "grad_norm": 0.10904919356107712, "learning_rate": 3.3707865168539327e-06, "loss": 0.1499, "step": 9 }, { "epoch": 0.007501875468867217, "grad_norm": 0.12517283856868744, "learning_rate": 3.7453183520599255e-06, "loss": 0.18, "step": 10 }, { "epoch": 0.008252063015753939, "grad_norm": 0.12791502475738525, "learning_rate": 4.1198501872659175e-06, "loss": 0.101, "step": 11 }, { "epoch": 0.00900225056264066, "grad_norm": 0.17475782334804535, "learning_rate": 4.49438202247191e-06, "loss": 0.2105, "step": 12 }, { "epoch": 0.009752438109527382, "grad_norm": 0.17885951697826385, "learning_rate": 4.868913857677903e-06, "loss": 0.2074, "step": 13 }, { "epoch": 0.010502625656414103, "grad_norm": 0.3866336941719055, "learning_rate": 5.243445692883896e-06, "loss": 0.3252, "step": 14 }, { "epoch": 0.011252813203300824, "grad_norm": 0.13714569807052612, "learning_rate": 5.617977528089888e-06, "loss": 0.1515, "step": 15 }, { "epoch": 0.012003000750187547, "grad_norm": 0.2914994955062866, "learning_rate": 5.9925093632958805e-06, "loss": 0.1959, "step": 16 }, { "epoch": 0.012753188297074268, "grad_norm": 0.09515587240457535, "learning_rate": 6.367041198501873e-06, "loss": 0.0907, "step": 17 }, { "epoch": 0.01350337584396099, "grad_norm": 0.11435846239328384, "learning_rate": 6.741573033707865e-06, "loss": 0.1067, "step": 18 }, { "epoch": 0.014253563390847712, "grad_norm": 0.11161951720714569, "learning_rate": 7.116104868913858e-06, "loss": 0.1085, "step": 19 }, { "epoch": 0.015003750937734433, "grad_norm": 0.2902528941631317, "learning_rate": 7.490636704119851e-06, "loss": 0.3206, "step": 20 }, { "epoch": 0.015753938484621154, "grad_norm": 0.1824679672718048, "learning_rate": 7.865168539325843e-06, "loss": 0.1762, "step": 21 }, { "epoch": 0.016504126031507877, "grad_norm": 0.19523698091506958, "learning_rate": 8.239700374531835e-06, "loss": 0.2151, "step": 22 }, { "epoch": 0.0172543135783946, "grad_norm": 0.18187110126018524, "learning_rate": 8.614232209737828e-06, "loss": 0.1724, "step": 23 }, { "epoch": 0.01800450112528132, "grad_norm": 0.2814335823059082, "learning_rate": 8.98876404494382e-06, "loss": 0.1468, "step": 24 }, { "epoch": 0.018754688672168042, "grad_norm": 0.19284458458423615, "learning_rate": 9.363295880149813e-06, "loss": 0.1513, "step": 25 }, { "epoch": 0.019504876219054765, "grad_norm": 0.1800457239151001, "learning_rate": 9.737827715355806e-06, "loss": 0.1771, "step": 26 }, { "epoch": 0.020255063765941484, "grad_norm": 0.14044098556041718, "learning_rate": 1.0112359550561798e-05, "loss": 0.1252, "step": 27 }, { "epoch": 0.021005251312828207, "grad_norm": 0.30116593837738037, "learning_rate": 1.0486891385767791e-05, "loss": 0.2511, "step": 28 }, { "epoch": 0.02175543885971493, "grad_norm": 0.144786074757576, "learning_rate": 1.0861423220973783e-05, "loss": 0.1473, "step": 29 }, { "epoch": 0.02250562640660165, "grad_norm": 0.1813868284225464, "learning_rate": 1.1235955056179776e-05, "loss": 0.1257, "step": 30 }, { "epoch": 0.023255813953488372, "grad_norm": 0.14789240062236786, "learning_rate": 1.161048689138577e-05, "loss": 0.1564, "step": 31 }, { "epoch": 0.024006001500375095, "grad_norm": 0.11978735774755478, "learning_rate": 1.1985018726591761e-05, "loss": 0.0879, "step": 32 }, { "epoch": 0.024756189047261814, "grad_norm": 0.24735777080059052, "learning_rate": 1.2359550561797752e-05, "loss": 0.1878, "step": 33 }, { "epoch": 0.025506376594148537, "grad_norm": 0.16269366443157196, "learning_rate": 1.2734082397003746e-05, "loss": 0.1552, "step": 34 }, { "epoch": 0.02625656414103526, "grad_norm": 0.25208044052124023, "learning_rate": 1.3108614232209737e-05, "loss": 0.227, "step": 35 }, { "epoch": 0.02700675168792198, "grad_norm": 0.16651014983654022, "learning_rate": 1.348314606741573e-05, "loss": 0.1223, "step": 36 }, { "epoch": 0.0277569392348087, "grad_norm": 0.21705418825149536, "learning_rate": 1.3857677902621724e-05, "loss": 0.1634, "step": 37 }, { "epoch": 0.028507126781695424, "grad_norm": 0.1602638065814972, "learning_rate": 1.4232209737827715e-05, "loss": 0.1174, "step": 38 }, { "epoch": 0.029257314328582147, "grad_norm": 0.37907129526138306, "learning_rate": 1.4606741573033709e-05, "loss": 0.2483, "step": 39 }, { "epoch": 0.030007501875468866, "grad_norm": 0.13058216869831085, "learning_rate": 1.4981273408239702e-05, "loss": 0.117, "step": 40 }, { "epoch": 0.03075768942235559, "grad_norm": 0.11338390409946442, "learning_rate": 1.5355805243445692e-05, "loss": 0.1113, "step": 41 }, { "epoch": 0.03150787696924231, "grad_norm": 0.2932577133178711, "learning_rate": 1.5730337078651687e-05, "loss": 0.1604, "step": 42 }, { "epoch": 0.03225806451612903, "grad_norm": 0.17441685497760773, "learning_rate": 1.610486891385768e-05, "loss": 0.1427, "step": 43 }, { "epoch": 0.033008252063015754, "grad_norm": 0.2918965518474579, "learning_rate": 1.647940074906367e-05, "loss": 0.14, "step": 44 }, { "epoch": 0.03375843960990248, "grad_norm": 0.2671569883823395, "learning_rate": 1.6853932584269665e-05, "loss": 0.1558, "step": 45 }, { "epoch": 0.0345086271567892, "grad_norm": 0.2006506770849228, "learning_rate": 1.7228464419475657e-05, "loss": 0.1355, "step": 46 }, { "epoch": 0.035258814703675916, "grad_norm": 0.15463334321975708, "learning_rate": 1.760299625468165e-05, "loss": 0.1249, "step": 47 }, { "epoch": 0.03600900225056264, "grad_norm": 0.394148588180542, "learning_rate": 1.797752808988764e-05, "loss": 0.2662, "step": 48 }, { "epoch": 0.03675918979744936, "grad_norm": 0.32093098759651184, "learning_rate": 1.8352059925093635e-05, "loss": 0.2, "step": 49 }, { "epoch": 0.037509377344336084, "grad_norm": 0.127495676279068, "learning_rate": 1.8726591760299626e-05, "loss": 0.1226, "step": 50 }, { "epoch": 0.03825956489122281, "grad_norm": 0.1350037157535553, "learning_rate": 1.9101123595505618e-05, "loss": 0.0691, "step": 51 }, { "epoch": 0.03900975243810953, "grad_norm": 0.24726983904838562, "learning_rate": 1.9475655430711613e-05, "loss": 0.1451, "step": 52 }, { "epoch": 0.03975993998499625, "grad_norm": 0.30878591537475586, "learning_rate": 1.9850187265917604e-05, "loss": 0.2132, "step": 53 }, { "epoch": 0.04051012753188297, "grad_norm": 0.2271026372909546, "learning_rate": 2.0224719101123596e-05, "loss": 0.132, "step": 54 }, { "epoch": 0.04126031507876969, "grad_norm": 0.2296169400215149, "learning_rate": 2.059925093632959e-05, "loss": 0.1274, "step": 55 }, { "epoch": 0.042010502625656414, "grad_norm": 0.1105537861585617, "learning_rate": 2.0973782771535582e-05, "loss": 0.0744, "step": 56 }, { "epoch": 0.04276069017254314, "grad_norm": 0.2400723546743393, "learning_rate": 2.1348314606741574e-05, "loss": 0.1027, "step": 57 }, { "epoch": 0.04351087771942986, "grad_norm": 0.5178837180137634, "learning_rate": 2.1722846441947566e-05, "loss": 0.1871, "step": 58 }, { "epoch": 0.04426106526631658, "grad_norm": 0.16341057419776917, "learning_rate": 2.209737827715356e-05, "loss": 0.0965, "step": 59 }, { "epoch": 0.0450112528132033, "grad_norm": 0.1322585940361023, "learning_rate": 2.2471910112359552e-05, "loss": 0.1001, "step": 60 }, { "epoch": 0.04576144036009002, "grad_norm": 0.20453320443630219, "learning_rate": 2.2846441947565544e-05, "loss": 0.1281, "step": 61 }, { "epoch": 0.046511627906976744, "grad_norm": 0.16821111738681793, "learning_rate": 2.322097378277154e-05, "loss": 0.0779, "step": 62 }, { "epoch": 0.047261815453863466, "grad_norm": 0.23807474970817566, "learning_rate": 2.359550561797753e-05, "loss": 0.16, "step": 63 }, { "epoch": 0.04801200300075019, "grad_norm": 0.22742918133735657, "learning_rate": 2.3970037453183522e-05, "loss": 0.12, "step": 64 }, { "epoch": 0.04876219054763691, "grad_norm": 0.15059584379196167, "learning_rate": 2.4344569288389517e-05, "loss": 0.085, "step": 65 }, { "epoch": 0.04951237809452363, "grad_norm": 0.4061318635940552, "learning_rate": 2.4719101123595505e-05, "loss": 0.107, "step": 66 }, { "epoch": 0.05026256564141035, "grad_norm": 0.14806576073169708, "learning_rate": 2.50936329588015e-05, "loss": 0.0733, "step": 67 }, { "epoch": 0.05101275318829707, "grad_norm": 0.22779157757759094, "learning_rate": 2.546816479400749e-05, "loss": 0.0834, "step": 68 }, { "epoch": 0.051762940735183796, "grad_norm": 0.14479000866413116, "learning_rate": 2.5842696629213486e-05, "loss": 0.1031, "step": 69 }, { "epoch": 0.05251312828207052, "grad_norm": 0.134308323264122, "learning_rate": 2.6217228464419475e-05, "loss": 0.0768, "step": 70 }, { "epoch": 0.05326331582895724, "grad_norm": 0.2619442641735077, "learning_rate": 2.6591760299625466e-05, "loss": 0.1374, "step": 71 }, { "epoch": 0.05401350337584396, "grad_norm": 0.15653356909751892, "learning_rate": 2.696629213483146e-05, "loss": 0.1015, "step": 72 }, { "epoch": 0.05476369092273068, "grad_norm": 0.16244176030158997, "learning_rate": 2.7340823970037456e-05, "loss": 0.0686, "step": 73 }, { "epoch": 0.0555138784696174, "grad_norm": 0.2526662051677704, "learning_rate": 2.7715355805243448e-05, "loss": 0.1234, "step": 74 }, { "epoch": 0.056264066016504126, "grad_norm": 0.22778008878231049, "learning_rate": 2.8089887640449443e-05, "loss": 0.1534, "step": 75 }, { "epoch": 0.05701425356339085, "grad_norm": 0.27490678429603577, "learning_rate": 2.846441947565543e-05, "loss": 0.1303, "step": 76 }, { "epoch": 0.05776444111027757, "grad_norm": 0.13964544236660004, "learning_rate": 2.8838951310861422e-05, "loss": 0.0939, "step": 77 }, { "epoch": 0.058514628657164294, "grad_norm": 0.20751768350601196, "learning_rate": 2.9213483146067417e-05, "loss": 0.1046, "step": 78 }, { "epoch": 0.05926481620405101, "grad_norm": 0.2151343822479248, "learning_rate": 2.958801498127341e-05, "loss": 0.0932, "step": 79 }, { "epoch": 0.06001500375093773, "grad_norm": 0.13716846704483032, "learning_rate": 2.9962546816479404e-05, "loss": 0.0689, "step": 80 }, { "epoch": 0.060765191297824456, "grad_norm": 0.204776793718338, "learning_rate": 3.0337078651685396e-05, "loss": 0.0959, "step": 81 }, { "epoch": 0.06151537884471118, "grad_norm": 0.37804269790649414, "learning_rate": 3.0711610486891384e-05, "loss": 0.1014, "step": 82 }, { "epoch": 0.0622655663915979, "grad_norm": 0.2825986444950104, "learning_rate": 3.108614232209738e-05, "loss": 0.1371, "step": 83 }, { "epoch": 0.06301575393848462, "grad_norm": 0.1622900366783142, "learning_rate": 3.1460674157303374e-05, "loss": 0.0767, "step": 84 }, { "epoch": 0.06376594148537135, "grad_norm": 0.21700279414653778, "learning_rate": 3.183520599250936e-05, "loss": 0.0826, "step": 85 }, { "epoch": 0.06451612903225806, "grad_norm": 0.14375083148479462, "learning_rate": 3.220973782771536e-05, "loss": 0.0766, "step": 86 }, { "epoch": 0.06526631657914479, "grad_norm": 0.20387856662273407, "learning_rate": 3.258426966292135e-05, "loss": 0.091, "step": 87 }, { "epoch": 0.06601650412603151, "grad_norm": 0.20541413128376007, "learning_rate": 3.295880149812734e-05, "loss": 0.1184, "step": 88 }, { "epoch": 0.06676669167291822, "grad_norm": 0.1971144676208496, "learning_rate": 3.3333333333333335e-05, "loss": 0.1029, "step": 89 }, { "epoch": 0.06751687921980495, "grad_norm": 0.18307439982891083, "learning_rate": 3.370786516853933e-05, "loss": 0.0993, "step": 90 }, { "epoch": 0.06826706676669167, "grad_norm": 0.1350952833890915, "learning_rate": 3.408239700374532e-05, "loss": 0.0529, "step": 91 }, { "epoch": 0.0690172543135784, "grad_norm": 0.15601865947246552, "learning_rate": 3.445692883895131e-05, "loss": 0.0772, "step": 92 }, { "epoch": 0.06976744186046512, "grad_norm": 0.09120382368564606, "learning_rate": 3.483146067415731e-05, "loss": 0.055, "step": 93 }, { "epoch": 0.07051762940735183, "grad_norm": 0.16563227772712708, "learning_rate": 3.52059925093633e-05, "loss": 0.0651, "step": 94 }, { "epoch": 0.07126781695423856, "grad_norm": 0.19384774565696716, "learning_rate": 3.558052434456929e-05, "loss": 0.0749, "step": 95 }, { "epoch": 0.07201800450112528, "grad_norm": 0.1468099057674408, "learning_rate": 3.595505617977528e-05, "loss": 0.0673, "step": 96 }, { "epoch": 0.072768192048012, "grad_norm": 0.30378851294517517, "learning_rate": 3.6329588014981274e-05, "loss": 0.0638, "step": 97 }, { "epoch": 0.07351837959489872, "grad_norm": 0.2536369562149048, "learning_rate": 3.670411985018727e-05, "loss": 0.0932, "step": 98 }, { "epoch": 0.07426856714178545, "grad_norm": 0.1923932582139969, "learning_rate": 3.7078651685393264e-05, "loss": 0.0732, "step": 99 }, { "epoch": 0.07501875468867217, "grad_norm": 0.20941613614559174, "learning_rate": 3.745318352059925e-05, "loss": 0.0971, "step": 100 }, { "epoch": 0.07576894223555888, "grad_norm": 0.17410755157470703, "learning_rate": 3.782771535580524e-05, "loss": 0.081, "step": 101 }, { "epoch": 0.07651912978244561, "grad_norm": 0.1977808177471161, "learning_rate": 3.8202247191011236e-05, "loss": 0.0933, "step": 102 }, { "epoch": 0.07726931732933233, "grad_norm": 0.25273486971855164, "learning_rate": 3.857677902621723e-05, "loss": 0.0739, "step": 103 }, { "epoch": 0.07801950487621906, "grad_norm": 0.29576393961906433, "learning_rate": 3.8951310861423226e-05, "loss": 0.0938, "step": 104 }, { "epoch": 0.07876969242310577, "grad_norm": 0.1943545937538147, "learning_rate": 3.9325842696629214e-05, "loss": 0.0993, "step": 105 }, { "epoch": 0.0795198799699925, "grad_norm": 0.2052256166934967, "learning_rate": 3.970037453183521e-05, "loss": 0.0803, "step": 106 }, { "epoch": 0.08027006751687922, "grad_norm": 0.2309311181306839, "learning_rate": 4.00749063670412e-05, "loss": 0.0663, "step": 107 }, { "epoch": 0.08102025506376594, "grad_norm": 0.22197790443897247, "learning_rate": 4.044943820224719e-05, "loss": 0.1032, "step": 108 }, { "epoch": 0.08177044261065267, "grad_norm": 0.13494251668453217, "learning_rate": 4.082397003745319e-05, "loss": 0.0557, "step": 109 }, { "epoch": 0.08252063015753938, "grad_norm": 0.2530130445957184, "learning_rate": 4.119850187265918e-05, "loss": 0.0802, "step": 110 }, { "epoch": 0.08327081770442611, "grad_norm": 0.20438532531261444, "learning_rate": 4.157303370786517e-05, "loss": 0.1289, "step": 111 }, { "epoch": 0.08402100525131283, "grad_norm": 0.1981775015592575, "learning_rate": 4.1947565543071165e-05, "loss": 0.0578, "step": 112 }, { "epoch": 0.08477119279819954, "grad_norm": 0.30348509550094604, "learning_rate": 4.232209737827715e-05, "loss": 0.0826, "step": 113 }, { "epoch": 0.08552138034508627, "grad_norm": 0.21688346564769745, "learning_rate": 4.269662921348315e-05, "loss": 0.0693, "step": 114 }, { "epoch": 0.08627156789197299, "grad_norm": 0.21740299463272095, "learning_rate": 4.307116104868914e-05, "loss": 0.0788, "step": 115 }, { "epoch": 0.08702175543885972, "grad_norm": 0.29491832852363586, "learning_rate": 4.344569288389513e-05, "loss": 0.084, "step": 116 }, { "epoch": 0.08777194298574643, "grad_norm": 0.1748553067445755, "learning_rate": 4.3820224719101126e-05, "loss": 0.0671, "step": 117 }, { "epoch": 0.08852213053263316, "grad_norm": 0.19076122343540192, "learning_rate": 4.419475655430712e-05, "loss": 0.0715, "step": 118 }, { "epoch": 0.08927231807951988, "grad_norm": 0.2707376778125763, "learning_rate": 4.456928838951311e-05, "loss": 0.1027, "step": 119 }, { "epoch": 0.0900225056264066, "grad_norm": 0.1251923143863678, "learning_rate": 4.4943820224719104e-05, "loss": 0.0594, "step": 120 }, { "epoch": 0.09077269317329333, "grad_norm": 0.18228654563426971, "learning_rate": 4.531835205992509e-05, "loss": 0.0581, "step": 121 }, { "epoch": 0.09152288072018004, "grad_norm": 0.20524358749389648, "learning_rate": 4.569288389513109e-05, "loss": 0.0627, "step": 122 }, { "epoch": 0.09227306826706677, "grad_norm": 0.10450001060962677, "learning_rate": 4.606741573033708e-05, "loss": 0.0731, "step": 123 }, { "epoch": 0.09302325581395349, "grad_norm": 0.18328773975372314, "learning_rate": 4.644194756554308e-05, "loss": 0.092, "step": 124 }, { "epoch": 0.09377344336084022, "grad_norm": 0.208187073469162, "learning_rate": 4.6816479400749066e-05, "loss": 0.0639, "step": 125 }, { "epoch": 0.09452363090772693, "grad_norm": 0.2710714638233185, "learning_rate": 4.719101123595506e-05, "loss": 0.0841, "step": 126 }, { "epoch": 0.09527381845461365, "grad_norm": 0.2559587359428406, "learning_rate": 4.756554307116105e-05, "loss": 0.0984, "step": 127 }, { "epoch": 0.09602400600150038, "grad_norm": 0.16492657363414764, "learning_rate": 4.7940074906367044e-05, "loss": 0.0491, "step": 128 }, { "epoch": 0.0967741935483871, "grad_norm": 0.15822099149227142, "learning_rate": 4.831460674157304e-05, "loss": 0.0395, "step": 129 }, { "epoch": 0.09752438109527382, "grad_norm": 0.1362650990486145, "learning_rate": 4.8689138576779034e-05, "loss": 0.0516, "step": 130 }, { "epoch": 0.09827456864216054, "grad_norm": 0.28588271141052246, "learning_rate": 4.906367041198502e-05, "loss": 0.1132, "step": 131 }, { "epoch": 0.09902475618904726, "grad_norm": 0.3099716305732727, "learning_rate": 4.943820224719101e-05, "loss": 0.0726, "step": 132 }, { "epoch": 0.09977494373593399, "grad_norm": 0.10133063048124313, "learning_rate": 4.9812734082397005e-05, "loss": 0.0515, "step": 133 }, { "epoch": 0.1005251312828207, "grad_norm": 0.18876691162586212, "learning_rate": 5.0187265917603e-05, "loss": 0.0918, "step": 134 }, { "epoch": 0.10127531882970743, "grad_norm": 0.3820139765739441, "learning_rate": 5.0561797752808995e-05, "loss": 0.1141, "step": 135 }, { "epoch": 0.10202550637659415, "grad_norm": 0.26029330492019653, "learning_rate": 5.093632958801498e-05, "loss": 0.0843, "step": 136 }, { "epoch": 0.10277569392348088, "grad_norm": 0.17507338523864746, "learning_rate": 5.131086142322098e-05, "loss": 0.0433, "step": 137 }, { "epoch": 0.10352588147036759, "grad_norm": 0.2125600278377533, "learning_rate": 5.168539325842697e-05, "loss": 0.0931, "step": 138 }, { "epoch": 0.10427606901725431, "grad_norm": 0.2325267344713211, "learning_rate": 5.205992509363297e-05, "loss": 0.048, "step": 139 }, { "epoch": 0.10502625656414104, "grad_norm": 0.1814715564250946, "learning_rate": 5.243445692883895e-05, "loss": 0.0708, "step": 140 }, { "epoch": 0.10577644411102775, "grad_norm": 0.13349854946136475, "learning_rate": 5.2808988764044944e-05, "loss": 0.0653, "step": 141 }, { "epoch": 0.10652663165791448, "grad_norm": 0.13655948638916016, "learning_rate": 5.318352059925093e-05, "loss": 0.0407, "step": 142 }, { "epoch": 0.1072768192048012, "grad_norm": 0.24276535212993622, "learning_rate": 5.355805243445693e-05, "loss": 0.0837, "step": 143 }, { "epoch": 0.10802700675168792, "grad_norm": 0.20750541985034943, "learning_rate": 5.393258426966292e-05, "loss": 0.0766, "step": 144 }, { "epoch": 0.10877719429857464, "grad_norm": 0.37364712357521057, "learning_rate": 5.430711610486892e-05, "loss": 0.0974, "step": 145 }, { "epoch": 0.10952738184546136, "grad_norm": 0.44260385632514954, "learning_rate": 5.468164794007491e-05, "loss": 0.0679, "step": 146 }, { "epoch": 0.11027756939234809, "grad_norm": 0.21232500672340393, "learning_rate": 5.50561797752809e-05, "loss": 0.0753, "step": 147 }, { "epoch": 0.1110277569392348, "grad_norm": 0.4461332857608795, "learning_rate": 5.5430711610486895e-05, "loss": 0.1543, "step": 148 }, { "epoch": 0.11177794448612154, "grad_norm": 0.26154395937919617, "learning_rate": 5.580524344569289e-05, "loss": 0.1068, "step": 149 }, { "epoch": 0.11252813203300825, "grad_norm": 0.22164535522460938, "learning_rate": 5.6179775280898885e-05, "loss": 0.0687, "step": 150 }, { "epoch": 0.11327831957989497, "grad_norm": 0.23080644011497498, "learning_rate": 5.6554307116104874e-05, "loss": 0.0623, "step": 151 }, { "epoch": 0.1140285071267817, "grad_norm": 0.20958930253982544, "learning_rate": 5.692883895131086e-05, "loss": 0.058, "step": 152 }, { "epoch": 0.11477869467366841, "grad_norm": 0.14259661734104156, "learning_rate": 5.730337078651685e-05, "loss": 0.0608, "step": 153 }, { "epoch": 0.11552888222055514, "grad_norm": 0.14598913490772247, "learning_rate": 5.7677902621722845e-05, "loss": 0.0652, "step": 154 }, { "epoch": 0.11627906976744186, "grad_norm": 0.11540807038545609, "learning_rate": 5.805243445692884e-05, "loss": 0.0338, "step": 155 }, { "epoch": 0.11702925731432859, "grad_norm": 0.1490037888288498, "learning_rate": 5.8426966292134835e-05, "loss": 0.0489, "step": 156 }, { "epoch": 0.1177794448612153, "grad_norm": 0.16232560575008392, "learning_rate": 5.880149812734082e-05, "loss": 0.0514, "step": 157 }, { "epoch": 0.11852963240810202, "grad_norm": 0.22876153886318207, "learning_rate": 5.917602996254682e-05, "loss": 0.0899, "step": 158 }, { "epoch": 0.11927981995498875, "grad_norm": 0.2434694468975067, "learning_rate": 5.955056179775281e-05, "loss": 0.0626, "step": 159 }, { "epoch": 0.12003000750187547, "grad_norm": 0.29315224289894104, "learning_rate": 5.992509363295881e-05, "loss": 0.0916, "step": 160 }, { "epoch": 0.1207801950487622, "grad_norm": 0.16403856873512268, "learning_rate": 6.02996254681648e-05, "loss": 0.0608, "step": 161 }, { "epoch": 0.12153038259564891, "grad_norm": 0.16882139444351196, "learning_rate": 6.067415730337079e-05, "loss": 0.0986, "step": 162 }, { "epoch": 0.12228057014253563, "grad_norm": 0.1665971726179123, "learning_rate": 6.104868913857679e-05, "loss": 0.0773, "step": 163 }, { "epoch": 0.12303075768942236, "grad_norm": 0.16212251782417297, "learning_rate": 6.142322097378277e-05, "loss": 0.0775, "step": 164 }, { "epoch": 0.12378094523630907, "grad_norm": 0.12361068278551102, "learning_rate": 6.179775280898876e-05, "loss": 0.0535, "step": 165 }, { "epoch": 0.1245311327831958, "grad_norm": 0.12108980119228363, "learning_rate": 6.217228464419476e-05, "loss": 0.0474, "step": 166 }, { "epoch": 0.12528132033008252, "grad_norm": 0.12282898277044296, "learning_rate": 6.254681647940075e-05, "loss": 0.0419, "step": 167 }, { "epoch": 0.12603150787696923, "grad_norm": 0.12157752364873886, "learning_rate": 6.292134831460675e-05, "loss": 0.0481, "step": 168 }, { "epoch": 0.12678169542385595, "grad_norm": 0.17761266231536865, "learning_rate": 6.329588014981274e-05, "loss": 0.0621, "step": 169 }, { "epoch": 0.1275318829707427, "grad_norm": 0.19057996571063995, "learning_rate": 6.367041198501872e-05, "loss": 0.0649, "step": 170 }, { "epoch": 0.1282820705176294, "grad_norm": 0.2076871246099472, "learning_rate": 6.404494382022472e-05, "loss": 0.077, "step": 171 }, { "epoch": 0.12903225806451613, "grad_norm": 0.19672785699367523, "learning_rate": 6.441947565543071e-05, "loss": 0.067, "step": 172 }, { "epoch": 0.12978244561140284, "grad_norm": 0.19830764830112457, "learning_rate": 6.479400749063671e-05, "loss": 0.068, "step": 173 }, { "epoch": 0.13053263315828958, "grad_norm": 0.160996675491333, "learning_rate": 6.51685393258427e-05, "loss": 0.0613, "step": 174 }, { "epoch": 0.1312828207051763, "grad_norm": 0.14649710059165955, "learning_rate": 6.55430711610487e-05, "loss": 0.0432, "step": 175 }, { "epoch": 0.13203300825206302, "grad_norm": 0.22120599448680878, "learning_rate": 6.591760299625468e-05, "loss": 0.0517, "step": 176 }, { "epoch": 0.13278319579894973, "grad_norm": 0.17631229758262634, "learning_rate": 6.629213483146067e-05, "loss": 0.0816, "step": 177 }, { "epoch": 0.13353338334583645, "grad_norm": 0.2152678519487381, "learning_rate": 6.666666666666667e-05, "loss": 0.0522, "step": 178 }, { "epoch": 0.1342835708927232, "grad_norm": 0.22123490273952484, "learning_rate": 6.704119850187266e-05, "loss": 0.0771, "step": 179 }, { "epoch": 0.1350337584396099, "grad_norm": 0.37060683965682983, "learning_rate": 6.741573033707866e-05, "loss": 0.0641, "step": 180 }, { "epoch": 0.13578394598649662, "grad_norm": 0.26469045877456665, "learning_rate": 6.779026217228464e-05, "loss": 0.0889, "step": 181 }, { "epoch": 0.13653413353338334, "grad_norm": 0.19371746480464935, "learning_rate": 6.816479400749064e-05, "loss": 0.0506, "step": 182 }, { "epoch": 0.13728432108027006, "grad_norm": 0.1641053408384323, "learning_rate": 6.853932584269663e-05, "loss": 0.0672, "step": 183 }, { "epoch": 0.1380345086271568, "grad_norm": 0.2839615046977997, "learning_rate": 6.891385767790263e-05, "loss": 0.0929, "step": 184 }, { "epoch": 0.13878469617404351, "grad_norm": 0.20816008746623993, "learning_rate": 6.928838951310862e-05, "loss": 0.0597, "step": 185 }, { "epoch": 0.13953488372093023, "grad_norm": 0.1270139068365097, "learning_rate": 6.966292134831462e-05, "loss": 0.0484, "step": 186 }, { "epoch": 0.14028507126781695, "grad_norm": 0.11964840441942215, "learning_rate": 7.003745318352061e-05, "loss": 0.0459, "step": 187 }, { "epoch": 0.14103525881470366, "grad_norm": 0.2520076334476471, "learning_rate": 7.04119850187266e-05, "loss": 0.1065, "step": 188 }, { "epoch": 0.1417854463615904, "grad_norm": 0.23095941543579102, "learning_rate": 7.078651685393259e-05, "loss": 0.0969, "step": 189 }, { "epoch": 0.14253563390847712, "grad_norm": 0.2514033913612366, "learning_rate": 7.116104868913858e-05, "loss": 0.0812, "step": 190 }, { "epoch": 0.14328582145536384, "grad_norm": 0.20976576209068298, "learning_rate": 7.153558052434456e-05, "loss": 0.0905, "step": 191 }, { "epoch": 0.14403600900225055, "grad_norm": 0.18989625573158264, "learning_rate": 7.191011235955056e-05, "loss": 0.062, "step": 192 }, { "epoch": 0.1447861965491373, "grad_norm": 0.1842532753944397, "learning_rate": 7.228464419475655e-05, "loss": 0.0588, "step": 193 }, { "epoch": 0.145536384096024, "grad_norm": 0.2708159387111664, "learning_rate": 7.265917602996255e-05, "loss": 0.1267, "step": 194 }, { "epoch": 0.14628657164291073, "grad_norm": 0.15724222362041473, "learning_rate": 7.303370786516854e-05, "loss": 0.066, "step": 195 }, { "epoch": 0.14703675918979744, "grad_norm": 0.13891983032226562, "learning_rate": 7.340823970037454e-05, "loss": 0.0687, "step": 196 }, { "epoch": 0.14778694673668416, "grad_norm": 0.10517305880784988, "learning_rate": 7.378277153558053e-05, "loss": 0.0509, "step": 197 }, { "epoch": 0.1485371342835709, "grad_norm": 0.26864030957221985, "learning_rate": 7.415730337078653e-05, "loss": 0.0655, "step": 198 }, { "epoch": 0.14928732183045762, "grad_norm": 0.20357665419578552, "learning_rate": 7.453183520599252e-05, "loss": 0.0651, "step": 199 }, { "epoch": 0.15003750937734434, "grad_norm": 0.20443439483642578, "learning_rate": 7.49063670411985e-05, "loss": 0.093, "step": 200 }, { "epoch": 0.15003750937734434, "eval_loss": 0.07821805030107498, "eval_runtime": 2.6636, "eval_samples_per_second": 20.273, "eval_steps_per_second": 5.256, "step": 200 }, { "epoch": 0.15078769692423105, "grad_norm": 0.2003447413444519, "learning_rate": 7.52808988764045e-05, "loss": 0.0846, "step": 201 }, { "epoch": 0.15153788447111777, "grad_norm": 0.148345485329628, "learning_rate": 7.565543071161048e-05, "loss": 0.0649, "step": 202 }, { "epoch": 0.1522880720180045, "grad_norm": 0.11558017134666443, "learning_rate": 7.602996254681648e-05, "loss": 0.0565, "step": 203 }, { "epoch": 0.15303825956489123, "grad_norm": 0.197371244430542, "learning_rate": 7.640449438202247e-05, "loss": 0.0704, "step": 204 }, { "epoch": 0.15378844711177794, "grad_norm": 0.1327037215232849, "learning_rate": 7.677902621722847e-05, "loss": 0.049, "step": 205 }, { "epoch": 0.15453863465866466, "grad_norm": 0.1790093332529068, "learning_rate": 7.715355805243446e-05, "loss": 0.0799, "step": 206 }, { "epoch": 0.15528882220555137, "grad_norm": 0.13199618458747864, "learning_rate": 7.752808988764046e-05, "loss": 0.0499, "step": 207 }, { "epoch": 0.15603900975243812, "grad_norm": 0.2246326208114624, "learning_rate": 7.790262172284645e-05, "loss": 0.0972, "step": 208 }, { "epoch": 0.15678919729932483, "grad_norm": 0.13354603946208954, "learning_rate": 7.827715355805245e-05, "loss": 0.0718, "step": 209 }, { "epoch": 0.15753938484621155, "grad_norm": 0.12375898659229279, "learning_rate": 7.865168539325843e-05, "loss": 0.0659, "step": 210 }, { "epoch": 0.15828957239309827, "grad_norm": 0.2637913227081299, "learning_rate": 7.902621722846442e-05, "loss": 0.0918, "step": 211 }, { "epoch": 0.159039759939985, "grad_norm": 0.23306505382061005, "learning_rate": 7.940074906367042e-05, "loss": 0.0789, "step": 212 }, { "epoch": 0.15978994748687173, "grad_norm": 0.1299886852502823, "learning_rate": 7.97752808988764e-05, "loss": 0.0605, "step": 213 }, { "epoch": 0.16054013503375844, "grad_norm": 0.21600346267223358, "learning_rate": 8.01498127340824e-05, "loss": 0.0624, "step": 214 }, { "epoch": 0.16129032258064516, "grad_norm": 0.16830679774284363, "learning_rate": 8.052434456928839e-05, "loss": 0.0756, "step": 215 }, { "epoch": 0.16204051012753187, "grad_norm": 0.4400419294834137, "learning_rate": 8.089887640449438e-05, "loss": 0.1069, "step": 216 }, { "epoch": 0.16279069767441862, "grad_norm": 0.15384125709533691, "learning_rate": 8.127340823970038e-05, "loss": 0.0756, "step": 217 }, { "epoch": 0.16354088522130533, "grad_norm": 0.1943519562482834, "learning_rate": 8.164794007490637e-05, "loss": 0.0847, "step": 218 }, { "epoch": 0.16429107276819205, "grad_norm": 0.17785924673080444, "learning_rate": 8.202247191011237e-05, "loss": 0.0641, "step": 219 }, { "epoch": 0.16504126031507876, "grad_norm": 0.342724472284317, "learning_rate": 8.239700374531836e-05, "loss": 0.1031, "step": 220 }, { "epoch": 0.16579144786196548, "grad_norm": 0.18801873922348022, "learning_rate": 8.277153558052434e-05, "loss": 0.0822, "step": 221 }, { "epoch": 0.16654163540885222, "grad_norm": 0.14902432262897491, "learning_rate": 8.314606741573034e-05, "loss": 0.0596, "step": 222 }, { "epoch": 0.16729182295573894, "grad_norm": 0.12913598120212555, "learning_rate": 8.352059925093633e-05, "loss": 0.069, "step": 223 }, { "epoch": 0.16804201050262565, "grad_norm": 0.13187310099601746, "learning_rate": 8.389513108614233e-05, "loss": 0.0647, "step": 224 }, { "epoch": 0.16879219804951237, "grad_norm": 0.15238381922245026, "learning_rate": 8.426966292134831e-05, "loss": 0.0566, "step": 225 }, { "epoch": 0.1695423855963991, "grad_norm": 0.32936590909957886, "learning_rate": 8.46441947565543e-05, "loss": 0.0842, "step": 226 }, { "epoch": 0.17029257314328583, "grad_norm": 0.16082966327667236, "learning_rate": 8.50187265917603e-05, "loss": 0.0703, "step": 227 }, { "epoch": 0.17104276069017255, "grad_norm": 0.15576517581939697, "learning_rate": 8.53932584269663e-05, "loss": 0.0776, "step": 228 }, { "epoch": 0.17179294823705926, "grad_norm": 0.14493928849697113, "learning_rate": 8.576779026217229e-05, "loss": 0.0522, "step": 229 }, { "epoch": 0.17254313578394598, "grad_norm": 0.14679628610610962, "learning_rate": 8.614232209737829e-05, "loss": 0.0743, "step": 230 }, { "epoch": 0.17329332333083272, "grad_norm": 0.19002974033355713, "learning_rate": 8.651685393258427e-05, "loss": 0.0923, "step": 231 }, { "epoch": 0.17404351087771944, "grad_norm": 0.284746915102005, "learning_rate": 8.689138576779026e-05, "loss": 0.1601, "step": 232 }, { "epoch": 0.17479369842460615, "grad_norm": 0.299917995929718, "learning_rate": 8.726591760299626e-05, "loss": 0.0805, "step": 233 }, { "epoch": 0.17554388597149287, "grad_norm": 0.18388903141021729, "learning_rate": 8.764044943820225e-05, "loss": 0.0675, "step": 234 }, { "epoch": 0.17629407351837958, "grad_norm": 0.18255142867565155, "learning_rate": 8.801498127340825e-05, "loss": 0.0948, "step": 235 }, { "epoch": 0.17704426106526633, "grad_norm": 0.14342963695526123, "learning_rate": 8.838951310861424e-05, "loss": 0.0717, "step": 236 }, { "epoch": 0.17779444861215304, "grad_norm": 0.22409477829933167, "learning_rate": 8.876404494382022e-05, "loss": 0.1043, "step": 237 }, { "epoch": 0.17854463615903976, "grad_norm": 0.13333116471767426, "learning_rate": 8.913857677902622e-05, "loss": 0.0661, "step": 238 }, { "epoch": 0.17929482370592648, "grad_norm": 0.18588890135288239, "learning_rate": 8.951310861423221e-05, "loss": 0.0722, "step": 239 }, { "epoch": 0.1800450112528132, "grad_norm": 0.13051137328147888, "learning_rate": 8.988764044943821e-05, "loss": 0.0554, "step": 240 }, { "epoch": 0.18079519879969994, "grad_norm": 0.18558816611766815, "learning_rate": 9.02621722846442e-05, "loss": 0.0782, "step": 241 }, { "epoch": 0.18154538634658665, "grad_norm": 0.16871553659439087, "learning_rate": 9.063670411985018e-05, "loss": 0.0699, "step": 242 }, { "epoch": 0.18229557389347337, "grad_norm": 0.19408437609672546, "learning_rate": 9.101123595505618e-05, "loss": 0.0937, "step": 243 }, { "epoch": 0.18304576144036008, "grad_norm": 0.19955557584762573, "learning_rate": 9.138576779026217e-05, "loss": 0.0844, "step": 244 }, { "epoch": 0.1837959489872468, "grad_norm": 0.1998845785856247, "learning_rate": 9.176029962546817e-05, "loss": 0.0749, "step": 245 }, { "epoch": 0.18454613653413354, "grad_norm": 0.1685137152671814, "learning_rate": 9.213483146067416e-05, "loss": 0.0875, "step": 246 }, { "epoch": 0.18529632408102026, "grad_norm": 0.13906221091747284, "learning_rate": 9.250936329588016e-05, "loss": 0.0604, "step": 247 }, { "epoch": 0.18604651162790697, "grad_norm": 0.11521290242671967, "learning_rate": 9.288389513108615e-05, "loss": 0.0438, "step": 248 }, { "epoch": 0.1867966991747937, "grad_norm": 0.15000270307064056, "learning_rate": 9.325842696629214e-05, "loss": 0.046, "step": 249 }, { "epoch": 0.18754688672168043, "grad_norm": 0.10028818994760513, "learning_rate": 9.363295880149813e-05, "loss": 0.0526, "step": 250 }, { "epoch": 0.18829707426856715, "grad_norm": 0.15884524583816528, "learning_rate": 9.400749063670413e-05, "loss": 0.0537, "step": 251 }, { "epoch": 0.18904726181545387, "grad_norm": 0.1237359270453453, "learning_rate": 9.438202247191012e-05, "loss": 0.0507, "step": 252 }, { "epoch": 0.18979744936234058, "grad_norm": 0.1722472906112671, "learning_rate": 9.47565543071161e-05, "loss": 0.0709, "step": 253 }, { "epoch": 0.1905476369092273, "grad_norm": 0.1201101616024971, "learning_rate": 9.51310861423221e-05, "loss": 0.0475, "step": 254 }, { "epoch": 0.19129782445611404, "grad_norm": 0.1595461666584015, "learning_rate": 9.550561797752809e-05, "loss": 0.0815, "step": 255 }, { "epoch": 0.19204801200300076, "grad_norm": 0.16221360862255096, "learning_rate": 9.588014981273409e-05, "loss": 0.0715, "step": 256 }, { "epoch": 0.19279819954988747, "grad_norm": 0.1589856594800949, "learning_rate": 9.625468164794008e-05, "loss": 0.0585, "step": 257 }, { "epoch": 0.1935483870967742, "grad_norm": 0.2617347538471222, "learning_rate": 9.662921348314608e-05, "loss": 0.0698, "step": 258 }, { "epoch": 0.1942985746436609, "grad_norm": 0.15338610112667084, "learning_rate": 9.700374531835207e-05, "loss": 0.0502, "step": 259 }, { "epoch": 0.19504876219054765, "grad_norm": 0.18816447257995605, "learning_rate": 9.737827715355807e-05, "loss": 0.0843, "step": 260 }, { "epoch": 0.19579894973743436, "grad_norm": 0.1613040417432785, "learning_rate": 9.775280898876405e-05, "loss": 0.0639, "step": 261 }, { "epoch": 0.19654913728432108, "grad_norm": 0.18618257343769073, "learning_rate": 9.812734082397004e-05, "loss": 0.0842, "step": 262 }, { "epoch": 0.1972993248312078, "grad_norm": 0.17338407039642334, "learning_rate": 9.850187265917602e-05, "loss": 0.0565, "step": 263 }, { "epoch": 0.1980495123780945, "grad_norm": 0.4943162798881531, "learning_rate": 9.887640449438202e-05, "loss": 0.1323, "step": 264 }, { "epoch": 0.19879969992498125, "grad_norm": 0.18759749829769135, "learning_rate": 9.925093632958801e-05, "loss": 0.1042, "step": 265 }, { "epoch": 0.19954988747186797, "grad_norm": 0.12457593530416489, "learning_rate": 9.962546816479401e-05, "loss": 0.0541, "step": 266 }, { "epoch": 0.2003000750187547, "grad_norm": 0.17804086208343506, "learning_rate": 0.0001, "loss": 0.071, "step": 267 }, { "epoch": 0.2010502625656414, "grad_norm": 0.10970258712768555, "learning_rate": 9.99999571274618e-05, "loss": 0.0418, "step": 268 }, { "epoch": 0.20180045011252815, "grad_norm": 0.11806442588567734, "learning_rate": 9.999982850992069e-05, "loss": 0.052, "step": 269 }, { "epoch": 0.20255063765941486, "grad_norm": 0.1669778674840927, "learning_rate": 9.999961414759727e-05, "loss": 0.0855, "step": 270 }, { "epoch": 0.20330082520630158, "grad_norm": 0.10167306661605835, "learning_rate": 9.999931404085912e-05, "loss": 0.0363, "step": 271 }, { "epoch": 0.2040510127531883, "grad_norm": 0.15393097698688507, "learning_rate": 9.999892819022092e-05, "loss": 0.0794, "step": 272 }, { "epoch": 0.204801200300075, "grad_norm": 0.12911133468151093, "learning_rate": 9.999845659634435e-05, "loss": 0.0613, "step": 273 }, { "epoch": 0.20555138784696175, "grad_norm": 0.17678648233413696, "learning_rate": 9.999789926003814e-05, "loss": 0.075, "step": 274 }, { "epoch": 0.20630157539384847, "grad_norm": 0.09235060214996338, "learning_rate": 9.999725618225808e-05, "loss": 0.0682, "step": 275 }, { "epoch": 0.20705176294073518, "grad_norm": 0.14196725189685822, "learning_rate": 9.999652736410698e-05, "loss": 0.0545, "step": 276 }, { "epoch": 0.2078019504876219, "grad_norm": 0.15323829650878906, "learning_rate": 9.999571280683468e-05, "loss": 0.0685, "step": 277 }, { "epoch": 0.20855213803450862, "grad_norm": 0.15211105346679688, "learning_rate": 9.99948125118381e-05, "loss": 0.074, "step": 278 }, { "epoch": 0.20930232558139536, "grad_norm": 0.12618400156497955, "learning_rate": 9.999382648066113e-05, "loss": 0.0621, "step": 279 }, { "epoch": 0.21005251312828208, "grad_norm": 0.16087733209133148, "learning_rate": 9.999275471499472e-05, "loss": 0.0999, "step": 280 }, { "epoch": 0.2108027006751688, "grad_norm": 0.1869620382785797, "learning_rate": 9.999159721667685e-05, "loss": 0.0839, "step": 281 }, { "epoch": 0.2115528882220555, "grad_norm": 0.15936337411403656, "learning_rate": 9.999035398769252e-05, "loss": 0.07, "step": 282 }, { "epoch": 0.21230307576894222, "grad_norm": 0.19384759664535522, "learning_rate": 9.998902503017372e-05, "loss": 0.1308, "step": 283 }, { "epoch": 0.21305326331582897, "grad_norm": 0.29375916719436646, "learning_rate": 9.99876103463995e-05, "loss": 0.0701, "step": 284 }, { "epoch": 0.21380345086271568, "grad_norm": 0.21341568231582642, "learning_rate": 9.998610993879589e-05, "loss": 0.0474, "step": 285 }, { "epoch": 0.2145536384096024, "grad_norm": 0.25241464376449585, "learning_rate": 9.998452380993597e-05, "loss": 0.068, "step": 286 }, { "epoch": 0.21530382595648911, "grad_norm": 0.18199339509010315, "learning_rate": 9.998285196253977e-05, "loss": 0.0693, "step": 287 }, { "epoch": 0.21605401350337583, "grad_norm": 0.11006759852170944, "learning_rate": 9.998109439947434e-05, "loss": 0.0392, "step": 288 }, { "epoch": 0.21680420105026257, "grad_norm": 0.132887065410614, "learning_rate": 9.997925112375375e-05, "loss": 0.0552, "step": 289 }, { "epoch": 0.2175543885971493, "grad_norm": 0.21448518335819244, "learning_rate": 9.997732213853902e-05, "loss": 0.0984, "step": 290 }, { "epoch": 0.218304576144036, "grad_norm": 0.18582187592983246, "learning_rate": 9.997530744713817e-05, "loss": 0.0898, "step": 291 }, { "epoch": 0.21905476369092272, "grad_norm": 0.2004450112581253, "learning_rate": 9.997320705300621e-05, "loss": 0.1014, "step": 292 }, { "epoch": 0.21980495123780946, "grad_norm": 0.15326693654060364, "learning_rate": 9.997102095974508e-05, "loss": 0.048, "step": 293 }, { "epoch": 0.22055513878469618, "grad_norm": 0.13675133883953094, "learning_rate": 9.996874917110378e-05, "loss": 0.0534, "step": 294 }, { "epoch": 0.2213053263315829, "grad_norm": 0.1403188854455948, "learning_rate": 9.996639169097811e-05, "loss": 0.0563, "step": 295 }, { "epoch": 0.2220555138784696, "grad_norm": 0.263437956571579, "learning_rate": 9.996394852341098e-05, "loss": 0.0865, "step": 296 }, { "epoch": 0.22280570142535633, "grad_norm": 0.17431458830833435, "learning_rate": 9.996141967259218e-05, "loss": 0.0582, "step": 297 }, { "epoch": 0.22355588897224307, "grad_norm": 0.19193384051322937, "learning_rate": 9.995880514285841e-05, "loss": 0.076, "step": 298 }, { "epoch": 0.2243060765191298, "grad_norm": 0.12529249489307404, "learning_rate": 9.995610493869336e-05, "loss": 0.074, "step": 299 }, { "epoch": 0.2250562640660165, "grad_norm": 0.20957054197788239, "learning_rate": 9.99533190647276e-05, "loss": 0.0807, "step": 300 }, { "epoch": 0.22580645161290322, "grad_norm": 0.14158689975738525, "learning_rate": 9.995044752573864e-05, "loss": 0.0549, "step": 301 }, { "epoch": 0.22655663915978994, "grad_norm": 0.09992564469575882, "learning_rate": 9.994749032665085e-05, "loss": 0.0511, "step": 302 }, { "epoch": 0.22730682670667668, "grad_norm": 0.21247920393943787, "learning_rate": 9.994444747253559e-05, "loss": 0.0645, "step": 303 }, { "epoch": 0.2280570142535634, "grad_norm": 0.10664507746696472, "learning_rate": 9.9941318968611e-05, "loss": 0.0357, "step": 304 }, { "epoch": 0.2288072018004501, "grad_norm": 0.19751517474651337, "learning_rate": 9.993810482024221e-05, "loss": 0.0761, "step": 305 }, { "epoch": 0.22955738934733683, "grad_norm": 0.13949774205684662, "learning_rate": 9.993480503294114e-05, "loss": 0.059, "step": 306 }, { "epoch": 0.23030757689422354, "grad_norm": 0.1572161316871643, "learning_rate": 9.993141961236661e-05, "loss": 0.0523, "step": 307 }, { "epoch": 0.23105776444111029, "grad_norm": 0.1472707837820053, "learning_rate": 9.992794856432426e-05, "loss": 0.0559, "step": 308 }, { "epoch": 0.231807951987997, "grad_norm": 0.2457486093044281, "learning_rate": 9.992439189476661e-05, "loss": 0.0948, "step": 309 }, { "epoch": 0.23255813953488372, "grad_norm": 0.12344438582658768, "learning_rate": 9.992074960979301e-05, "loss": 0.0447, "step": 310 }, { "epoch": 0.23330832708177043, "grad_norm": 0.17082221806049347, "learning_rate": 9.991702171564961e-05, "loss": 0.086, "step": 311 }, { "epoch": 0.23405851462865718, "grad_norm": 0.1637532263994217, "learning_rate": 9.991320821872939e-05, "loss": 0.0732, "step": 312 }, { "epoch": 0.2348087021755439, "grad_norm": 0.18615975975990295, "learning_rate": 9.990930912557209e-05, "loss": 0.075, "step": 313 }, { "epoch": 0.2355588897224306, "grad_norm": 0.10900649428367615, "learning_rate": 9.990532444286431e-05, "loss": 0.045, "step": 314 }, { "epoch": 0.23630907726931732, "grad_norm": 0.1474543958902359, "learning_rate": 9.990125417743937e-05, "loss": 0.0548, "step": 315 }, { "epoch": 0.23705926481620404, "grad_norm": 0.15200990438461304, "learning_rate": 9.989709833627736e-05, "loss": 0.0674, "step": 316 }, { "epoch": 0.23780945236309078, "grad_norm": 0.2598418891429901, "learning_rate": 9.989285692650518e-05, "loss": 0.1139, "step": 317 }, { "epoch": 0.2385596399099775, "grad_norm": 0.21949024498462677, "learning_rate": 9.98885299553964e-05, "loss": 0.0969, "step": 318 }, { "epoch": 0.23930982745686422, "grad_norm": 0.11348603665828705, "learning_rate": 9.988411743037134e-05, "loss": 0.0454, "step": 319 }, { "epoch": 0.24006001500375093, "grad_norm": 0.18390147387981415, "learning_rate": 9.987961935899706e-05, "loss": 0.0623, "step": 320 }, { "epoch": 0.24081020255063765, "grad_norm": 0.25778982043266296, "learning_rate": 9.987503574898731e-05, "loss": 0.0796, "step": 321 }, { "epoch": 0.2415603900975244, "grad_norm": 0.1339096575975418, "learning_rate": 9.987036660820255e-05, "loss": 0.0488, "step": 322 }, { "epoch": 0.2423105776444111, "grad_norm": 0.18395960330963135, "learning_rate": 9.986561194464985e-05, "loss": 0.0802, "step": 323 }, { "epoch": 0.24306076519129782, "grad_norm": 0.13696521520614624, "learning_rate": 9.986077176648303e-05, "loss": 0.0595, "step": 324 }, { "epoch": 0.24381095273818454, "grad_norm": 0.1406623274087906, "learning_rate": 9.985584608200251e-05, "loss": 0.0619, "step": 325 }, { "epoch": 0.24456114028507125, "grad_norm": 0.09625441581010818, "learning_rate": 9.985083489965534e-05, "loss": 0.0424, "step": 326 }, { "epoch": 0.245311327831958, "grad_norm": 0.12339973449707031, "learning_rate": 9.984573822803521e-05, "loss": 0.045, "step": 327 }, { "epoch": 0.24606151537884471, "grad_norm": 0.15541982650756836, "learning_rate": 9.984055607588242e-05, "loss": 0.0655, "step": 328 }, { "epoch": 0.24681170292573143, "grad_norm": 0.13536489009857178, "learning_rate": 9.983528845208384e-05, "loss": 0.0514, "step": 329 }, { "epoch": 0.24756189047261815, "grad_norm": 0.18628405034542084, "learning_rate": 9.982993536567293e-05, "loss": 0.0793, "step": 330 }, { "epoch": 0.2483120780195049, "grad_norm": 0.10619158297777176, "learning_rate": 9.98244968258297e-05, "loss": 0.0379, "step": 331 }, { "epoch": 0.2490622655663916, "grad_norm": 0.21251732110977173, "learning_rate": 9.981897284188073e-05, "loss": 0.1043, "step": 332 }, { "epoch": 0.24981245311327832, "grad_norm": 0.18133696913719177, "learning_rate": 9.981336342329909e-05, "loss": 0.0441, "step": 333 }, { "epoch": 0.25056264066016504, "grad_norm": 0.15581296384334564, "learning_rate": 9.980766857970438e-05, "loss": 0.0647, "step": 334 }, { "epoch": 0.25131282820705175, "grad_norm": 0.24356499314308167, "learning_rate": 9.98018883208627e-05, "loss": 0.073, "step": 335 }, { "epoch": 0.25206301575393847, "grad_norm": 0.10127891600131989, "learning_rate": 9.979602265668664e-05, "loss": 0.0461, "step": 336 }, { "epoch": 0.2528132033008252, "grad_norm": 0.13798540830612183, "learning_rate": 9.979007159723521e-05, "loss": 0.0595, "step": 337 }, { "epoch": 0.2535633908477119, "grad_norm": 0.13023167848587036, "learning_rate": 9.97840351527139e-05, "loss": 0.0604, "step": 338 }, { "epoch": 0.25431357839459867, "grad_norm": 0.12885215878486633, "learning_rate": 9.977791333347462e-05, "loss": 0.0564, "step": 339 }, { "epoch": 0.2550637659414854, "grad_norm": 0.23486711084842682, "learning_rate": 9.97717061500157e-05, "loss": 0.0684, "step": 340 }, { "epoch": 0.2558139534883721, "grad_norm": 0.09653028845787048, "learning_rate": 9.976541361298184e-05, "loss": 0.0477, "step": 341 }, { "epoch": 0.2565641410352588, "grad_norm": 0.18098174035549164, "learning_rate": 9.97590357331641e-05, "loss": 0.0573, "step": 342 }, { "epoch": 0.25731432858214554, "grad_norm": 0.1684122532606125, "learning_rate": 9.975257252149994e-05, "loss": 0.0499, "step": 343 }, { "epoch": 0.25806451612903225, "grad_norm": 0.2163161337375641, "learning_rate": 9.974602398907313e-05, "loss": 0.0719, "step": 344 }, { "epoch": 0.25881470367591897, "grad_norm": 0.1903722733259201, "learning_rate": 9.973939014711375e-05, "loss": 0.0758, "step": 345 }, { "epoch": 0.2595648912228057, "grad_norm": 0.2286502569913864, "learning_rate": 9.973267100699819e-05, "loss": 0.0647, "step": 346 }, { "epoch": 0.2603150787696924, "grad_norm": 0.20068782567977905, "learning_rate": 9.972586658024911e-05, "loss": 0.0849, "step": 347 }, { "epoch": 0.26106526631657917, "grad_norm": 0.17922401428222656, "learning_rate": 9.971897687853544e-05, "loss": 0.0526, "step": 348 }, { "epoch": 0.2618154538634659, "grad_norm": 0.301491379737854, "learning_rate": 9.971200191367234e-05, "loss": 0.0765, "step": 349 }, { "epoch": 0.2625656414103526, "grad_norm": 0.21195389330387115, "learning_rate": 9.970494169762117e-05, "loss": 0.0759, "step": 350 }, { "epoch": 0.2633158289572393, "grad_norm": 0.28137511014938354, "learning_rate": 9.969779624248954e-05, "loss": 0.0747, "step": 351 }, { "epoch": 0.26406601650412603, "grad_norm": 0.22566178441047668, "learning_rate": 9.969056556053116e-05, "loss": 0.1097, "step": 352 }, { "epoch": 0.26481620405101275, "grad_norm": 0.16110064089298248, "learning_rate": 9.968324966414597e-05, "loss": 0.0914, "step": 353 }, { "epoch": 0.26556639159789946, "grad_norm": 0.2024664431810379, "learning_rate": 9.967584856588e-05, "loss": 0.1056, "step": 354 }, { "epoch": 0.2663165791447862, "grad_norm": 0.21437528729438782, "learning_rate": 9.966836227842538e-05, "loss": 0.0681, "step": 355 }, { "epoch": 0.2670667666916729, "grad_norm": 0.14517563581466675, "learning_rate": 9.96607908146204e-05, "loss": 0.0538, "step": 356 }, { "epoch": 0.2678169542385596, "grad_norm": 0.1344057023525238, "learning_rate": 9.965313418744935e-05, "loss": 0.0672, "step": 357 }, { "epoch": 0.2685671417854464, "grad_norm": 0.08291507512331009, "learning_rate": 9.964539241004261e-05, "loss": 0.0373, "step": 358 }, { "epoch": 0.2693173293323331, "grad_norm": 0.3290295898914337, "learning_rate": 9.963756549567654e-05, "loss": 0.0939, "step": 359 }, { "epoch": 0.2700675168792198, "grad_norm": 0.20721577107906342, "learning_rate": 9.962965345777353e-05, "loss": 0.083, "step": 360 }, { "epoch": 0.27081770442610653, "grad_norm": 0.15423627197742462, "learning_rate": 9.962165630990196e-05, "loss": 0.0844, "step": 361 }, { "epoch": 0.27156789197299325, "grad_norm": 0.1754678636789322, "learning_rate": 9.961357406577617e-05, "loss": 0.0747, "step": 362 }, { "epoch": 0.27231807951987996, "grad_norm": 0.10178562998771667, "learning_rate": 9.960540673925636e-05, "loss": 0.0446, "step": 363 }, { "epoch": 0.2730682670667667, "grad_norm": 0.17141780257225037, "learning_rate": 9.959715434434873e-05, "loss": 0.0759, "step": 364 }, { "epoch": 0.2738184546136534, "grad_norm": 0.16186627745628357, "learning_rate": 9.958881689520531e-05, "loss": 0.0611, "step": 365 }, { "epoch": 0.2745686421605401, "grad_norm": 0.15801844000816345, "learning_rate": 9.958039440612402e-05, "loss": 0.0882, "step": 366 }, { "epoch": 0.2753188297074269, "grad_norm": 0.18964071571826935, "learning_rate": 9.957188689154859e-05, "loss": 0.063, "step": 367 }, { "epoch": 0.2760690172543136, "grad_norm": 0.13521847128868103, "learning_rate": 9.956329436606857e-05, "loss": 0.0784, "step": 368 }, { "epoch": 0.2768192048012003, "grad_norm": 0.1345323920249939, "learning_rate": 9.955461684441928e-05, "loss": 0.0655, "step": 369 }, { "epoch": 0.27756939234808703, "grad_norm": 0.2111639380455017, "learning_rate": 9.954585434148183e-05, "loss": 0.0848, "step": 370 }, { "epoch": 0.27831957989497375, "grad_norm": 0.14540591835975647, "learning_rate": 9.953700687228306e-05, "loss": 0.0664, "step": 371 }, { "epoch": 0.27906976744186046, "grad_norm": 0.13060738146305084, "learning_rate": 9.952807445199549e-05, "loss": 0.0599, "step": 372 }, { "epoch": 0.2798199549887472, "grad_norm": 0.12008094042539597, "learning_rate": 9.951905709593735e-05, "loss": 0.0573, "step": 373 }, { "epoch": 0.2805701425356339, "grad_norm": 0.1303444355726242, "learning_rate": 9.950995481957251e-05, "loss": 0.0783, "step": 374 }, { "epoch": 0.2813203300825206, "grad_norm": 0.12107301503419876, "learning_rate": 9.950076763851049e-05, "loss": 0.0585, "step": 375 }, { "epoch": 0.2820705176294073, "grad_norm": 0.11897249519824982, "learning_rate": 9.949149556850638e-05, "loss": 0.0649, "step": 376 }, { "epoch": 0.2828207051762941, "grad_norm": 0.16101312637329102, "learning_rate": 9.94821386254609e-05, "loss": 0.0831, "step": 377 }, { "epoch": 0.2835708927231808, "grad_norm": 0.16048072278499603, "learning_rate": 9.947269682542027e-05, "loss": 0.067, "step": 378 }, { "epoch": 0.2843210802700675, "grad_norm": 0.12512798607349396, "learning_rate": 9.946317018457622e-05, "loss": 0.0535, "step": 379 }, { "epoch": 0.28507126781695424, "grad_norm": 0.12598682940006256, "learning_rate": 9.945355871926605e-05, "loss": 0.0722, "step": 380 }, { "epoch": 0.28582145536384096, "grad_norm": 0.2111731618642807, "learning_rate": 9.944386244597244e-05, "loss": 0.0828, "step": 381 }, { "epoch": 0.2865716429107277, "grad_norm": 0.12089638411998749, "learning_rate": 9.943408138132357e-05, "loss": 0.0621, "step": 382 }, { "epoch": 0.2873218304576144, "grad_norm": 0.0753532275557518, "learning_rate": 9.942421554209297e-05, "loss": 0.0561, "step": 383 }, { "epoch": 0.2880720180045011, "grad_norm": 0.1740962713956833, "learning_rate": 9.94142649451996e-05, "loss": 0.0756, "step": 384 }, { "epoch": 0.2888222055513878, "grad_norm": 0.1652483493089676, "learning_rate": 9.940422960770776e-05, "loss": 0.0824, "step": 385 }, { "epoch": 0.2895723930982746, "grad_norm": 0.14878934621810913, "learning_rate": 9.939410954682706e-05, "loss": 0.0437, "step": 386 }, { "epoch": 0.2903225806451613, "grad_norm": 0.12823975086212158, "learning_rate": 9.938390477991242e-05, "loss": 0.0616, "step": 387 }, { "epoch": 0.291072768192048, "grad_norm": 0.11716058105230331, "learning_rate": 9.937361532446399e-05, "loss": 0.0538, "step": 388 }, { "epoch": 0.29182295573893474, "grad_norm": 0.1182708591222763, "learning_rate": 9.936324119812719e-05, "loss": 0.0578, "step": 389 }, { "epoch": 0.29257314328582146, "grad_norm": 0.16185607016086578, "learning_rate": 9.93527824186926e-05, "loss": 0.0629, "step": 390 }, { "epoch": 0.2933233308327082, "grad_norm": 0.16407489776611328, "learning_rate": 9.934223900409603e-05, "loss": 0.0475, "step": 391 }, { "epoch": 0.2940735183795949, "grad_norm": 0.1613144725561142, "learning_rate": 9.933161097241837e-05, "loss": 0.0459, "step": 392 }, { "epoch": 0.2948237059264816, "grad_norm": 0.23988468945026398, "learning_rate": 9.932089834188567e-05, "loss": 0.0712, "step": 393 }, { "epoch": 0.2955738934733683, "grad_norm": 0.15247158706188202, "learning_rate": 9.931010113086902e-05, "loss": 0.0633, "step": 394 }, { "epoch": 0.29632408102025504, "grad_norm": 0.19023269414901733, "learning_rate": 9.929921935788457e-05, "loss": 0.0715, "step": 395 }, { "epoch": 0.2970742685671418, "grad_norm": 0.15984871983528137, "learning_rate": 9.928825304159351e-05, "loss": 0.0639, "step": 396 }, { "epoch": 0.2978244561140285, "grad_norm": 0.1645509898662567, "learning_rate": 9.927720220080199e-05, "loss": 0.0586, "step": 397 }, { "epoch": 0.29857464366091524, "grad_norm": 0.1377629190683365, "learning_rate": 9.926606685446109e-05, "loss": 0.0764, "step": 398 }, { "epoch": 0.29932483120780196, "grad_norm": 0.11947868764400482, "learning_rate": 9.925484702166686e-05, "loss": 0.0512, "step": 399 }, { "epoch": 0.30007501875468867, "grad_norm": 0.10650676488876343, "learning_rate": 9.924354272166017e-05, "loss": 0.0361, "step": 400 }, { "epoch": 0.30007501875468867, "eval_loss": 0.07004823535680771, "eval_runtime": 2.655, "eval_samples_per_second": 20.339, "eval_steps_per_second": 5.273, "step": 400 }, { "epoch": 0.3008252063015754, "grad_norm": 0.13129869103431702, "learning_rate": 9.923215397382684e-05, "loss": 0.0503, "step": 401 }, { "epoch": 0.3015753938484621, "grad_norm": 0.11299598217010498, "learning_rate": 9.92206807976974e-05, "loss": 0.0361, "step": 402 }, { "epoch": 0.3023255813953488, "grad_norm": 0.21530146896839142, "learning_rate": 9.920912321294723e-05, "loss": 0.0583, "step": 403 }, { "epoch": 0.30307576894223553, "grad_norm": 0.10559721291065216, "learning_rate": 9.919748123939647e-05, "loss": 0.0494, "step": 404 }, { "epoch": 0.3038259564891223, "grad_norm": 0.22442474961280823, "learning_rate": 9.918575489700993e-05, "loss": 0.0808, "step": 405 }, { "epoch": 0.304576144036009, "grad_norm": 0.17437469959259033, "learning_rate": 9.917394420589716e-05, "loss": 0.0621, "step": 406 }, { "epoch": 0.30532633158289574, "grad_norm": 0.12458070367574692, "learning_rate": 9.916204918631231e-05, "loss": 0.0628, "step": 407 }, { "epoch": 0.30607651912978245, "grad_norm": 0.1025652214884758, "learning_rate": 9.915006985865416e-05, "loss": 0.0334, "step": 408 }, { "epoch": 0.30682670667666917, "grad_norm": 0.16310417652130127, "learning_rate": 9.913800624346612e-05, "loss": 0.0767, "step": 409 }, { "epoch": 0.3075768942235559, "grad_norm": 0.21571268141269684, "learning_rate": 9.912585836143606e-05, "loss": 0.0859, "step": 410 }, { "epoch": 0.3083270817704426, "grad_norm": 0.21639621257781982, "learning_rate": 9.911362623339642e-05, "loss": 0.0706, "step": 411 }, { "epoch": 0.3090772693173293, "grad_norm": 0.1594085842370987, "learning_rate": 9.91013098803241e-05, "loss": 0.0711, "step": 412 }, { "epoch": 0.30982745686421603, "grad_norm": 0.15109415352344513, "learning_rate": 9.908890932334042e-05, "loss": 0.0635, "step": 413 }, { "epoch": 0.31057764441110275, "grad_norm": 0.14152328670024872, "learning_rate": 9.907642458371111e-05, "loss": 0.0692, "step": 414 }, { "epoch": 0.3113278319579895, "grad_norm": 0.17827863991260529, "learning_rate": 9.906385568284629e-05, "loss": 0.0763, "step": 415 }, { "epoch": 0.31207801950487624, "grad_norm": 0.16184374690055847, "learning_rate": 9.905120264230036e-05, "loss": 0.0665, "step": 416 }, { "epoch": 0.31282820705176295, "grad_norm": 0.12294892221689224, "learning_rate": 9.903846548377206e-05, "loss": 0.0732, "step": 417 }, { "epoch": 0.31357839459864967, "grad_norm": 0.21045026183128357, "learning_rate": 9.902564422910436e-05, "loss": 0.0966, "step": 418 }, { "epoch": 0.3143285821455364, "grad_norm": 0.1927548348903656, "learning_rate": 9.901273890028444e-05, "loss": 0.0635, "step": 419 }, { "epoch": 0.3150787696924231, "grad_norm": 0.11646755039691925, "learning_rate": 9.899974951944367e-05, "loss": 0.0516, "step": 420 }, { "epoch": 0.3158289572393098, "grad_norm": 0.13698947429656982, "learning_rate": 9.898667610885757e-05, "loss": 0.0626, "step": 421 }, { "epoch": 0.31657914478619653, "grad_norm": 0.13424015045166016, "learning_rate": 9.897351869094573e-05, "loss": 0.0571, "step": 422 }, { "epoch": 0.31732933233308325, "grad_norm": 0.20071502029895782, "learning_rate": 9.896027728827185e-05, "loss": 0.0925, "step": 423 }, { "epoch": 0.31807951987997, "grad_norm": 0.14641085267066956, "learning_rate": 9.894695192354362e-05, "loss": 0.055, "step": 424 }, { "epoch": 0.31882970742685673, "grad_norm": 0.23519590497016907, "learning_rate": 9.893354261961274e-05, "loss": 0.0935, "step": 425 }, { "epoch": 0.31957989497374345, "grad_norm": 0.14721961319446564, "learning_rate": 9.892004939947482e-05, "loss": 0.0588, "step": 426 }, { "epoch": 0.32033008252063017, "grad_norm": 0.19134773313999176, "learning_rate": 9.890647228626944e-05, "loss": 0.084, "step": 427 }, { "epoch": 0.3210802700675169, "grad_norm": 0.18689143657684326, "learning_rate": 9.889281130327997e-05, "loss": 0.0695, "step": 428 }, { "epoch": 0.3218304576144036, "grad_norm": 0.16758248209953308, "learning_rate": 9.887906647393368e-05, "loss": 0.0738, "step": 429 }, { "epoch": 0.3225806451612903, "grad_norm": 0.13592371344566345, "learning_rate": 9.88652378218016e-05, "loss": 0.0846, "step": 430 }, { "epoch": 0.32333083270817703, "grad_norm": 0.16219943761825562, "learning_rate": 9.885132537059849e-05, "loss": 0.0805, "step": 431 }, { "epoch": 0.32408102025506375, "grad_norm": 0.10020235925912857, "learning_rate": 9.883732914418285e-05, "loss": 0.0498, "step": 432 }, { "epoch": 0.32483120780195046, "grad_norm": 0.14994826912879944, "learning_rate": 9.882324916655681e-05, "loss": 0.0618, "step": 433 }, { "epoch": 0.32558139534883723, "grad_norm": 0.23991218209266663, "learning_rate": 9.880908546186616e-05, "loss": 0.091, "step": 434 }, { "epoch": 0.32633158289572395, "grad_norm": 0.1507575809955597, "learning_rate": 9.879483805440027e-05, "loss": 0.0716, "step": 435 }, { "epoch": 0.32708177044261066, "grad_norm": 0.09811144322156906, "learning_rate": 9.8780506968592e-05, "loss": 0.0379, "step": 436 }, { "epoch": 0.3278319579894974, "grad_norm": 0.10230189561843872, "learning_rate": 9.876609222901781e-05, "loss": 0.0409, "step": 437 }, { "epoch": 0.3285821455363841, "grad_norm": 0.22452911734580994, "learning_rate": 9.875159386039749e-05, "loss": 0.0938, "step": 438 }, { "epoch": 0.3293323330832708, "grad_norm": 0.19232669472694397, "learning_rate": 9.873701188759438e-05, "loss": 0.1031, "step": 439 }, { "epoch": 0.3300825206301575, "grad_norm": 0.14251667261123657, "learning_rate": 9.872234633561509e-05, "loss": 0.0674, "step": 440 }, { "epoch": 0.33083270817704424, "grad_norm": 0.14763318002223969, "learning_rate": 9.87075972296096e-05, "loss": 0.0603, "step": 441 }, { "epoch": 0.33158289572393096, "grad_norm": 0.1664535254240036, "learning_rate": 9.86927645948712e-05, "loss": 0.0775, "step": 442 }, { "epoch": 0.33233308327081773, "grad_norm": 0.1140127182006836, "learning_rate": 9.867784845683637e-05, "loss": 0.0771, "step": 443 }, { "epoch": 0.33308327081770445, "grad_norm": 0.1934347301721573, "learning_rate": 9.866284884108481e-05, "loss": 0.0796, "step": 444 }, { "epoch": 0.33383345836459116, "grad_norm": 0.2020241767168045, "learning_rate": 9.864776577333941e-05, "loss": 0.0763, "step": 445 }, { "epoch": 0.3345836459114779, "grad_norm": 0.10327481478452682, "learning_rate": 9.863259927946613e-05, "loss": 0.0496, "step": 446 }, { "epoch": 0.3353338334583646, "grad_norm": 0.1181400716304779, "learning_rate": 9.861734938547405e-05, "loss": 0.0535, "step": 447 }, { "epoch": 0.3360840210052513, "grad_norm": 0.1940775215625763, "learning_rate": 9.860201611751518e-05, "loss": 0.0844, "step": 448 }, { "epoch": 0.336834208552138, "grad_norm": 0.12212885916233063, "learning_rate": 9.858659950188458e-05, "loss": 0.0664, "step": 449 }, { "epoch": 0.33758439609902474, "grad_norm": 0.14932742714881897, "learning_rate": 9.857109956502027e-05, "loss": 0.0786, "step": 450 }, { "epoch": 0.33833458364591146, "grad_norm": 0.15180030465126038, "learning_rate": 9.855551633350306e-05, "loss": 0.098, "step": 451 }, { "epoch": 0.3390847711927982, "grad_norm": 0.14569677412509918, "learning_rate": 9.853984983405668e-05, "loss": 0.0781, "step": 452 }, { "epoch": 0.33983495873968494, "grad_norm": 0.14552511274814606, "learning_rate": 9.852410009354766e-05, "loss": 0.0504, "step": 453 }, { "epoch": 0.34058514628657166, "grad_norm": 0.16309498250484467, "learning_rate": 9.850826713898521e-05, "loss": 0.0645, "step": 454 }, { "epoch": 0.3413353338334584, "grad_norm": 0.11566208302974701, "learning_rate": 9.849235099752132e-05, "loss": 0.0705, "step": 455 }, { "epoch": 0.3420855213803451, "grad_norm": 0.14972847700119019, "learning_rate": 9.847635169645058e-05, "loss": 0.0682, "step": 456 }, { "epoch": 0.3428357089272318, "grad_norm": 0.21967746317386627, "learning_rate": 9.846026926321024e-05, "loss": 0.0742, "step": 457 }, { "epoch": 0.3435858964741185, "grad_norm": 0.13541746139526367, "learning_rate": 9.844410372538006e-05, "loss": 0.0622, "step": 458 }, { "epoch": 0.34433608402100524, "grad_norm": 0.14925634860992432, "learning_rate": 9.842785511068239e-05, "loss": 0.0722, "step": 459 }, { "epoch": 0.34508627156789196, "grad_norm": 0.10999463498592377, "learning_rate": 9.841152344698197e-05, "loss": 0.0425, "step": 460 }, { "epoch": 0.34583645911477867, "grad_norm": 0.1686522364616394, "learning_rate": 9.8395108762286e-05, "loss": 0.0602, "step": 461 }, { "epoch": 0.34658664666166544, "grad_norm": 0.1271689385175705, "learning_rate": 9.837861108474404e-05, "loss": 0.0473, "step": 462 }, { "epoch": 0.34733683420855216, "grad_norm": 0.12710584700107574, "learning_rate": 9.8362030442648e-05, "loss": 0.0352, "step": 463 }, { "epoch": 0.3480870217554389, "grad_norm": 0.18865905702114105, "learning_rate": 9.834536686443204e-05, "loss": 0.07, "step": 464 }, { "epoch": 0.3488372093023256, "grad_norm": 0.12701815366744995, "learning_rate": 9.832862037867257e-05, "loss": 0.0504, "step": 465 }, { "epoch": 0.3495873968492123, "grad_norm": 0.26698964834213257, "learning_rate": 9.831179101408813e-05, "loss": 0.0944, "step": 466 }, { "epoch": 0.350337584396099, "grad_norm": 0.1741272509098053, "learning_rate": 9.829487879953946e-05, "loss": 0.0799, "step": 467 }, { "epoch": 0.35108777194298574, "grad_norm": 0.1799943894147873, "learning_rate": 9.827788376402932e-05, "loss": 0.0862, "step": 468 }, { "epoch": 0.35183795948987245, "grad_norm": 0.14110024273395538, "learning_rate": 9.826080593670253e-05, "loss": 0.062, "step": 469 }, { "epoch": 0.35258814703675917, "grad_norm": 0.30671846866607666, "learning_rate": 9.82436453468459e-05, "loss": 0.0911, "step": 470 }, { "epoch": 0.3533383345836459, "grad_norm": 0.2468806803226471, "learning_rate": 9.822640202388812e-05, "loss": 0.0777, "step": 471 }, { "epoch": 0.35408852213053266, "grad_norm": 0.12997083365917206, "learning_rate": 9.820907599739979e-05, "loss": 0.0566, "step": 472 }, { "epoch": 0.3548387096774194, "grad_norm": 0.16618408262729645, "learning_rate": 9.819166729709336e-05, "loss": 0.068, "step": 473 }, { "epoch": 0.3555888972243061, "grad_norm": 0.18094521760940552, "learning_rate": 9.817417595282304e-05, "loss": 0.0681, "step": 474 }, { "epoch": 0.3563390847711928, "grad_norm": 0.13062360882759094, "learning_rate": 9.815660199458476e-05, "loss": 0.0533, "step": 475 }, { "epoch": 0.3570892723180795, "grad_norm": 0.17325717210769653, "learning_rate": 9.81389454525161e-05, "loss": 0.0761, "step": 476 }, { "epoch": 0.35783945986496624, "grad_norm": 0.14805567264556885, "learning_rate": 9.812120635689632e-05, "loss": 0.0792, "step": 477 }, { "epoch": 0.35858964741185295, "grad_norm": 0.22663648426532745, "learning_rate": 9.810338473814621e-05, "loss": 0.0912, "step": 478 }, { "epoch": 0.35933983495873967, "grad_norm": 0.18868513405323029, "learning_rate": 9.808548062682812e-05, "loss": 0.0645, "step": 479 }, { "epoch": 0.3600900225056264, "grad_norm": 0.16635006666183472, "learning_rate": 9.80674940536458e-05, "loss": 0.0642, "step": 480 }, { "epoch": 0.36084021005251316, "grad_norm": 0.13103297352790833, "learning_rate": 9.804942504944445e-05, "loss": 0.0624, "step": 481 }, { "epoch": 0.36159039759939987, "grad_norm": 0.1823355257511139, "learning_rate": 9.803127364521067e-05, "loss": 0.068, "step": 482 }, { "epoch": 0.3623405851462866, "grad_norm": 0.09847483783960342, "learning_rate": 9.801303987207229e-05, "loss": 0.0557, "step": 483 }, { "epoch": 0.3630907726931733, "grad_norm": 0.20062148571014404, "learning_rate": 9.799472376129846e-05, "loss": 0.0981, "step": 484 }, { "epoch": 0.36384096024006, "grad_norm": 0.1890101432800293, "learning_rate": 9.79763253442995e-05, "loss": 0.0817, "step": 485 }, { "epoch": 0.36459114778694673, "grad_norm": 0.2274111807346344, "learning_rate": 9.795784465262689e-05, "loss": 0.0758, "step": 486 }, { "epoch": 0.36534133533383345, "grad_norm": 0.30035200715065, "learning_rate": 9.79392817179732e-05, "loss": 0.0743, "step": 487 }, { "epoch": 0.36609152288072017, "grad_norm": 0.10150797665119171, "learning_rate": 9.792063657217201e-05, "loss": 0.0486, "step": 488 }, { "epoch": 0.3668417104276069, "grad_norm": 0.10696882009506226, "learning_rate": 9.790190924719793e-05, "loss": 0.0415, "step": 489 }, { "epoch": 0.3675918979744936, "grad_norm": 0.16352054476737976, "learning_rate": 9.788309977516648e-05, "loss": 0.0831, "step": 490 }, { "epoch": 0.36834208552138037, "grad_norm": 0.15026482939720154, "learning_rate": 9.786420818833404e-05, "loss": 0.0474, "step": 491 }, { "epoch": 0.3690922730682671, "grad_norm": 0.13316163420677185, "learning_rate": 9.784523451909782e-05, "loss": 0.0525, "step": 492 }, { "epoch": 0.3698424606151538, "grad_norm": 0.19105587899684906, "learning_rate": 9.78261787999958e-05, "loss": 0.1024, "step": 493 }, { "epoch": 0.3705926481620405, "grad_norm": 0.20892846584320068, "learning_rate": 9.780704106370667e-05, "loss": 0.08, "step": 494 }, { "epoch": 0.37134283570892723, "grad_norm": 0.1441580057144165, "learning_rate": 9.778782134304976e-05, "loss": 0.057, "step": 495 }, { "epoch": 0.37209302325581395, "grad_norm": 0.10130858421325684, "learning_rate": 9.776851967098499e-05, "loss": 0.0416, "step": 496 }, { "epoch": 0.37284321080270066, "grad_norm": 0.17521536350250244, "learning_rate": 9.774913608061282e-05, "loss": 0.0666, "step": 497 }, { "epoch": 0.3735933983495874, "grad_norm": 0.18551689386367798, "learning_rate": 9.772967060517421e-05, "loss": 0.0669, "step": 498 }, { "epoch": 0.3743435858964741, "grad_norm": 0.15010853111743927, "learning_rate": 9.771012327805055e-05, "loss": 0.0507, "step": 499 }, { "epoch": 0.37509377344336087, "grad_norm": 0.18854506313800812, "learning_rate": 9.769049413276355e-05, "loss": 0.0627, "step": 500 }, { "epoch": 0.3758439609902476, "grad_norm": 0.18690386414527893, "learning_rate": 9.767078320297528e-05, "loss": 0.0773, "step": 501 }, { "epoch": 0.3765941485371343, "grad_norm": 0.19477099180221558, "learning_rate": 9.765099052248805e-05, "loss": 0.0585, "step": 502 }, { "epoch": 0.377344336084021, "grad_norm": 0.2110205888748169, "learning_rate": 9.763111612524434e-05, "loss": 0.065, "step": 503 }, { "epoch": 0.37809452363090773, "grad_norm": 0.11988997459411621, "learning_rate": 9.761116004532679e-05, "loss": 0.0523, "step": 504 }, { "epoch": 0.37884471117779445, "grad_norm": 0.1722867339849472, "learning_rate": 9.759112231695811e-05, "loss": 0.0819, "step": 505 }, { "epoch": 0.37959489872468116, "grad_norm": 0.19149507582187653, "learning_rate": 9.757100297450103e-05, "loss": 0.1036, "step": 506 }, { "epoch": 0.3803450862715679, "grad_norm": 0.14034023880958557, "learning_rate": 9.755080205245826e-05, "loss": 0.0473, "step": 507 }, { "epoch": 0.3810952738184546, "grad_norm": 0.18162596225738525, "learning_rate": 9.753051958547238e-05, "loss": 0.0597, "step": 508 }, { "epoch": 0.3818454613653413, "grad_norm": 0.2020900994539261, "learning_rate": 9.751015560832582e-05, "loss": 0.0953, "step": 509 }, { "epoch": 0.3825956489122281, "grad_norm": 0.16374067962169647, "learning_rate": 9.748971015594078e-05, "loss": 0.0602, "step": 510 }, { "epoch": 0.3833458364591148, "grad_norm": 0.15971016883850098, "learning_rate": 9.746918326337923e-05, "loss": 0.0506, "step": 511 }, { "epoch": 0.3840960240060015, "grad_norm": 0.20230558514595032, "learning_rate": 9.744857496584274e-05, "loss": 0.1119, "step": 512 }, { "epoch": 0.38484621155288823, "grad_norm": 0.18030567467212677, "learning_rate": 9.742788529867255e-05, "loss": 0.0736, "step": 513 }, { "epoch": 0.38559639909977494, "grad_norm": 0.12409501522779465, "learning_rate": 9.740711429734936e-05, "loss": 0.0677, "step": 514 }, { "epoch": 0.38634658664666166, "grad_norm": 0.19922702014446259, "learning_rate": 9.738626199749341e-05, "loss": 0.0703, "step": 515 }, { "epoch": 0.3870967741935484, "grad_norm": 0.22303518652915955, "learning_rate": 9.736532843486433e-05, "loss": 0.0985, "step": 516 }, { "epoch": 0.3878469617404351, "grad_norm": 0.259988397359848, "learning_rate": 9.734431364536114e-05, "loss": 0.0886, "step": 517 }, { "epoch": 0.3885971492873218, "grad_norm": 0.17871250212192535, "learning_rate": 9.732321766502213e-05, "loss": 0.0929, "step": 518 }, { "epoch": 0.3893473368342086, "grad_norm": 0.06613419950008392, "learning_rate": 9.730204053002481e-05, "loss": 0.0376, "step": 519 }, { "epoch": 0.3900975243810953, "grad_norm": 0.14904949069023132, "learning_rate": 9.728078227668588e-05, "loss": 0.0616, "step": 520 }, { "epoch": 0.390847711927982, "grad_norm": 0.15744099020957947, "learning_rate": 9.725944294146119e-05, "loss": 0.0696, "step": 521 }, { "epoch": 0.3915978994748687, "grad_norm": 0.19264377653598785, "learning_rate": 9.723802256094555e-05, "loss": 0.0765, "step": 522 }, { "epoch": 0.39234808702175544, "grad_norm": 0.17883890867233276, "learning_rate": 9.721652117187283e-05, "loss": 0.0666, "step": 523 }, { "epoch": 0.39309827456864216, "grad_norm": 0.1510065793991089, "learning_rate": 9.71949388111158e-05, "loss": 0.0796, "step": 524 }, { "epoch": 0.3938484621155289, "grad_norm": 0.16549058258533478, "learning_rate": 9.717327551568608e-05, "loss": 0.0761, "step": 525 }, { "epoch": 0.3945986496624156, "grad_norm": 0.10932381451129913, "learning_rate": 9.715153132273407e-05, "loss": 0.058, "step": 526 }, { "epoch": 0.3953488372093023, "grad_norm": 0.15327946841716766, "learning_rate": 9.712970626954893e-05, "loss": 0.0592, "step": 527 }, { "epoch": 0.396099024756189, "grad_norm": 0.1345522105693817, "learning_rate": 9.71078003935585e-05, "loss": 0.0752, "step": 528 }, { "epoch": 0.3968492123030758, "grad_norm": 0.10842501372098923, "learning_rate": 9.708581373232917e-05, "loss": 0.0607, "step": 529 }, { "epoch": 0.3975993998499625, "grad_norm": 0.15361064672470093, "learning_rate": 9.70637463235659e-05, "loss": 0.0648, "step": 530 }, { "epoch": 0.3983495873968492, "grad_norm": 0.13886478543281555, "learning_rate": 9.704159820511214e-05, "loss": 0.0888, "step": 531 }, { "epoch": 0.39909977494373594, "grad_norm": 0.15999573469161987, "learning_rate": 9.701936941494971e-05, "loss": 0.0651, "step": 532 }, { "epoch": 0.39984996249062266, "grad_norm": 0.23427529633045197, "learning_rate": 9.699705999119882e-05, "loss": 0.0672, "step": 533 }, { "epoch": 0.4006001500375094, "grad_norm": 0.10031108558177948, "learning_rate": 9.697466997211793e-05, "loss": 0.0414, "step": 534 }, { "epoch": 0.4013503375843961, "grad_norm": 0.1497843712568283, "learning_rate": 9.69521993961037e-05, "loss": 0.0784, "step": 535 }, { "epoch": 0.4021005251312828, "grad_norm": 0.14574141800403595, "learning_rate": 9.692964830169098e-05, "loss": 0.0683, "step": 536 }, { "epoch": 0.4028507126781695, "grad_norm": 0.12160151451826096, "learning_rate": 9.690701672755266e-05, "loss": 0.0692, "step": 537 }, { "epoch": 0.4036009002250563, "grad_norm": 0.15294575691223145, "learning_rate": 9.688430471249967e-05, "loss": 0.0645, "step": 538 }, { "epoch": 0.404351087771943, "grad_norm": 0.12379523366689682, "learning_rate": 9.686151229548088e-05, "loss": 0.0446, "step": 539 }, { "epoch": 0.4051012753188297, "grad_norm": 0.11460956931114197, "learning_rate": 9.683863951558301e-05, "loss": 0.068, "step": 540 }, { "epoch": 0.40585146286571644, "grad_norm": 0.11864083260297775, "learning_rate": 9.681568641203068e-05, "loss": 0.0263, "step": 541 }, { "epoch": 0.40660165041260315, "grad_norm": 0.1245616003870964, "learning_rate": 9.679265302418615e-05, "loss": 0.054, "step": 542 }, { "epoch": 0.40735183795948987, "grad_norm": 0.16922646760940552, "learning_rate": 9.676953939154945e-05, "loss": 0.0824, "step": 543 }, { "epoch": 0.4081020255063766, "grad_norm": 0.24694877862930298, "learning_rate": 9.674634555375817e-05, "loss": 0.0666, "step": 544 }, { "epoch": 0.4088522130532633, "grad_norm": 0.1859944462776184, "learning_rate": 9.672307155058744e-05, "loss": 0.0959, "step": 545 }, { "epoch": 0.40960240060015, "grad_norm": 0.21233129501342773, "learning_rate": 9.669971742194992e-05, "loss": 0.1242, "step": 546 }, { "epoch": 0.41035258814703673, "grad_norm": 0.19170650839805603, "learning_rate": 9.667628320789562e-05, "loss": 0.0795, "step": 547 }, { "epoch": 0.4111027756939235, "grad_norm": 0.16862432658672333, "learning_rate": 9.665276894861188e-05, "loss": 0.0835, "step": 548 }, { "epoch": 0.4118529632408102, "grad_norm": 0.12009342759847641, "learning_rate": 9.66291746844234e-05, "loss": 0.0483, "step": 549 }, { "epoch": 0.41260315078769694, "grad_norm": 0.16073642671108246, "learning_rate": 9.660550045579199e-05, "loss": 0.0631, "step": 550 }, { "epoch": 0.41335333833458365, "grad_norm": 0.2920999825000763, "learning_rate": 9.65817463033166e-05, "loss": 0.097, "step": 551 }, { "epoch": 0.41410352588147037, "grad_norm": 0.14352647960186005, "learning_rate": 9.655791226773331e-05, "loss": 0.0654, "step": 552 }, { "epoch": 0.4148537134283571, "grad_norm": 0.16379228234291077, "learning_rate": 9.65339983899151e-05, "loss": 0.0574, "step": 553 }, { "epoch": 0.4156039009752438, "grad_norm": 0.15645262598991394, "learning_rate": 9.651000471087193e-05, "loss": 0.0556, "step": 554 }, { "epoch": 0.4163540885221305, "grad_norm": 0.1435929536819458, "learning_rate": 9.64859312717506e-05, "loss": 0.0597, "step": 555 }, { "epoch": 0.41710427606901723, "grad_norm": 0.12999999523162842, "learning_rate": 9.64617781138347e-05, "loss": 0.0598, "step": 556 }, { "epoch": 0.41785446361590395, "grad_norm": 0.18979759514331818, "learning_rate": 9.643754527854451e-05, "loss": 0.0857, "step": 557 }, { "epoch": 0.4186046511627907, "grad_norm": 0.18712787330150604, "learning_rate": 9.641323280743693e-05, "loss": 0.0692, "step": 558 }, { "epoch": 0.41935483870967744, "grad_norm": 0.12155728787183762, "learning_rate": 9.638884074220548e-05, "loss": 0.0728, "step": 559 }, { "epoch": 0.42010502625656415, "grad_norm": 0.16611698269844055, "learning_rate": 9.636436912468015e-05, "loss": 0.0585, "step": 560 }, { "epoch": 0.42085521380345087, "grad_norm": 0.1172047033905983, "learning_rate": 9.633981799682735e-05, "loss": 0.082, "step": 561 }, { "epoch": 0.4216054013503376, "grad_norm": 0.09718162566423416, "learning_rate": 9.631518740074985e-05, "loss": 0.0281, "step": 562 }, { "epoch": 0.4223555888972243, "grad_norm": 0.10215144604444504, "learning_rate": 9.629047737868669e-05, "loss": 0.0544, "step": 563 }, { "epoch": 0.423105776444111, "grad_norm": 0.1456746608018875, "learning_rate": 9.626568797301311e-05, "loss": 0.0673, "step": 564 }, { "epoch": 0.42385596399099773, "grad_norm": 0.1138940155506134, "learning_rate": 9.624081922624053e-05, "loss": 0.0516, "step": 565 }, { "epoch": 0.42460615153788445, "grad_norm": 0.2256423681974411, "learning_rate": 9.621587118101638e-05, "loss": 0.0729, "step": 566 }, { "epoch": 0.4253563390847712, "grad_norm": 0.159426748752594, "learning_rate": 9.619084388012412e-05, "loss": 0.0676, "step": 567 }, { "epoch": 0.42610652663165793, "grad_norm": 0.21540193259716034, "learning_rate": 9.616573736648308e-05, "loss": 0.0879, "step": 568 }, { "epoch": 0.42685671417854465, "grad_norm": 0.12239127606153488, "learning_rate": 9.61405516831485e-05, "loss": 0.0502, "step": 569 }, { "epoch": 0.42760690172543137, "grad_norm": 0.15514914691448212, "learning_rate": 9.61152868733113e-05, "loss": 0.0592, "step": 570 }, { "epoch": 0.4283570892723181, "grad_norm": 0.10405272245407104, "learning_rate": 9.608994298029818e-05, "loss": 0.0546, "step": 571 }, { "epoch": 0.4291072768192048, "grad_norm": 0.1854027807712555, "learning_rate": 9.60645200475714e-05, "loss": 0.0711, "step": 572 }, { "epoch": 0.4298574643660915, "grad_norm": 0.14749664068222046, "learning_rate": 9.603901811872877e-05, "loss": 0.0709, "step": 573 }, { "epoch": 0.43060765191297823, "grad_norm": 0.1849459558725357, "learning_rate": 9.601343723750363e-05, "loss": 0.0826, "step": 574 }, { "epoch": 0.43135783945986494, "grad_norm": 0.10557045042514801, "learning_rate": 9.598777744776464e-05, "loss": 0.0506, "step": 575 }, { "epoch": 0.43210802700675166, "grad_norm": 0.13303524255752563, "learning_rate": 9.596203879351582e-05, "loss": 0.0526, "step": 576 }, { "epoch": 0.43285821455363843, "grad_norm": 0.1565026342868805, "learning_rate": 9.593622131889643e-05, "loss": 0.099, "step": 577 }, { "epoch": 0.43360840210052515, "grad_norm": 0.12550555169582367, "learning_rate": 9.591032506818089e-05, "loss": 0.0525, "step": 578 }, { "epoch": 0.43435858964741186, "grad_norm": 0.14403165876865387, "learning_rate": 9.588435008577873e-05, "loss": 0.0613, "step": 579 }, { "epoch": 0.4351087771942986, "grad_norm": 0.144205242395401, "learning_rate": 9.585829641623448e-05, "loss": 0.0596, "step": 580 }, { "epoch": 0.4358589647411853, "grad_norm": 0.14015857875347137, "learning_rate": 9.583216410422762e-05, "loss": 0.0773, "step": 581 }, { "epoch": 0.436609152288072, "grad_norm": 0.15716828405857086, "learning_rate": 9.580595319457249e-05, "loss": 0.0545, "step": 582 }, { "epoch": 0.4373593398349587, "grad_norm": 0.25865277647972107, "learning_rate": 9.577966373221823e-05, "loss": 0.0885, "step": 583 }, { "epoch": 0.43810952738184544, "grad_norm": 0.21027599275112152, "learning_rate": 9.575329576224868e-05, "loss": 0.0704, "step": 584 }, { "epoch": 0.43885971492873216, "grad_norm": 0.1413678526878357, "learning_rate": 9.572684932988227e-05, "loss": 0.0464, "step": 585 }, { "epoch": 0.43960990247561893, "grad_norm": 0.1475217193365097, "learning_rate": 9.570032448047208e-05, "loss": 0.0648, "step": 586 }, { "epoch": 0.44036009002250565, "grad_norm": 0.1679753214120865, "learning_rate": 9.567372125950559e-05, "loss": 0.0583, "step": 587 }, { "epoch": 0.44111027756939236, "grad_norm": 0.13862156867980957, "learning_rate": 9.564703971260472e-05, "loss": 0.0648, "step": 588 }, { "epoch": 0.4418604651162791, "grad_norm": 0.12006936222314835, "learning_rate": 9.562027988552567e-05, "loss": 0.0477, "step": 589 }, { "epoch": 0.4426106526631658, "grad_norm": 0.2421911656856537, "learning_rate": 9.559344182415891e-05, "loss": 0.0996, "step": 590 }, { "epoch": 0.4433608402100525, "grad_norm": 0.16158467531204224, "learning_rate": 9.55665255745291e-05, "loss": 0.0735, "step": 591 }, { "epoch": 0.4441110277569392, "grad_norm": 0.17239175736904144, "learning_rate": 9.553953118279496e-05, "loss": 0.0827, "step": 592 }, { "epoch": 0.44486121530382594, "grad_norm": 0.2204049825668335, "learning_rate": 9.551245869524916e-05, "loss": 0.0696, "step": 593 }, { "epoch": 0.44561140285071266, "grad_norm": 0.11255641281604767, "learning_rate": 9.54853081583184e-05, "loss": 0.048, "step": 594 }, { "epoch": 0.4463615903975994, "grad_norm": 0.11511281132698059, "learning_rate": 9.545807961856317e-05, "loss": 0.0569, "step": 595 }, { "epoch": 0.44711177794448614, "grad_norm": 0.18176564574241638, "learning_rate": 9.543077312267773e-05, "loss": 0.0637, "step": 596 }, { "epoch": 0.44786196549137286, "grad_norm": 0.1667564958333969, "learning_rate": 9.540338871749002e-05, "loss": 0.0808, "step": 597 }, { "epoch": 0.4486121530382596, "grad_norm": 0.2754107117652893, "learning_rate": 9.537592644996162e-05, "loss": 0.102, "step": 598 }, { "epoch": 0.4493623405851463, "grad_norm": 0.10601028054952621, "learning_rate": 9.534838636718759e-05, "loss": 0.0475, "step": 599 }, { "epoch": 0.450112528132033, "grad_norm": 0.13959158957004547, "learning_rate": 9.532076851639649e-05, "loss": 0.0649, "step": 600 }, { "epoch": 0.450112528132033, "eval_loss": 0.07095802575349808, "eval_runtime": 2.656, "eval_samples_per_second": 20.332, "eval_steps_per_second": 5.271, "step": 600 }, { "epoch": 0.4508627156789197, "grad_norm": 0.08408086746931076, "learning_rate": 9.529307294495018e-05, "loss": 0.0446, "step": 601 }, { "epoch": 0.45161290322580644, "grad_norm": 0.1700592041015625, "learning_rate": 9.526529970034386e-05, "loss": 0.1001, "step": 602 }, { "epoch": 0.45236309077269315, "grad_norm": 0.16827765107154846, "learning_rate": 9.52374488302059e-05, "loss": 0.0726, "step": 603 }, { "epoch": 0.45311327831957987, "grad_norm": 0.1284192055463791, "learning_rate": 9.52095203822978e-05, "loss": 0.0514, "step": 604 }, { "epoch": 0.45386346586646664, "grad_norm": 0.1511625349521637, "learning_rate": 9.518151440451411e-05, "loss": 0.0658, "step": 605 }, { "epoch": 0.45461365341335336, "grad_norm": 0.14977985620498657, "learning_rate": 9.515343094488232e-05, "loss": 0.0655, "step": 606 }, { "epoch": 0.4553638409602401, "grad_norm": 0.20794422924518585, "learning_rate": 9.51252700515628e-05, "loss": 0.0732, "step": 607 }, { "epoch": 0.4561140285071268, "grad_norm": 0.18329943716526031, "learning_rate": 9.509703177284869e-05, "loss": 0.0909, "step": 608 }, { "epoch": 0.4568642160540135, "grad_norm": 0.19080418348312378, "learning_rate": 9.506871615716587e-05, "loss": 0.0964, "step": 609 }, { "epoch": 0.4576144036009002, "grad_norm": 0.15063661336898804, "learning_rate": 9.504032325307284e-05, "loss": 0.0554, "step": 610 }, { "epoch": 0.45836459114778694, "grad_norm": 0.18503394722938538, "learning_rate": 9.501185310926062e-05, "loss": 0.0725, "step": 611 }, { "epoch": 0.45911477869467365, "grad_norm": 0.16494551301002502, "learning_rate": 9.498330577455273e-05, "loss": 0.0653, "step": 612 }, { "epoch": 0.45986496624156037, "grad_norm": 0.14555247128009796, "learning_rate": 9.495468129790499e-05, "loss": 0.062, "step": 613 }, { "epoch": 0.4606151537884471, "grad_norm": 0.13288968801498413, "learning_rate": 9.49259797284056e-05, "loss": 0.0678, "step": 614 }, { "epoch": 0.46136534133533386, "grad_norm": 0.17785833775997162, "learning_rate": 9.489720111527492e-05, "loss": 0.1018, "step": 615 }, { "epoch": 0.46211552888222057, "grad_norm": 0.156355082988739, "learning_rate": 9.486834550786543e-05, "loss": 0.0651, "step": 616 }, { "epoch": 0.4628657164291073, "grad_norm": 0.09945828467607498, "learning_rate": 9.483941295566165e-05, "loss": 0.0455, "step": 617 }, { "epoch": 0.463615903975994, "grad_norm": 0.15181989967823029, "learning_rate": 9.481040350828006e-05, "loss": 0.0414, "step": 618 }, { "epoch": 0.4643660915228807, "grad_norm": 0.1600968986749649, "learning_rate": 9.4781317215469e-05, "loss": 0.0601, "step": 619 }, { "epoch": 0.46511627906976744, "grad_norm": 0.14700649678707123, "learning_rate": 9.475215412710864e-05, "loss": 0.0588, "step": 620 }, { "epoch": 0.46586646661665415, "grad_norm": 0.11616702377796173, "learning_rate": 9.472291429321075e-05, "loss": 0.0521, "step": 621 }, { "epoch": 0.46661665416354087, "grad_norm": 0.1925794631242752, "learning_rate": 9.469359776391879e-05, "loss": 0.0885, "step": 622 }, { "epoch": 0.4673668417104276, "grad_norm": 0.22271235287189484, "learning_rate": 9.466420458950773e-05, "loss": 0.1039, "step": 623 }, { "epoch": 0.46811702925731435, "grad_norm": 0.13130241632461548, "learning_rate": 9.463473482038395e-05, "loss": 0.0556, "step": 624 }, { "epoch": 0.46886721680420107, "grad_norm": 0.20446011424064636, "learning_rate": 9.46051885070852e-05, "loss": 0.0842, "step": 625 }, { "epoch": 0.4696174043510878, "grad_norm": 0.17276065051555634, "learning_rate": 9.457556570028052e-05, "loss": 0.0651, "step": 626 }, { "epoch": 0.4703675918979745, "grad_norm": 0.14631524682044983, "learning_rate": 9.454586645077011e-05, "loss": 0.0638, "step": 627 }, { "epoch": 0.4711177794448612, "grad_norm": 0.3121795058250427, "learning_rate": 9.451609080948522e-05, "loss": 0.1204, "step": 628 }, { "epoch": 0.47186796699174793, "grad_norm": 0.12264740467071533, "learning_rate": 9.448623882748817e-05, "loss": 0.0624, "step": 629 }, { "epoch": 0.47261815453863465, "grad_norm": 0.09743326157331467, "learning_rate": 9.445631055597217e-05, "loss": 0.06, "step": 630 }, { "epoch": 0.47336834208552137, "grad_norm": 0.18968532979488373, "learning_rate": 9.442630604626126e-05, "loss": 0.1011, "step": 631 }, { "epoch": 0.4741185296324081, "grad_norm": 0.10481397807598114, "learning_rate": 9.43962253498102e-05, "loss": 0.0544, "step": 632 }, { "epoch": 0.4748687171792948, "grad_norm": 0.12469401955604553, "learning_rate": 9.436606851820444e-05, "loss": 0.0551, "step": 633 }, { "epoch": 0.47561890472618157, "grad_norm": 0.16171766817569733, "learning_rate": 9.433583560315999e-05, "loss": 0.073, "step": 634 }, { "epoch": 0.4763690922730683, "grad_norm": 0.12446840107440948, "learning_rate": 9.430552665652328e-05, "loss": 0.0601, "step": 635 }, { "epoch": 0.477119279819955, "grad_norm": 0.1368139535188675, "learning_rate": 9.427514173027121e-05, "loss": 0.0547, "step": 636 }, { "epoch": 0.4778694673668417, "grad_norm": 0.1330532282590866, "learning_rate": 9.424468087651092e-05, "loss": 0.0536, "step": 637 }, { "epoch": 0.47861965491372843, "grad_norm": 0.13870249688625336, "learning_rate": 9.421414414747978e-05, "loss": 0.0439, "step": 638 }, { "epoch": 0.47936984246061515, "grad_norm": 0.11789939552545547, "learning_rate": 9.418353159554526e-05, "loss": 0.0601, "step": 639 }, { "epoch": 0.48012003000750186, "grad_norm": 0.1677863597869873, "learning_rate": 9.415284327320489e-05, "loss": 0.0574, "step": 640 }, { "epoch": 0.4808702175543886, "grad_norm": 0.12923505902290344, "learning_rate": 9.41220792330861e-05, "loss": 0.0646, "step": 641 }, { "epoch": 0.4816204051012753, "grad_norm": 0.16255703568458557, "learning_rate": 9.40912395279462e-05, "loss": 0.1003, "step": 642 }, { "epoch": 0.48237059264816207, "grad_norm": 0.12949885427951813, "learning_rate": 9.406032421067224e-05, "loss": 0.0509, "step": 643 }, { "epoch": 0.4831207801950488, "grad_norm": 0.1707848310470581, "learning_rate": 9.402933333428097e-05, "loss": 0.0737, "step": 644 }, { "epoch": 0.4838709677419355, "grad_norm": 0.14833003282546997, "learning_rate": 9.399826695191868e-05, "loss": 0.0637, "step": 645 }, { "epoch": 0.4846211552888222, "grad_norm": 0.140524223446846, "learning_rate": 9.396712511686114e-05, "loss": 0.0627, "step": 646 }, { "epoch": 0.48537134283570893, "grad_norm": 0.13231976330280304, "learning_rate": 9.393590788251354e-05, "loss": 0.0627, "step": 647 }, { "epoch": 0.48612153038259565, "grad_norm": 0.1312929391860962, "learning_rate": 9.390461530241037e-05, "loss": 0.0447, "step": 648 }, { "epoch": 0.48687171792948236, "grad_norm": 0.2141118049621582, "learning_rate": 9.38732474302153e-05, "loss": 0.0775, "step": 649 }, { "epoch": 0.4876219054763691, "grad_norm": 0.19238243997097015, "learning_rate": 9.384180431972119e-05, "loss": 0.0575, "step": 650 }, { "epoch": 0.4883720930232558, "grad_norm": 0.13290603458881378, "learning_rate": 9.381028602484984e-05, "loss": 0.0645, "step": 651 }, { "epoch": 0.4891222805701425, "grad_norm": 0.1605457365512848, "learning_rate": 9.377869259965202e-05, "loss": 0.0623, "step": 652 }, { "epoch": 0.4898724681170293, "grad_norm": 0.17835864424705505, "learning_rate": 9.374702409830736e-05, "loss": 0.0992, "step": 653 }, { "epoch": 0.490622655663916, "grad_norm": 0.2935710549354553, "learning_rate": 9.37152805751242e-05, "loss": 0.0668, "step": 654 }, { "epoch": 0.4913728432108027, "grad_norm": 0.19382020831108093, "learning_rate": 9.36834620845396e-05, "loss": 0.0645, "step": 655 }, { "epoch": 0.49212303075768943, "grad_norm": 0.1444290429353714, "learning_rate": 9.365156868111908e-05, "loss": 0.0612, "step": 656 }, { "epoch": 0.49287321830457614, "grad_norm": 0.12895061075687408, "learning_rate": 9.361960041955672e-05, "loss": 0.0468, "step": 657 }, { "epoch": 0.49362340585146286, "grad_norm": 0.13367299735546112, "learning_rate": 9.358755735467494e-05, "loss": 0.0663, "step": 658 }, { "epoch": 0.4943735933983496, "grad_norm": 0.17381970584392548, "learning_rate": 9.355543954142446e-05, "loss": 0.0857, "step": 659 }, { "epoch": 0.4951237809452363, "grad_norm": 0.1889047920703888, "learning_rate": 9.352324703488412e-05, "loss": 0.0968, "step": 660 }, { "epoch": 0.495873968492123, "grad_norm": 0.16304278373718262, "learning_rate": 9.349097989026093e-05, "loss": 0.0682, "step": 661 }, { "epoch": 0.4966241560390098, "grad_norm": 0.19829104840755463, "learning_rate": 9.345863816288985e-05, "loss": 0.0684, "step": 662 }, { "epoch": 0.4973743435858965, "grad_norm": 0.15854620933532715, "learning_rate": 9.342622190823378e-05, "loss": 0.0664, "step": 663 }, { "epoch": 0.4981245311327832, "grad_norm": 0.25643864274024963, "learning_rate": 9.339373118188338e-05, "loss": 0.0805, "step": 664 }, { "epoch": 0.4988747186796699, "grad_norm": 0.13145574927330017, "learning_rate": 9.336116603955707e-05, "loss": 0.0526, "step": 665 }, { "epoch": 0.49962490622655664, "grad_norm": 0.13013754785060883, "learning_rate": 9.332852653710084e-05, "loss": 0.0505, "step": 666 }, { "epoch": 0.5003750937734434, "grad_norm": 0.1300058811903, "learning_rate": 9.329581273048822e-05, "loss": 0.048, "step": 667 }, { "epoch": 0.5011252813203301, "grad_norm": 0.15247949957847595, "learning_rate": 9.32630246758202e-05, "loss": 0.0657, "step": 668 }, { "epoch": 0.5018754688672168, "grad_norm": 0.1410437822341919, "learning_rate": 9.323016242932504e-05, "loss": 0.0936, "step": 669 }, { "epoch": 0.5026256564141035, "grad_norm": 0.12135988473892212, "learning_rate": 9.319722604735825e-05, "loss": 0.0641, "step": 670 }, { "epoch": 0.5033758439609902, "grad_norm": 0.15133972465991974, "learning_rate": 9.31642155864025e-05, "loss": 0.0675, "step": 671 }, { "epoch": 0.5041260315078769, "grad_norm": 0.1304636150598526, "learning_rate": 9.313113110306748e-05, "loss": 0.0479, "step": 672 }, { "epoch": 0.5048762190547637, "grad_norm": 0.1356845498085022, "learning_rate": 9.309797265408979e-05, "loss": 0.0588, "step": 673 }, { "epoch": 0.5056264066016504, "grad_norm": 0.1584395468235016, "learning_rate": 9.306474029633294e-05, "loss": 0.0543, "step": 674 }, { "epoch": 0.5063765941485371, "grad_norm": 0.16862989962100983, "learning_rate": 9.303143408678716e-05, "loss": 0.0693, "step": 675 }, { "epoch": 0.5071267816954238, "grad_norm": 0.12757883965969086, "learning_rate": 9.299805408256928e-05, "loss": 0.0376, "step": 676 }, { "epoch": 0.5078769692423106, "grad_norm": 0.1599583476781845, "learning_rate": 9.296460034092274e-05, "loss": 0.0593, "step": 677 }, { "epoch": 0.5086271567891973, "grad_norm": 0.12639591097831726, "learning_rate": 9.293107291921741e-05, "loss": 0.0675, "step": 678 }, { "epoch": 0.5093773443360841, "grad_norm": 0.10506059974431992, "learning_rate": 9.289747187494952e-05, "loss": 0.0365, "step": 679 }, { "epoch": 0.5101275318829708, "grad_norm": 0.1758994609117508, "learning_rate": 9.286379726574155e-05, "loss": 0.0484, "step": 680 }, { "epoch": 0.5108777194298575, "grad_norm": 0.1311517059803009, "learning_rate": 9.283004914934215e-05, "loss": 0.0524, "step": 681 }, { "epoch": 0.5116279069767442, "grad_norm": 0.1752699762582779, "learning_rate": 9.2796227583626e-05, "loss": 0.0897, "step": 682 }, { "epoch": 0.5123780945236309, "grad_norm": 0.15679432451725006, "learning_rate": 9.276233262659375e-05, "loss": 0.058, "step": 683 }, { "epoch": 0.5131282820705176, "grad_norm": 0.15536212921142578, "learning_rate": 9.272836433637193e-05, "loss": 0.0739, "step": 684 }, { "epoch": 0.5138784696174044, "grad_norm": 0.15193812549114227, "learning_rate": 9.269432277121281e-05, "loss": 0.055, "step": 685 }, { "epoch": 0.5146286571642911, "grad_norm": 0.1312589794397354, "learning_rate": 9.266020798949433e-05, "loss": 0.0463, "step": 686 }, { "epoch": 0.5153788447111778, "grad_norm": 0.16843152046203613, "learning_rate": 9.262602004971996e-05, "loss": 0.065, "step": 687 }, { "epoch": 0.5161290322580645, "grad_norm": 0.12552760541439056, "learning_rate": 9.259175901051867e-05, "loss": 0.0533, "step": 688 }, { "epoch": 0.5168792198049512, "grad_norm": 0.18700753152370453, "learning_rate": 9.255742493064474e-05, "loss": 0.0739, "step": 689 }, { "epoch": 0.5176294073518379, "grad_norm": 0.24483560025691986, "learning_rate": 9.252301786897776e-05, "loss": 0.0679, "step": 690 }, { "epoch": 0.5183795948987246, "grad_norm": 0.20147421956062317, "learning_rate": 9.248853788452247e-05, "loss": 0.0753, "step": 691 }, { "epoch": 0.5191297824456114, "grad_norm": 0.18182440102100372, "learning_rate": 9.24539850364086e-05, "loss": 0.0444, "step": 692 }, { "epoch": 0.5198799699924981, "grad_norm": 0.24294641613960266, "learning_rate": 9.241935938389093e-05, "loss": 0.0712, "step": 693 }, { "epoch": 0.5206301575393848, "grad_norm": 0.1470489203929901, "learning_rate": 9.238466098634902e-05, "loss": 0.0557, "step": 694 }, { "epoch": 0.5213803450862715, "grad_norm": 0.19487878680229187, "learning_rate": 9.234988990328719e-05, "loss": 0.0826, "step": 695 }, { "epoch": 0.5221305326331583, "grad_norm": 0.13695645332336426, "learning_rate": 9.231504619433445e-05, "loss": 0.0662, "step": 696 }, { "epoch": 0.522880720180045, "grad_norm": 0.2282414436340332, "learning_rate": 9.228012991924433e-05, "loss": 0.0939, "step": 697 }, { "epoch": 0.5236309077269318, "grad_norm": 0.154418483376503, "learning_rate": 9.224514113789477e-05, "loss": 0.0622, "step": 698 }, { "epoch": 0.5243810952738185, "grad_norm": 0.2042827010154724, "learning_rate": 9.221007991028814e-05, "loss": 0.0698, "step": 699 }, { "epoch": 0.5251312828207052, "grad_norm": 0.1465548425912857, "learning_rate": 9.217494629655094e-05, "loss": 0.0468, "step": 700 }, { "epoch": 0.5258814703675919, "grad_norm": 0.12249326705932617, "learning_rate": 9.213974035693389e-05, "loss": 0.0453, "step": 701 }, { "epoch": 0.5266316579144786, "grad_norm": 0.11008257418870926, "learning_rate": 9.21044621518117e-05, "loss": 0.0506, "step": 702 }, { "epoch": 0.5273818454613654, "grad_norm": 0.19281499087810516, "learning_rate": 9.206911174168301e-05, "loss": 0.0797, "step": 703 }, { "epoch": 0.5281320330082521, "grad_norm": 0.16175544261932373, "learning_rate": 9.20336891871703e-05, "loss": 0.0649, "step": 704 }, { "epoch": 0.5288822205551388, "grad_norm": 0.15684905648231506, "learning_rate": 9.199819454901977e-05, "loss": 0.0677, "step": 705 }, { "epoch": 0.5296324081020255, "grad_norm": 0.2674317955970764, "learning_rate": 9.196262788810121e-05, "loss": 0.1118, "step": 706 }, { "epoch": 0.5303825956489122, "grad_norm": 0.1423470377922058, "learning_rate": 9.192698926540795e-05, "loss": 0.0764, "step": 707 }, { "epoch": 0.5311327831957989, "grad_norm": 0.16944824159145355, "learning_rate": 9.189127874205674e-05, "loss": 0.0689, "step": 708 }, { "epoch": 0.5318829707426856, "grad_norm": 0.19155418872833252, "learning_rate": 9.185549637928758e-05, "loss": 0.0654, "step": 709 }, { "epoch": 0.5326331582895724, "grad_norm": 0.16647079586982727, "learning_rate": 9.181964223846371e-05, "loss": 0.0961, "step": 710 }, { "epoch": 0.5333833458364591, "grad_norm": 0.1647406965494156, "learning_rate": 9.178371638107146e-05, "loss": 0.0745, "step": 711 }, { "epoch": 0.5341335333833458, "grad_norm": 0.17723067104816437, "learning_rate": 9.174771886872011e-05, "loss": 0.0557, "step": 712 }, { "epoch": 0.5348837209302325, "grad_norm": 0.206720232963562, "learning_rate": 9.17116497631419e-05, "loss": 0.0733, "step": 713 }, { "epoch": 0.5356339084771192, "grad_norm": 0.16216231882572174, "learning_rate": 9.167550912619173e-05, "loss": 0.0576, "step": 714 }, { "epoch": 0.536384096024006, "grad_norm": 0.11692114174365997, "learning_rate": 9.16392970198473e-05, "loss": 0.0526, "step": 715 }, { "epoch": 0.5371342835708928, "grad_norm": 0.13061407208442688, "learning_rate": 9.160301350620875e-05, "loss": 0.0599, "step": 716 }, { "epoch": 0.5378844711177795, "grad_norm": 0.10589703917503357, "learning_rate": 9.156665864749876e-05, "loss": 0.0689, "step": 717 }, { "epoch": 0.5386346586646662, "grad_norm": 0.12243220210075378, "learning_rate": 9.153023250606234e-05, "loss": 0.0537, "step": 718 }, { "epoch": 0.5393848462115529, "grad_norm": 0.18423284590244293, "learning_rate": 9.14937351443667e-05, "loss": 0.1045, "step": 719 }, { "epoch": 0.5401350337584396, "grad_norm": 0.17368818819522858, "learning_rate": 9.145716662500126e-05, "loss": 0.0825, "step": 720 }, { "epoch": 0.5408852213053263, "grad_norm": 0.1663954257965088, "learning_rate": 9.142052701067741e-05, "loss": 0.1131, "step": 721 }, { "epoch": 0.5416354088522131, "grad_norm": 0.19684909284114838, "learning_rate": 9.13838163642285e-05, "loss": 0.0925, "step": 722 }, { "epoch": 0.5423855963990998, "grad_norm": 0.18679098784923553, "learning_rate": 9.134703474860963e-05, "loss": 0.0513, "step": 723 }, { "epoch": 0.5431357839459865, "grad_norm": 0.1188245639204979, "learning_rate": 9.13101822268977e-05, "loss": 0.0636, "step": 724 }, { "epoch": 0.5438859714928732, "grad_norm": 0.25207948684692383, "learning_rate": 9.127325886229115e-05, "loss": 0.0907, "step": 725 }, { "epoch": 0.5446361590397599, "grad_norm": 0.13716183602809906, "learning_rate": 9.123626471810988e-05, "loss": 0.0533, "step": 726 }, { "epoch": 0.5453863465866466, "grad_norm": 0.3036738634109497, "learning_rate": 9.119919985779521e-05, "loss": 0.1033, "step": 727 }, { "epoch": 0.5461365341335334, "grad_norm": 0.18584197759628296, "learning_rate": 9.116206434490976e-05, "loss": 0.1002, "step": 728 }, { "epoch": 0.5468867216804201, "grad_norm": 0.21791420876979828, "learning_rate": 9.112485824313726e-05, "loss": 0.0672, "step": 729 }, { "epoch": 0.5476369092273068, "grad_norm": 0.29374638199806213, "learning_rate": 9.10875816162825e-05, "loss": 0.0671, "step": 730 }, { "epoch": 0.5483870967741935, "grad_norm": 0.12101375311613083, "learning_rate": 9.105023452827121e-05, "loss": 0.0599, "step": 731 }, { "epoch": 0.5491372843210802, "grad_norm": 0.13263867795467377, "learning_rate": 9.101281704315002e-05, "loss": 0.0619, "step": 732 }, { "epoch": 0.5498874718679669, "grad_norm": 0.17979863286018372, "learning_rate": 9.097532922508619e-05, "loss": 0.0669, "step": 733 }, { "epoch": 0.5506376594148538, "grad_norm": 0.14049032330513, "learning_rate": 9.093777113836765e-05, "loss": 0.0643, "step": 734 }, { "epoch": 0.5513878469617405, "grad_norm": 0.1863878220319748, "learning_rate": 9.090014284740283e-05, "loss": 0.0684, "step": 735 }, { "epoch": 0.5521380345086272, "grad_norm": 0.13345694541931152, "learning_rate": 9.086244441672052e-05, "loss": 0.061, "step": 736 }, { "epoch": 0.5528882220555139, "grad_norm": 0.2068844437599182, "learning_rate": 9.082467591096982e-05, "loss": 0.0801, "step": 737 }, { "epoch": 0.5536384096024006, "grad_norm": 0.12110719829797745, "learning_rate": 9.078683739492002e-05, "loss": 0.0769, "step": 738 }, { "epoch": 0.5543885971492873, "grad_norm": 0.2093079388141632, "learning_rate": 9.074892893346043e-05, "loss": 0.0782, "step": 739 }, { "epoch": 0.5551387846961741, "grad_norm": 0.12246458232402802, "learning_rate": 9.071095059160035e-05, "loss": 0.0521, "step": 740 }, { "epoch": 0.5558889722430608, "grad_norm": 0.115948386490345, "learning_rate": 9.067290243446887e-05, "loss": 0.0497, "step": 741 }, { "epoch": 0.5566391597899475, "grad_norm": 0.16100749373435974, "learning_rate": 9.063478452731484e-05, "loss": 0.0615, "step": 742 }, { "epoch": 0.5573893473368342, "grad_norm": 0.1973392218351364, "learning_rate": 9.059659693550673e-05, "loss": 0.0736, "step": 743 }, { "epoch": 0.5581395348837209, "grad_norm": 0.12596265971660614, "learning_rate": 9.055833972453249e-05, "loss": 0.0725, "step": 744 }, { "epoch": 0.5588897224306076, "grad_norm": 0.22205737233161926, "learning_rate": 9.052001295999947e-05, "loss": 0.1457, "step": 745 }, { "epoch": 0.5596399099774944, "grad_norm": 0.15573537349700928, "learning_rate": 9.048161670763429e-05, "loss": 0.0668, "step": 746 }, { "epoch": 0.5603900975243811, "grad_norm": 0.1514582484960556, "learning_rate": 9.044315103328276e-05, "loss": 0.0452, "step": 747 }, { "epoch": 0.5611402850712678, "grad_norm": 0.12557372450828552, "learning_rate": 9.04046160029097e-05, "loss": 0.0483, "step": 748 }, { "epoch": 0.5618904726181545, "grad_norm": 0.188478484749794, "learning_rate": 9.036601168259893e-05, "loss": 0.0584, "step": 749 }, { "epoch": 0.5626406601650412, "grad_norm": 0.18197283148765564, "learning_rate": 9.032733813855301e-05, "loss": 0.078, "step": 750 }, { "epoch": 0.5633908477119279, "grad_norm": 0.20598235726356506, "learning_rate": 9.02885954370933e-05, "loss": 0.0575, "step": 751 }, { "epoch": 0.5641410352588146, "grad_norm": 0.2001180350780487, "learning_rate": 9.02497836446597e-05, "loss": 0.0912, "step": 752 }, { "epoch": 0.5648912228057015, "grad_norm": 0.12836728990077972, "learning_rate": 9.021090282781059e-05, "loss": 0.0592, "step": 753 }, { "epoch": 0.5656414103525882, "grad_norm": 0.15549710392951965, "learning_rate": 9.01719530532228e-05, "loss": 0.0955, "step": 754 }, { "epoch": 0.5663915978994749, "grad_norm": 0.10522754490375519, "learning_rate": 9.01329343876913e-05, "loss": 0.0417, "step": 755 }, { "epoch": 0.5671417854463616, "grad_norm": 0.17558157444000244, "learning_rate": 9.009384689812928e-05, "loss": 0.0686, "step": 756 }, { "epoch": 0.5678919729932483, "grad_norm": 0.129380464553833, "learning_rate": 9.005469065156795e-05, "loss": 0.0614, "step": 757 }, { "epoch": 0.568642160540135, "grad_norm": 0.2937568426132202, "learning_rate": 9.00154657151564e-05, "loss": 0.1039, "step": 758 }, { "epoch": 0.5693923480870218, "grad_norm": 0.10850602388381958, "learning_rate": 8.997617215616154e-05, "loss": 0.0518, "step": 759 }, { "epoch": 0.5701425356339085, "grad_norm": 0.11563457548618317, "learning_rate": 8.993681004196797e-05, "loss": 0.0513, "step": 760 }, { "epoch": 0.5708927231807952, "grad_norm": 0.12018301337957382, "learning_rate": 8.989737944007781e-05, "loss": 0.0354, "step": 761 }, { "epoch": 0.5716429107276819, "grad_norm": 0.16475611925125122, "learning_rate": 8.985788041811068e-05, "loss": 0.0801, "step": 762 }, { "epoch": 0.5723930982745686, "grad_norm": 0.1512751430273056, "learning_rate": 8.981831304380348e-05, "loss": 0.0735, "step": 763 }, { "epoch": 0.5731432858214554, "grad_norm": 0.12887828052043915, "learning_rate": 8.97786773850104e-05, "loss": 0.0511, "step": 764 }, { "epoch": 0.5738934733683421, "grad_norm": 0.11312253773212433, "learning_rate": 8.973897350970269e-05, "loss": 0.0501, "step": 765 }, { "epoch": 0.5746436609152288, "grad_norm": 0.20769958198070526, "learning_rate": 8.969920148596857e-05, "loss": 0.1016, "step": 766 }, { "epoch": 0.5753938484621155, "grad_norm": 0.15098562836647034, "learning_rate": 8.965936138201314e-05, "loss": 0.0648, "step": 767 }, { "epoch": 0.5761440360090022, "grad_norm": 0.16101424396038055, "learning_rate": 8.961945326615829e-05, "loss": 0.0829, "step": 768 }, { "epoch": 0.5768942235558889, "grad_norm": 0.14836345613002777, "learning_rate": 8.957947720684246e-05, "loss": 0.0691, "step": 769 }, { "epoch": 0.5776444111027756, "grad_norm": 0.19855999946594238, "learning_rate": 8.953943327262066e-05, "loss": 0.0712, "step": 770 }, { "epoch": 0.5783945986496624, "grad_norm": 0.14603960514068604, "learning_rate": 8.949932153216434e-05, "loss": 0.0685, "step": 771 }, { "epoch": 0.5791447861965492, "grad_norm": 0.09962477535009384, "learning_rate": 8.945914205426116e-05, "loss": 0.0453, "step": 772 }, { "epoch": 0.5798949737434359, "grad_norm": 0.17665497958660126, "learning_rate": 8.941889490781494e-05, "loss": 0.0738, "step": 773 }, { "epoch": 0.5806451612903226, "grad_norm": 0.16767451167106628, "learning_rate": 8.937858016184563e-05, "loss": 0.0766, "step": 774 }, { "epoch": 0.5813953488372093, "grad_norm": 0.14359025657176971, "learning_rate": 8.933819788548899e-05, "loss": 0.0755, "step": 775 }, { "epoch": 0.582145536384096, "grad_norm": 0.07473455369472504, "learning_rate": 8.92977481479967e-05, "loss": 0.0429, "step": 776 }, { "epoch": 0.5828957239309828, "grad_norm": 0.18343903124332428, "learning_rate": 8.925723101873603e-05, "loss": 0.0764, "step": 777 }, { "epoch": 0.5836459114778695, "grad_norm": 0.11543784290552139, "learning_rate": 8.92166465671899e-05, "loss": 0.0609, "step": 778 }, { "epoch": 0.5843960990247562, "grad_norm": 0.20404362678527832, "learning_rate": 8.917599486295664e-05, "loss": 0.0951, "step": 779 }, { "epoch": 0.5851462865716429, "grad_norm": 0.16599202156066895, "learning_rate": 8.913527597574991e-05, "loss": 0.0814, "step": 780 }, { "epoch": 0.5858964741185296, "grad_norm": 0.11812645941972733, "learning_rate": 8.90944899753986e-05, "loss": 0.042, "step": 781 }, { "epoch": 0.5866466616654163, "grad_norm": 0.11141446232795715, "learning_rate": 8.905363693184668e-05, "loss": 0.0387, "step": 782 }, { "epoch": 0.5873968492123031, "grad_norm": 0.22815898060798645, "learning_rate": 8.901271691515309e-05, "loss": 0.0869, "step": 783 }, { "epoch": 0.5881470367591898, "grad_norm": 0.09209627658128738, "learning_rate": 8.897172999549165e-05, "loss": 0.0434, "step": 784 }, { "epoch": 0.5888972243060765, "grad_norm": 0.12892037630081177, "learning_rate": 8.893067624315088e-05, "loss": 0.0413, "step": 785 }, { "epoch": 0.5896474118529632, "grad_norm": 0.16496118903160095, "learning_rate": 8.888955572853392e-05, "loss": 0.0676, "step": 786 }, { "epoch": 0.5903975993998499, "grad_norm": 0.12853941321372986, "learning_rate": 8.884836852215841e-05, "loss": 0.0605, "step": 787 }, { "epoch": 0.5911477869467366, "grad_norm": 0.1408003568649292, "learning_rate": 8.880711469465635e-05, "loss": 0.0572, "step": 788 }, { "epoch": 0.5918979744936234, "grad_norm": 0.1078130304813385, "learning_rate": 8.876579431677398e-05, "loss": 0.0444, "step": 789 }, { "epoch": 0.5926481620405101, "grad_norm": 0.1855546236038208, "learning_rate": 8.87244074593717e-05, "loss": 0.0624, "step": 790 }, { "epoch": 0.5933983495873969, "grad_norm": 0.15005631744861603, "learning_rate": 8.868295419342389e-05, "loss": 0.0554, "step": 791 }, { "epoch": 0.5941485371342836, "grad_norm": 0.18375401198863983, "learning_rate": 8.86414345900188e-05, "loss": 0.1016, "step": 792 }, { "epoch": 0.5948987246811703, "grad_norm": 0.1501947045326233, "learning_rate": 8.859984872035849e-05, "loss": 0.0645, "step": 793 }, { "epoch": 0.595648912228057, "grad_norm": 0.15210017561912537, "learning_rate": 8.85581966557586e-05, "loss": 0.0739, "step": 794 }, { "epoch": 0.5963990997749438, "grad_norm": 0.1773383766412735, "learning_rate": 8.851647846764835e-05, "loss": 0.0686, "step": 795 }, { "epoch": 0.5971492873218305, "grad_norm": 0.23530487716197968, "learning_rate": 8.847469422757031e-05, "loss": 0.0664, "step": 796 }, { "epoch": 0.5978994748687172, "grad_norm": 0.10172255337238312, "learning_rate": 8.843284400718033e-05, "loss": 0.0416, "step": 797 }, { "epoch": 0.5986496624156039, "grad_norm": 0.14675600826740265, "learning_rate": 8.839092787824743e-05, "loss": 0.0651, "step": 798 }, { "epoch": 0.5993998499624906, "grad_norm": 0.09715414047241211, "learning_rate": 8.834894591265364e-05, "loss": 0.0453, "step": 799 }, { "epoch": 0.6001500375093773, "grad_norm": 0.11148321628570557, "learning_rate": 8.830689818239388e-05, "loss": 0.0468, "step": 800 }, { "epoch": 0.6001500375093773, "eval_loss": 0.07001598179340363, "eval_runtime": 2.6621, "eval_samples_per_second": 20.285, "eval_steps_per_second": 5.259, "step": 800 }, { "epoch": 0.6009002250562641, "grad_norm": 0.11450861394405365, "learning_rate": 8.826478475957589e-05, "loss": 0.0401, "step": 801 }, { "epoch": 0.6016504126031508, "grad_norm": 0.10207905620336533, "learning_rate": 8.822260571642005e-05, "loss": 0.0511, "step": 802 }, { "epoch": 0.6024006001500375, "grad_norm": 0.09535292536020279, "learning_rate": 8.818036112525924e-05, "loss": 0.0613, "step": 803 }, { "epoch": 0.6031507876969242, "grad_norm": 0.18865959346294403, "learning_rate": 8.813805105853879e-05, "loss": 0.0744, "step": 804 }, { "epoch": 0.6039009752438109, "grad_norm": 0.11314369738101959, "learning_rate": 8.809567558881628e-05, "loss": 0.0449, "step": 805 }, { "epoch": 0.6046511627906976, "grad_norm": 0.1716671735048294, "learning_rate": 8.805323478876149e-05, "loss": 0.0764, "step": 806 }, { "epoch": 0.6054013503375844, "grad_norm": 0.15436431765556335, "learning_rate": 8.80107287311562e-05, "loss": 0.0697, "step": 807 }, { "epoch": 0.6061515378844711, "grad_norm": 0.13632182776927948, "learning_rate": 8.796815748889413e-05, "loss": 0.0456, "step": 808 }, { "epoch": 0.6069017254313578, "grad_norm": 0.10250575095415115, "learning_rate": 8.792552113498073e-05, "loss": 0.0502, "step": 809 }, { "epoch": 0.6076519129782446, "grad_norm": 0.22113870084285736, "learning_rate": 8.788281974253318e-05, "loss": 0.1038, "step": 810 }, { "epoch": 0.6084021005251313, "grad_norm": 0.14202985167503357, "learning_rate": 8.784005338478017e-05, "loss": 0.073, "step": 811 }, { "epoch": 0.609152288072018, "grad_norm": 0.15260817110538483, "learning_rate": 8.779722213506178e-05, "loss": 0.0769, "step": 812 }, { "epoch": 0.6099024756189048, "grad_norm": 0.20531697571277618, "learning_rate": 8.775432606682937e-05, "loss": 0.109, "step": 813 }, { "epoch": 0.6106526631657915, "grad_norm": 0.19820401072502136, "learning_rate": 8.77113652536455e-05, "loss": 0.0571, "step": 814 }, { "epoch": 0.6114028507126782, "grad_norm": 0.23314431309700012, "learning_rate": 8.766833976918371e-05, "loss": 0.0665, "step": 815 }, { "epoch": 0.6121530382595649, "grad_norm": 0.20076413452625275, "learning_rate": 8.76252496872285e-05, "loss": 0.076, "step": 816 }, { "epoch": 0.6129032258064516, "grad_norm": 0.11270924657583237, "learning_rate": 8.758209508167508e-05, "loss": 0.0573, "step": 817 }, { "epoch": 0.6136534133533383, "grad_norm": 0.16007684171199799, "learning_rate": 8.753887602652937e-05, "loss": 0.0711, "step": 818 }, { "epoch": 0.614403600900225, "grad_norm": 0.15355923771858215, "learning_rate": 8.74955925959078e-05, "loss": 0.0617, "step": 819 }, { "epoch": 0.6151537884471118, "grad_norm": 0.1625223010778427, "learning_rate": 8.745224486403718e-05, "loss": 0.0726, "step": 820 }, { "epoch": 0.6159039759939985, "grad_norm": 0.15932884812355042, "learning_rate": 8.74088329052546e-05, "loss": 0.0691, "step": 821 }, { "epoch": 0.6166541635408852, "grad_norm": 0.17074544727802277, "learning_rate": 8.73653567940073e-05, "loss": 0.0882, "step": 822 }, { "epoch": 0.6174043510877719, "grad_norm": 0.12639515101909637, "learning_rate": 8.732181660485252e-05, "loss": 0.0447, "step": 823 }, { "epoch": 0.6181545386346586, "grad_norm": 0.12108887732028961, "learning_rate": 8.727821241245742e-05, "loss": 0.0424, "step": 824 }, { "epoch": 0.6189047261815454, "grad_norm": 0.1238684207201004, "learning_rate": 8.723454429159888e-05, "loss": 0.0439, "step": 825 }, { "epoch": 0.6196549137284321, "grad_norm": 0.1405922919511795, "learning_rate": 8.719081231716341e-05, "loss": 0.0576, "step": 826 }, { "epoch": 0.6204051012753188, "grad_norm": 0.1311212182044983, "learning_rate": 8.714701656414708e-05, "loss": 0.0449, "step": 827 }, { "epoch": 0.6211552888222055, "grad_norm": 0.19940714538097382, "learning_rate": 8.710315710765526e-05, "loss": 0.077, "step": 828 }, { "epoch": 0.6219054763690923, "grad_norm": 0.1685575693845749, "learning_rate": 8.705923402290261e-05, "loss": 0.0493, "step": 829 }, { "epoch": 0.622655663915979, "grad_norm": 0.10744970291852951, "learning_rate": 8.701524738521291e-05, "loss": 0.0442, "step": 830 }, { "epoch": 0.6234058514628658, "grad_norm": 0.18083858489990234, "learning_rate": 8.697119727001887e-05, "loss": 0.0812, "step": 831 }, { "epoch": 0.6241560390097525, "grad_norm": 0.1295631229877472, "learning_rate": 8.692708375286217e-05, "loss": 0.0601, "step": 832 }, { "epoch": 0.6249062265566392, "grad_norm": 0.1965394765138626, "learning_rate": 8.688290690939307e-05, "loss": 0.0686, "step": 833 }, { "epoch": 0.6256564141035259, "grad_norm": 0.12678660452365875, "learning_rate": 8.683866681537054e-05, "loss": 0.0511, "step": 834 }, { "epoch": 0.6264066016504126, "grad_norm": 0.08941943943500519, "learning_rate": 8.679436354666202e-05, "loss": 0.0408, "step": 835 }, { "epoch": 0.6271567891972993, "grad_norm": 0.15167708694934845, "learning_rate": 8.67499971792432e-05, "loss": 0.0657, "step": 836 }, { "epoch": 0.627906976744186, "grad_norm": 0.15622377395629883, "learning_rate": 8.670556778919805e-05, "loss": 0.0667, "step": 837 }, { "epoch": 0.6286571642910728, "grad_norm": 0.4224050045013428, "learning_rate": 8.666107545271859e-05, "loss": 0.1789, "step": 838 }, { "epoch": 0.6294073518379595, "grad_norm": 0.24056875705718994, "learning_rate": 8.661652024610482e-05, "loss": 0.0992, "step": 839 }, { "epoch": 0.6301575393848462, "grad_norm": 0.10566816478967667, "learning_rate": 8.657190224576453e-05, "loss": 0.0384, "step": 840 }, { "epoch": 0.6309077269317329, "grad_norm": 0.11838135123252869, "learning_rate": 8.652722152821318e-05, "loss": 0.0555, "step": 841 }, { "epoch": 0.6316579144786196, "grad_norm": 0.10628216713666916, "learning_rate": 8.64824781700738e-05, "loss": 0.0542, "step": 842 }, { "epoch": 0.6324081020255063, "grad_norm": 0.17062321305274963, "learning_rate": 8.643767224807685e-05, "loss": 0.0836, "step": 843 }, { "epoch": 0.6331582895723931, "grad_norm": 0.1308564692735672, "learning_rate": 8.639280383906008e-05, "loss": 0.0835, "step": 844 }, { "epoch": 0.6339084771192798, "grad_norm": 0.2822250723838806, "learning_rate": 8.634787301996839e-05, "loss": 0.0664, "step": 845 }, { "epoch": 0.6346586646661665, "grad_norm": 0.1532263457775116, "learning_rate": 8.630287986785368e-05, "loss": 0.0753, "step": 846 }, { "epoch": 0.6354088522130532, "grad_norm": 0.1545627862215042, "learning_rate": 8.625782445987483e-05, "loss": 0.0623, "step": 847 }, { "epoch": 0.63615903975994, "grad_norm": 0.1597648561000824, "learning_rate": 8.621270687329738e-05, "loss": 0.0584, "step": 848 }, { "epoch": 0.6369092273068268, "grad_norm": 0.1512480229139328, "learning_rate": 8.616752718549359e-05, "loss": 0.0859, "step": 849 }, { "epoch": 0.6376594148537135, "grad_norm": 0.11838315427303314, "learning_rate": 8.612228547394215e-05, "loss": 0.0662, "step": 850 }, { "epoch": 0.6384096024006002, "grad_norm": 0.1927739381790161, "learning_rate": 8.607698181622814e-05, "loss": 0.0883, "step": 851 }, { "epoch": 0.6391597899474869, "grad_norm": 0.1669844686985016, "learning_rate": 8.603161629004287e-05, "loss": 0.0869, "step": 852 }, { "epoch": 0.6399099774943736, "grad_norm": 0.16778261959552765, "learning_rate": 8.598618897318375e-05, "loss": 0.0879, "step": 853 }, { "epoch": 0.6406601650412603, "grad_norm": 0.2178831249475479, "learning_rate": 8.594069994355419e-05, "loss": 0.0635, "step": 854 }, { "epoch": 0.641410352588147, "grad_norm": 0.19614070653915405, "learning_rate": 8.589514927916336e-05, "loss": 0.0836, "step": 855 }, { "epoch": 0.6421605401350338, "grad_norm": 0.12226955592632294, "learning_rate": 8.584953705812615e-05, "loss": 0.0714, "step": 856 }, { "epoch": 0.6429107276819205, "grad_norm": 0.1600448340177536, "learning_rate": 8.580386335866308e-05, "loss": 0.0753, "step": 857 }, { "epoch": 0.6436609152288072, "grad_norm": 0.12808331847190857, "learning_rate": 8.575812825909998e-05, "loss": 0.0583, "step": 858 }, { "epoch": 0.6444111027756939, "grad_norm": 0.13148951530456543, "learning_rate": 8.57123318378681e-05, "loss": 0.0575, "step": 859 }, { "epoch": 0.6451612903225806, "grad_norm": 0.13673394918441772, "learning_rate": 8.566647417350378e-05, "loss": 0.064, "step": 860 }, { "epoch": 0.6459114778694673, "grad_norm": 0.8954272866249084, "learning_rate": 8.562055534464838e-05, "loss": 0.0658, "step": 861 }, { "epoch": 0.6466616654163541, "grad_norm": 0.1643034815788269, "learning_rate": 8.557457543004819e-05, "loss": 0.0753, "step": 862 }, { "epoch": 0.6474118529632408, "grad_norm": 0.1936601996421814, "learning_rate": 8.552853450855422e-05, "loss": 0.0955, "step": 863 }, { "epoch": 0.6481620405101275, "grad_norm": 0.11946239322423935, "learning_rate": 8.548243265912213e-05, "loss": 0.0711, "step": 864 }, { "epoch": 0.6489122280570142, "grad_norm": 0.13554371893405914, "learning_rate": 8.543626996081202e-05, "loss": 0.0569, "step": 865 }, { "epoch": 0.6496624156039009, "grad_norm": 0.1824364960193634, "learning_rate": 8.539004649278841e-05, "loss": 0.0683, "step": 866 }, { "epoch": 0.6504126031507877, "grad_norm": 0.11970564723014832, "learning_rate": 8.534376233432e-05, "loss": 0.0503, "step": 867 }, { "epoch": 0.6511627906976745, "grad_norm": 0.16899152100086212, "learning_rate": 8.529741756477953e-05, "loss": 0.0775, "step": 868 }, { "epoch": 0.6519129782445612, "grad_norm": 0.13145393133163452, "learning_rate": 8.525101226364374e-05, "loss": 0.0784, "step": 869 }, { "epoch": 0.6526631657914479, "grad_norm": 0.11247047781944275, "learning_rate": 8.520454651049313e-05, "loss": 0.0493, "step": 870 }, { "epoch": 0.6534133533383346, "grad_norm": 0.1390233337879181, "learning_rate": 8.51580203850119e-05, "loss": 0.0507, "step": 871 }, { "epoch": 0.6541635408852213, "grad_norm": 0.14679768681526184, "learning_rate": 8.511143396698781e-05, "loss": 0.0698, "step": 872 }, { "epoch": 0.654913728432108, "grad_norm": 0.3411470949649811, "learning_rate": 8.506478733631193e-05, "loss": 0.1122, "step": 873 }, { "epoch": 0.6556639159789948, "grad_norm": 0.12128949165344238, "learning_rate": 8.501808057297866e-05, "loss": 0.0519, "step": 874 }, { "epoch": 0.6564141035258815, "grad_norm": 0.12452667206525803, "learning_rate": 8.49713137570855e-05, "loss": 0.0549, "step": 875 }, { "epoch": 0.6571642910727682, "grad_norm": 0.12882107496261597, "learning_rate": 8.492448696883292e-05, "loss": 0.0627, "step": 876 }, { "epoch": 0.6579144786196549, "grad_norm": 0.26537570357322693, "learning_rate": 8.487760028852427e-05, "loss": 0.0543, "step": 877 }, { "epoch": 0.6586646661665416, "grad_norm": 0.1377580612897873, "learning_rate": 8.483065379656558e-05, "loss": 0.0676, "step": 878 }, { "epoch": 0.6594148537134283, "grad_norm": 0.13235914707183838, "learning_rate": 8.478364757346546e-05, "loss": 0.0567, "step": 879 }, { "epoch": 0.660165041260315, "grad_norm": 0.23526223003864288, "learning_rate": 8.473658169983496e-05, "loss": 0.0602, "step": 880 }, { "epoch": 0.6609152288072018, "grad_norm": 0.1625460982322693, "learning_rate": 8.468945625638742e-05, "loss": 0.0723, "step": 881 }, { "epoch": 0.6616654163540885, "grad_norm": 0.19045262038707733, "learning_rate": 8.464227132393831e-05, "loss": 0.0901, "step": 882 }, { "epoch": 0.6624156039009752, "grad_norm": 0.20493094623088837, "learning_rate": 8.459502698340519e-05, "loss": 0.0856, "step": 883 }, { "epoch": 0.6631657914478619, "grad_norm": 0.1073690876364708, "learning_rate": 8.45477233158074e-05, "loss": 0.0397, "step": 884 }, { "epoch": 0.6639159789947486, "grad_norm": 0.12174858152866364, "learning_rate": 8.450036040226612e-05, "loss": 0.0567, "step": 885 }, { "epoch": 0.6646661665416355, "grad_norm": 0.10243424773216248, "learning_rate": 8.445293832400402e-05, "loss": 0.0391, "step": 886 }, { "epoch": 0.6654163540885222, "grad_norm": 0.10561061650514603, "learning_rate": 8.440545716234538e-05, "loss": 0.0341, "step": 887 }, { "epoch": 0.6661665416354089, "grad_norm": 0.09283889830112457, "learning_rate": 8.435791699871564e-05, "loss": 0.0487, "step": 888 }, { "epoch": 0.6669167291822956, "grad_norm": 0.17678828537464142, "learning_rate": 8.431031791464155e-05, "loss": 0.088, "step": 889 }, { "epoch": 0.6676669167291823, "grad_norm": 0.11029431223869324, "learning_rate": 8.426265999175081e-05, "loss": 0.0433, "step": 890 }, { "epoch": 0.668417104276069, "grad_norm": 0.22715550661087036, "learning_rate": 8.421494331177208e-05, "loss": 0.0859, "step": 891 }, { "epoch": 0.6691672918229558, "grad_norm": 0.1496887505054474, "learning_rate": 8.41671679565348e-05, "loss": 0.0584, "step": 892 }, { "epoch": 0.6699174793698425, "grad_norm": 0.20761065185070038, "learning_rate": 8.411933400796896e-05, "loss": 0.0691, "step": 893 }, { "epoch": 0.6706676669167292, "grad_norm": 0.17341768741607666, "learning_rate": 8.407144154810509e-05, "loss": 0.0869, "step": 894 }, { "epoch": 0.6714178544636159, "grad_norm": 0.13700233399868011, "learning_rate": 8.402349065907403e-05, "loss": 0.0644, "step": 895 }, { "epoch": 0.6721680420105026, "grad_norm": 0.19762715697288513, "learning_rate": 8.397548142310685e-05, "loss": 0.07, "step": 896 }, { "epoch": 0.6729182295573893, "grad_norm": 0.1878272444009781, "learning_rate": 8.392741392253465e-05, "loss": 0.0722, "step": 897 }, { "epoch": 0.673668417104276, "grad_norm": 0.18015135824680328, "learning_rate": 8.387928823978846e-05, "loss": 0.0544, "step": 898 }, { "epoch": 0.6744186046511628, "grad_norm": 0.10764296352863312, "learning_rate": 8.383110445739907e-05, "loss": 0.0369, "step": 899 }, { "epoch": 0.6751687921980495, "grad_norm": 0.09914194792509079, "learning_rate": 8.378286265799698e-05, "loss": 0.0452, "step": 900 }, { "epoch": 0.6759189797449362, "grad_norm": 0.17248530685901642, "learning_rate": 8.373456292431206e-05, "loss": 0.0662, "step": 901 }, { "epoch": 0.6766691672918229, "grad_norm": 0.18333736062049866, "learning_rate": 8.368620533917363e-05, "loss": 0.0724, "step": 902 }, { "epoch": 0.6774193548387096, "grad_norm": 0.11491480469703674, "learning_rate": 8.363778998551018e-05, "loss": 0.0645, "step": 903 }, { "epoch": 0.6781695423855963, "grad_norm": 0.13533350825309753, "learning_rate": 8.358931694634928e-05, "loss": 0.0541, "step": 904 }, { "epoch": 0.6789197299324832, "grad_norm": 0.15621417760849, "learning_rate": 8.35407863048174e-05, "loss": 0.0574, "step": 905 }, { "epoch": 0.6796699174793699, "grad_norm": 0.12412711977958679, "learning_rate": 8.349219814413984e-05, "loss": 0.0335, "step": 906 }, { "epoch": 0.6804201050262566, "grad_norm": 0.22609005868434906, "learning_rate": 8.344355254764049e-05, "loss": 0.0816, "step": 907 }, { "epoch": 0.6811702925731433, "grad_norm": 0.1081281304359436, "learning_rate": 8.339484959874178e-05, "loss": 0.0421, "step": 908 }, { "epoch": 0.68192048012003, "grad_norm": 0.15061849355697632, "learning_rate": 8.334608938096443e-05, "loss": 0.06, "step": 909 }, { "epoch": 0.6826706676669168, "grad_norm": 0.15337811410427094, "learning_rate": 8.329727197792744e-05, "loss": 0.0484, "step": 910 }, { "epoch": 0.6834208552138035, "grad_norm": 0.10216069221496582, "learning_rate": 8.324839747334787e-05, "loss": 0.0363, "step": 911 }, { "epoch": 0.6841710427606902, "grad_norm": 0.1537286341190338, "learning_rate": 8.319946595104065e-05, "loss": 0.0459, "step": 912 }, { "epoch": 0.6849212303075769, "grad_norm": 0.14444048702716827, "learning_rate": 8.315047749491851e-05, "loss": 0.0518, "step": 913 }, { "epoch": 0.6856714178544636, "grad_norm": 0.15967465937137604, "learning_rate": 8.310143218899187e-05, "loss": 0.0499, "step": 914 }, { "epoch": 0.6864216054013503, "grad_norm": 0.17263127863407135, "learning_rate": 8.305233011736857e-05, "loss": 0.087, "step": 915 }, { "epoch": 0.687171792948237, "grad_norm": 0.18618431687355042, "learning_rate": 8.300317136425385e-05, "loss": 0.0533, "step": 916 }, { "epoch": 0.6879219804951238, "grad_norm": 0.17359670996665955, "learning_rate": 8.295395601395011e-05, "loss": 0.0653, "step": 917 }, { "epoch": 0.6886721680420105, "grad_norm": 0.14314648509025574, "learning_rate": 8.290468415085683e-05, "loss": 0.0523, "step": 918 }, { "epoch": 0.6894223555888972, "grad_norm": 0.19745318591594696, "learning_rate": 8.285535585947042e-05, "loss": 0.0623, "step": 919 }, { "epoch": 0.6901725431357839, "grad_norm": 0.1895579844713211, "learning_rate": 8.280597122438404e-05, "loss": 0.0668, "step": 920 }, { "epoch": 0.6909227306826706, "grad_norm": 0.17338034510612488, "learning_rate": 8.275653033028745e-05, "loss": 0.0607, "step": 921 }, { "epoch": 0.6916729182295573, "grad_norm": 0.12494196742773056, "learning_rate": 8.270703326196696e-05, "loss": 0.0646, "step": 922 }, { "epoch": 0.6924231057764441, "grad_norm": 0.12629970908164978, "learning_rate": 8.265748010430513e-05, "loss": 0.0576, "step": 923 }, { "epoch": 0.6931732933233309, "grad_norm": 0.14575371146202087, "learning_rate": 8.260787094228076e-05, "loss": 0.0684, "step": 924 }, { "epoch": 0.6939234808702176, "grad_norm": 0.1634964495897293, "learning_rate": 8.255820586096867e-05, "loss": 0.0797, "step": 925 }, { "epoch": 0.6946736684171043, "grad_norm": 0.17009958624839783, "learning_rate": 8.25084849455396e-05, "loss": 0.0808, "step": 926 }, { "epoch": 0.695423855963991, "grad_norm": 0.17264492809772491, "learning_rate": 8.245870828126e-05, "loss": 0.0842, "step": 927 }, { "epoch": 0.6961740435108777, "grad_norm": 0.1700659990310669, "learning_rate": 8.240887595349197e-05, "loss": 0.0652, "step": 928 }, { "epoch": 0.6969242310577645, "grad_norm": 0.16623543202877045, "learning_rate": 8.235898804769303e-05, "loss": 0.0827, "step": 929 }, { "epoch": 0.6976744186046512, "grad_norm": 0.17181211709976196, "learning_rate": 8.230904464941604e-05, "loss": 0.0755, "step": 930 }, { "epoch": 0.6984246061515379, "grad_norm": 0.19417133927345276, "learning_rate": 8.225904584430901e-05, "loss": 0.09, "step": 931 }, { "epoch": 0.6991747936984246, "grad_norm": 0.14830316603183746, "learning_rate": 8.220899171811495e-05, "loss": 0.0559, "step": 932 }, { "epoch": 0.6999249812453113, "grad_norm": 0.11206447333097458, "learning_rate": 8.215888235667176e-05, "loss": 0.0567, "step": 933 }, { "epoch": 0.700675168792198, "grad_norm": 0.15120255947113037, "learning_rate": 8.210871784591207e-05, "loss": 0.059, "step": 934 }, { "epoch": 0.7014253563390848, "grad_norm": 0.152877077460289, "learning_rate": 8.205849827186308e-05, "loss": 0.0574, "step": 935 }, { "epoch": 0.7021755438859715, "grad_norm": 0.1164332926273346, "learning_rate": 8.200822372064641e-05, "loss": 0.0586, "step": 936 }, { "epoch": 0.7029257314328582, "grad_norm": 0.17080263793468475, "learning_rate": 8.195789427847796e-05, "loss": 0.0597, "step": 937 }, { "epoch": 0.7036759189797449, "grad_norm": 0.0957210585474968, "learning_rate": 8.190751003166778e-05, "loss": 0.039, "step": 938 }, { "epoch": 0.7044261065266316, "grad_norm": 0.15183965861797333, "learning_rate": 8.185707106661986e-05, "loss": 0.0742, "step": 939 }, { "epoch": 0.7051762940735183, "grad_norm": 0.15807807445526123, "learning_rate": 8.18065774698321e-05, "loss": 0.0585, "step": 940 }, { "epoch": 0.705926481620405, "grad_norm": 0.16685016453266144, "learning_rate": 8.175602932789601e-05, "loss": 0.0584, "step": 941 }, { "epoch": 0.7066766691672918, "grad_norm": 0.1323295682668686, "learning_rate": 8.17054267274967e-05, "loss": 0.0785, "step": 942 }, { "epoch": 0.7074268567141786, "grad_norm": 0.29115551710128784, "learning_rate": 8.165476975541264e-05, "loss": 0.0696, "step": 943 }, { "epoch": 0.7081770442610653, "grad_norm": 0.16488604247570038, "learning_rate": 8.160405849851556e-05, "loss": 0.0558, "step": 944 }, { "epoch": 0.708927231807952, "grad_norm": 0.16088944673538208, "learning_rate": 8.155329304377025e-05, "loss": 0.0616, "step": 945 }, { "epoch": 0.7096774193548387, "grad_norm": 0.17655105888843536, "learning_rate": 8.150247347823448e-05, "loss": 0.0749, "step": 946 }, { "epoch": 0.7104276069017255, "grad_norm": 0.15837235748767853, "learning_rate": 8.145159988905879e-05, "loss": 0.081, "step": 947 }, { "epoch": 0.7111777944486122, "grad_norm": 0.18028263747692108, "learning_rate": 8.140067236348638e-05, "loss": 0.0801, "step": 948 }, { "epoch": 0.7119279819954989, "grad_norm": 0.20195040106773376, "learning_rate": 8.134969098885294e-05, "loss": 0.1123, "step": 949 }, { "epoch": 0.7126781695423856, "grad_norm": 0.16333803534507751, "learning_rate": 8.129865585258653e-05, "loss": 0.0804, "step": 950 }, { "epoch": 0.7134283570892723, "grad_norm": 0.15731093287467957, "learning_rate": 8.124756704220735e-05, "loss": 0.0696, "step": 951 }, { "epoch": 0.714178544636159, "grad_norm": 0.1764248162508011, "learning_rate": 8.11964246453277e-05, "loss": 0.0805, "step": 952 }, { "epoch": 0.7149287321830458, "grad_norm": 0.17285150289535522, "learning_rate": 8.114522874965174e-05, "loss": 0.074, "step": 953 }, { "epoch": 0.7156789197299325, "grad_norm": 0.22047369182109833, "learning_rate": 8.10939794429754e-05, "loss": 0.0731, "step": 954 }, { "epoch": 0.7164291072768192, "grad_norm": 0.20119711756706238, "learning_rate": 8.10426768131862e-05, "loss": 0.0794, "step": 955 }, { "epoch": 0.7171792948237059, "grad_norm": 0.10244588553905487, "learning_rate": 8.099132094826308e-05, "loss": 0.0612, "step": 956 }, { "epoch": 0.7179294823705926, "grad_norm": 0.14183086156845093, "learning_rate": 8.093991193627631e-05, "loss": 0.0849, "step": 957 }, { "epoch": 0.7186796699174793, "grad_norm": 0.13276663422584534, "learning_rate": 8.088844986538727e-05, "loss": 0.0511, "step": 958 }, { "epoch": 0.719429857464366, "grad_norm": 0.1904866099357605, "learning_rate": 8.083693482384836e-05, "loss": 0.0661, "step": 959 }, { "epoch": 0.7201800450112528, "grad_norm": 0.20430415868759155, "learning_rate": 8.078536690000278e-05, "loss": 0.0936, "step": 960 }, { "epoch": 0.7209302325581395, "grad_norm": 0.13531947135925293, "learning_rate": 8.073374618228445e-05, "loss": 0.0594, "step": 961 }, { "epoch": 0.7216804201050263, "grad_norm": 0.14323526620864868, "learning_rate": 8.068207275921782e-05, "loss": 0.0659, "step": 962 }, { "epoch": 0.722430607651913, "grad_norm": 0.12277992069721222, "learning_rate": 8.063034671941774e-05, "loss": 0.0493, "step": 963 }, { "epoch": 0.7231807951987997, "grad_norm": 0.19475075602531433, "learning_rate": 8.057856815158924e-05, "loss": 0.0737, "step": 964 }, { "epoch": 0.7239309827456865, "grad_norm": 0.11780892312526703, "learning_rate": 8.05267371445275e-05, "loss": 0.0576, "step": 965 }, { "epoch": 0.7246811702925732, "grad_norm": 0.13752034306526184, "learning_rate": 8.047485378711756e-05, "loss": 0.0412, "step": 966 }, { "epoch": 0.7254313578394599, "grad_norm": 0.1040201336145401, "learning_rate": 8.042291816833429e-05, "loss": 0.0415, "step": 967 }, { "epoch": 0.7261815453863466, "grad_norm": 0.22411030530929565, "learning_rate": 8.037093037724216e-05, "loss": 0.0966, "step": 968 }, { "epoch": 0.7269317329332333, "grad_norm": 0.1516868621110916, "learning_rate": 8.031889050299511e-05, "loss": 0.0643, "step": 969 }, { "epoch": 0.72768192048012, "grad_norm": 0.18635967373847961, "learning_rate": 8.02667986348364e-05, "loss": 0.1136, "step": 970 }, { "epoch": 0.7284321080270068, "grad_norm": 0.16178272664546967, "learning_rate": 8.021465486209846e-05, "loss": 0.0615, "step": 971 }, { "epoch": 0.7291822955738935, "grad_norm": 0.15400740504264832, "learning_rate": 8.016245927420272e-05, "loss": 0.083, "step": 972 }, { "epoch": 0.7299324831207802, "grad_norm": 0.1883528083562851, "learning_rate": 8.011021196065946e-05, "loss": 0.0876, "step": 973 }, { "epoch": 0.7306826706676669, "grad_norm": 0.19876441359519958, "learning_rate": 8.005791301106769e-05, "loss": 0.088, "step": 974 }, { "epoch": 0.7314328582145536, "grad_norm": 0.1295093595981598, "learning_rate": 8.000556251511498e-05, "loss": 0.062, "step": 975 }, { "epoch": 0.7321830457614403, "grad_norm": 0.11591675877571106, "learning_rate": 7.995316056257723e-05, "loss": 0.0481, "step": 976 }, { "epoch": 0.732933233308327, "grad_norm": 0.13371731340885162, "learning_rate": 7.990070724331866e-05, "loss": 0.0555, "step": 977 }, { "epoch": 0.7336834208552138, "grad_norm": 0.18042734265327454, "learning_rate": 7.984820264729156e-05, "loss": 0.0673, "step": 978 }, { "epoch": 0.7344336084021005, "grad_norm": 0.1305835098028183, "learning_rate": 7.979564686453612e-05, "loss": 0.0699, "step": 979 }, { "epoch": 0.7351837959489872, "grad_norm": 0.1295088827610016, "learning_rate": 7.974303998518031e-05, "loss": 0.0522, "step": 980 }, { "epoch": 0.735933983495874, "grad_norm": 0.16895820200443268, "learning_rate": 7.96903820994398e-05, "loss": 0.0591, "step": 981 }, { "epoch": 0.7366841710427607, "grad_norm": 0.22246110439300537, "learning_rate": 7.963767329761762e-05, "loss": 0.1312, "step": 982 }, { "epoch": 0.7374343585896475, "grad_norm": 0.1366637945175171, "learning_rate": 7.958491367010423e-05, "loss": 0.0702, "step": 983 }, { "epoch": 0.7381845461365342, "grad_norm": 0.13772538304328918, "learning_rate": 7.953210330737718e-05, "loss": 0.0679, "step": 984 }, { "epoch": 0.7389347336834209, "grad_norm": 0.16669201850891113, "learning_rate": 7.947924230000102e-05, "loss": 0.0571, "step": 985 }, { "epoch": 0.7396849212303076, "grad_norm": 0.1642238348722458, "learning_rate": 7.942633073862718e-05, "loss": 0.0709, "step": 986 }, { "epoch": 0.7404351087771943, "grad_norm": 0.22212821245193481, "learning_rate": 7.937336871399379e-05, "loss": 0.0779, "step": 987 }, { "epoch": 0.741185296324081, "grad_norm": 0.19373202323913574, "learning_rate": 7.932035631692549e-05, "loss": 0.074, "step": 988 }, { "epoch": 0.7419354838709677, "grad_norm": 0.19060571491718292, "learning_rate": 7.926729363833335e-05, "loss": 0.0745, "step": 989 }, { "epoch": 0.7426856714178545, "grad_norm": 0.19428344070911407, "learning_rate": 7.921418076921461e-05, "loss": 0.078, "step": 990 }, { "epoch": 0.7434358589647412, "grad_norm": 0.1412833333015442, "learning_rate": 7.916101780065263e-05, "loss": 0.0634, "step": 991 }, { "epoch": 0.7441860465116279, "grad_norm": 0.11580026149749756, "learning_rate": 7.910780482381665e-05, "loss": 0.0496, "step": 992 }, { "epoch": 0.7449362340585146, "grad_norm": 0.14477257430553436, "learning_rate": 7.905454192996169e-05, "loss": 0.066, "step": 993 }, { "epoch": 0.7456864216054013, "grad_norm": 0.19042307138442993, "learning_rate": 7.900122921042837e-05, "loss": 0.0673, "step": 994 }, { "epoch": 0.746436609152288, "grad_norm": 0.18369849026203156, "learning_rate": 7.894786675664277e-05, "loss": 0.0697, "step": 995 }, { "epoch": 0.7471867966991748, "grad_norm": 0.17455372214317322, "learning_rate": 7.88944546601162e-05, "loss": 0.0833, "step": 996 }, { "epoch": 0.7479369842460615, "grad_norm": 0.11367595195770264, "learning_rate": 7.884099301244519e-05, "loss": 0.0459, "step": 997 }, { "epoch": 0.7486871717929482, "grad_norm": 0.09311968833208084, "learning_rate": 7.878748190531118e-05, "loss": 0.0397, "step": 998 }, { "epoch": 0.7494373593398349, "grad_norm": 0.14789381623268127, "learning_rate": 7.873392143048047e-05, "loss": 0.061, "step": 999 }, { "epoch": 0.7501875468867217, "grad_norm": 0.18568088114261627, "learning_rate": 7.868031167980397e-05, "loss": 0.1057, "step": 1000 }, { "epoch": 0.7501875468867217, "eval_loss": 0.06887388974428177, "eval_runtime": 2.654, "eval_samples_per_second": 20.347, "eval_steps_per_second": 5.275, "step": 1000 }, { "epoch": 0.7509377344336085, "grad_norm": 0.1491386443376541, "learning_rate": 7.862665274521712e-05, "loss": 0.0619, "step": 1001 }, { "epoch": 0.7516879219804952, "grad_norm": 0.1528184562921524, "learning_rate": 7.857294471873975e-05, "loss": 0.0653, "step": 1002 }, { "epoch": 0.7524381095273819, "grad_norm": 0.16220131516456604, "learning_rate": 7.851918769247582e-05, "loss": 0.0813, "step": 1003 }, { "epoch": 0.7531882970742686, "grad_norm": 0.1285926252603531, "learning_rate": 7.846538175861332e-05, "loss": 0.048, "step": 1004 }, { "epoch": 0.7539384846211553, "grad_norm": 0.12812086939811707, "learning_rate": 7.841152700942413e-05, "loss": 0.0726, "step": 1005 }, { "epoch": 0.754688672168042, "grad_norm": 0.17180190980434418, "learning_rate": 7.835762353726386e-05, "loss": 0.0835, "step": 1006 }, { "epoch": 0.7554388597149287, "grad_norm": 0.18833868205547333, "learning_rate": 7.830367143457165e-05, "loss": 0.0819, "step": 1007 }, { "epoch": 0.7561890472618155, "grad_norm": 0.36495018005371094, "learning_rate": 7.824967079387002e-05, "loss": 0.0986, "step": 1008 }, { "epoch": 0.7569392348087022, "grad_norm": 0.1798129677772522, "learning_rate": 7.81956217077648e-05, "loss": 0.0735, "step": 1009 }, { "epoch": 0.7576894223555889, "grad_norm": 0.12408173084259033, "learning_rate": 7.814152426894478e-05, "loss": 0.0545, "step": 1010 }, { "epoch": 0.7584396099024756, "grad_norm": 0.1563502848148346, "learning_rate": 7.808737857018182e-05, "loss": 0.0817, "step": 1011 }, { "epoch": 0.7591897974493623, "grad_norm": 0.1812286674976349, "learning_rate": 7.803318470433042e-05, "loss": 0.0619, "step": 1012 }, { "epoch": 0.759939984996249, "grad_norm": 0.12106587737798691, "learning_rate": 7.797894276432772e-05, "loss": 0.0651, "step": 1013 }, { "epoch": 0.7606901725431358, "grad_norm": 0.13680601119995117, "learning_rate": 7.792465284319332e-05, "loss": 0.0692, "step": 1014 }, { "epoch": 0.7614403600900225, "grad_norm": 0.13508430123329163, "learning_rate": 7.787031503402907e-05, "loss": 0.0568, "step": 1015 }, { "epoch": 0.7621905476369092, "grad_norm": 0.17294204235076904, "learning_rate": 7.781592943001899e-05, "loss": 0.0633, "step": 1016 }, { "epoch": 0.7629407351837959, "grad_norm": 0.12395866960287094, "learning_rate": 7.776149612442899e-05, "loss": 0.0702, "step": 1017 }, { "epoch": 0.7636909227306826, "grad_norm": 0.13402384519577026, "learning_rate": 7.770701521060688e-05, "loss": 0.0554, "step": 1018 }, { "epoch": 0.7644411102775694, "grad_norm": 0.11290405690670013, "learning_rate": 7.765248678198203e-05, "loss": 0.0429, "step": 1019 }, { "epoch": 0.7651912978244562, "grad_norm": 0.12147333472967148, "learning_rate": 7.759791093206534e-05, "loss": 0.0487, "step": 1020 }, { "epoch": 0.7659414853713429, "grad_norm": 0.13817934691905975, "learning_rate": 7.754328775444903e-05, "loss": 0.0625, "step": 1021 }, { "epoch": 0.7666916729182296, "grad_norm": 0.19256222248077393, "learning_rate": 7.748861734280643e-05, "loss": 0.079, "step": 1022 }, { "epoch": 0.7674418604651163, "grad_norm": 0.1472809612751007, "learning_rate": 7.743389979089196e-05, "loss": 0.0509, "step": 1023 }, { "epoch": 0.768192048012003, "grad_norm": 0.1204993948340416, "learning_rate": 7.737913519254079e-05, "loss": 0.0549, "step": 1024 }, { "epoch": 0.7689422355588897, "grad_norm": 0.22745735943317413, "learning_rate": 7.732432364166884e-05, "loss": 0.1078, "step": 1025 }, { "epoch": 0.7696924231057765, "grad_norm": 0.26715755462646484, "learning_rate": 7.726946523227251e-05, "loss": 0.0708, "step": 1026 }, { "epoch": 0.7704426106526632, "grad_norm": 0.14105719327926636, "learning_rate": 7.721456005842861e-05, "loss": 0.0634, "step": 1027 }, { "epoch": 0.7711927981995499, "grad_norm": 0.13659711182117462, "learning_rate": 7.715960821429404e-05, "loss": 0.0417, "step": 1028 }, { "epoch": 0.7719429857464366, "grad_norm": 0.16884845495224, "learning_rate": 7.710460979410585e-05, "loss": 0.0621, "step": 1029 }, { "epoch": 0.7726931732933233, "grad_norm": 0.20827940106391907, "learning_rate": 7.704956489218091e-05, "loss": 0.072, "step": 1030 }, { "epoch": 0.77344336084021, "grad_norm": 0.19093617796897888, "learning_rate": 7.699447360291576e-05, "loss": 0.0896, "step": 1031 }, { "epoch": 0.7741935483870968, "grad_norm": 0.14019818603992462, "learning_rate": 7.69393360207866e-05, "loss": 0.0437, "step": 1032 }, { "epoch": 0.7749437359339835, "grad_norm": 0.14744336903095245, "learning_rate": 7.688415224034893e-05, "loss": 0.056, "step": 1033 }, { "epoch": 0.7756939234808702, "grad_norm": 0.13031582534313202, "learning_rate": 7.682892235623749e-05, "loss": 0.0438, "step": 1034 }, { "epoch": 0.7764441110277569, "grad_norm": 0.10287275165319443, "learning_rate": 7.67736464631661e-05, "loss": 0.045, "step": 1035 }, { "epoch": 0.7771942985746436, "grad_norm": 0.16352872550487518, "learning_rate": 7.671832465592746e-05, "loss": 0.0718, "step": 1036 }, { "epoch": 0.7779444861215303, "grad_norm": 0.1713312715291977, "learning_rate": 7.666295702939305e-05, "loss": 0.0713, "step": 1037 }, { "epoch": 0.7786946736684172, "grad_norm": 0.18730027973651886, "learning_rate": 7.660754367851286e-05, "loss": 0.0698, "step": 1038 }, { "epoch": 0.7794448612153039, "grad_norm": 0.1919287145137787, "learning_rate": 7.655208469831536e-05, "loss": 0.0805, "step": 1039 }, { "epoch": 0.7801950487621906, "grad_norm": 0.20845071971416473, "learning_rate": 7.649658018390725e-05, "loss": 0.0723, "step": 1040 }, { "epoch": 0.7809452363090773, "grad_norm": 0.10697642713785172, "learning_rate": 7.644103023047327e-05, "loss": 0.0501, "step": 1041 }, { "epoch": 0.781695423855964, "grad_norm": 0.14027732610702515, "learning_rate": 7.638543493327613e-05, "loss": 0.0436, "step": 1042 }, { "epoch": 0.7824456114028507, "grad_norm": 0.15785855054855347, "learning_rate": 7.63297943876563e-05, "loss": 0.0549, "step": 1043 }, { "epoch": 0.7831957989497375, "grad_norm": 0.2016582190990448, "learning_rate": 7.627410868903184e-05, "loss": 0.07, "step": 1044 }, { "epoch": 0.7839459864966242, "grad_norm": 0.1890334188938141, "learning_rate": 7.621837793289824e-05, "loss": 0.0719, "step": 1045 }, { "epoch": 0.7846961740435109, "grad_norm": 0.1692233830690384, "learning_rate": 7.616260221482825e-05, "loss": 0.0546, "step": 1046 }, { "epoch": 0.7854463615903976, "grad_norm": 0.20178622007369995, "learning_rate": 7.610678163047174e-05, "loss": 0.0839, "step": 1047 }, { "epoch": 0.7861965491372843, "grad_norm": 0.12947288155555725, "learning_rate": 7.60509162755555e-05, "loss": 0.0592, "step": 1048 }, { "epoch": 0.786946736684171, "grad_norm": 0.1551869809627533, "learning_rate": 7.599500624588314e-05, "loss": 0.0643, "step": 1049 }, { "epoch": 0.7876969242310577, "grad_norm": 0.18167535960674286, "learning_rate": 7.593905163733484e-05, "loss": 0.0823, "step": 1050 }, { "epoch": 0.7884471117779445, "grad_norm": 0.18511007726192474, "learning_rate": 7.588305254586724e-05, "loss": 0.0805, "step": 1051 }, { "epoch": 0.7891972993248312, "grad_norm": 0.1279374212026596, "learning_rate": 7.582700906751325e-05, "loss": 0.0564, "step": 1052 }, { "epoch": 0.7899474868717179, "grad_norm": 0.1779521256685257, "learning_rate": 7.577092129838197e-05, "loss": 0.0659, "step": 1053 }, { "epoch": 0.7906976744186046, "grad_norm": 0.1136535033583641, "learning_rate": 7.571478933465836e-05, "loss": 0.0548, "step": 1054 }, { "epoch": 0.7914478619654913, "grad_norm": 0.17618438601493835, "learning_rate": 7.565861327260322e-05, "loss": 0.0725, "step": 1055 }, { "epoch": 0.792198049512378, "grad_norm": 0.23887811601161957, "learning_rate": 7.560239320855296e-05, "loss": 0.0956, "step": 1056 }, { "epoch": 0.7929482370592649, "grad_norm": 0.24576206505298615, "learning_rate": 7.554612923891946e-05, "loss": 0.1265, "step": 1057 }, { "epoch": 0.7936984246061516, "grad_norm": 0.17981268465518951, "learning_rate": 7.548982146018988e-05, "loss": 0.0779, "step": 1058 }, { "epoch": 0.7944486121530383, "grad_norm": 0.14327049255371094, "learning_rate": 7.543346996892654e-05, "loss": 0.0781, "step": 1059 }, { "epoch": 0.795198799699925, "grad_norm": 0.17094387114048004, "learning_rate": 7.537707486176667e-05, "loss": 0.0614, "step": 1060 }, { "epoch": 0.7959489872468117, "grad_norm": 0.10642191767692566, "learning_rate": 7.532063623542231e-05, "loss": 0.0572, "step": 1061 }, { "epoch": 0.7966991747936985, "grad_norm": 0.13238154351711273, "learning_rate": 7.52641541866802e-05, "loss": 0.0453, "step": 1062 }, { "epoch": 0.7974493623405852, "grad_norm": 0.19573621451854706, "learning_rate": 7.520762881240147e-05, "loss": 0.1012, "step": 1063 }, { "epoch": 0.7981995498874719, "grad_norm": 0.1620679497718811, "learning_rate": 7.515106020952156e-05, "loss": 0.0721, "step": 1064 }, { "epoch": 0.7989497374343586, "grad_norm": 0.1418398767709732, "learning_rate": 7.509444847505005e-05, "loss": 0.0822, "step": 1065 }, { "epoch": 0.7996999249812453, "grad_norm": 0.14470888674259186, "learning_rate": 7.503779370607049e-05, "loss": 0.0582, "step": 1066 }, { "epoch": 0.800450112528132, "grad_norm": 0.12305848300457001, "learning_rate": 7.498109599974024e-05, "loss": 0.0615, "step": 1067 }, { "epoch": 0.8012003000750187, "grad_norm": 0.12674108147621155, "learning_rate": 7.49243554532903e-05, "loss": 0.0693, "step": 1068 }, { "epoch": 0.8019504876219055, "grad_norm": 0.14826877415180206, "learning_rate": 7.486757216402509e-05, "loss": 0.0548, "step": 1069 }, { "epoch": 0.8027006751687922, "grad_norm": 0.12875312566757202, "learning_rate": 7.481074622932236e-05, "loss": 0.0556, "step": 1070 }, { "epoch": 0.8034508627156789, "grad_norm": 0.15808606147766113, "learning_rate": 7.475387774663302e-05, "loss": 0.0498, "step": 1071 }, { "epoch": 0.8042010502625656, "grad_norm": 0.11491523683071136, "learning_rate": 7.469696681348088e-05, "loss": 0.0487, "step": 1072 }, { "epoch": 0.8049512378094523, "grad_norm": 0.20159392058849335, "learning_rate": 7.464001352746263e-05, "loss": 0.1028, "step": 1073 }, { "epoch": 0.805701425356339, "grad_norm": 0.16383297741413116, "learning_rate": 7.45830179862475e-05, "loss": 0.0681, "step": 1074 }, { "epoch": 0.8064516129032258, "grad_norm": 0.12068310379981995, "learning_rate": 7.452598028757729e-05, "loss": 0.0816, "step": 1075 }, { "epoch": 0.8072018004501126, "grad_norm": 0.2644040584564209, "learning_rate": 7.446890052926598e-05, "loss": 0.0731, "step": 1076 }, { "epoch": 0.8079519879969993, "grad_norm": 0.13366550207138062, "learning_rate": 7.441177880919976e-05, "loss": 0.0596, "step": 1077 }, { "epoch": 0.808702175543886, "grad_norm": 0.22289660573005676, "learning_rate": 7.435461522533674e-05, "loss": 0.0567, "step": 1078 }, { "epoch": 0.8094523630907727, "grad_norm": 0.13923311233520508, "learning_rate": 7.429740987570686e-05, "loss": 0.0651, "step": 1079 }, { "epoch": 0.8102025506376594, "grad_norm": 0.15409795939922333, "learning_rate": 7.424016285841165e-05, "loss": 0.0556, "step": 1080 }, { "epoch": 0.8109527381845462, "grad_norm": 0.1327374130487442, "learning_rate": 7.41828742716241e-05, "loss": 0.0611, "step": 1081 }, { "epoch": 0.8117029257314329, "grad_norm": 0.16146451234817505, "learning_rate": 7.41255442135885e-05, "loss": 0.0681, "step": 1082 }, { "epoch": 0.8124531132783196, "grad_norm": 0.22603389620780945, "learning_rate": 7.406817278262027e-05, "loss": 0.0843, "step": 1083 }, { "epoch": 0.8132033008252063, "grad_norm": 0.11806265264749527, "learning_rate": 7.401076007710575e-05, "loss": 0.0535, "step": 1084 }, { "epoch": 0.813953488372093, "grad_norm": 0.1646219938993454, "learning_rate": 7.395330619550207e-05, "loss": 0.0631, "step": 1085 }, { "epoch": 0.8147036759189797, "grad_norm": 0.22715872526168823, "learning_rate": 7.3895811236337e-05, "loss": 0.0812, "step": 1086 }, { "epoch": 0.8154538634658665, "grad_norm": 0.24135173857212067, "learning_rate": 7.38382752982087e-05, "loss": 0.0916, "step": 1087 }, { "epoch": 0.8162040510127532, "grad_norm": 0.22600874304771423, "learning_rate": 7.378069847978568e-05, "loss": 0.0794, "step": 1088 }, { "epoch": 0.8169542385596399, "grad_norm": 0.21018345654010773, "learning_rate": 7.372308087980647e-05, "loss": 0.0811, "step": 1089 }, { "epoch": 0.8177044261065266, "grad_norm": 0.17604215443134308, "learning_rate": 7.366542259707962e-05, "loss": 0.058, "step": 1090 }, { "epoch": 0.8184546136534133, "grad_norm": 0.1911773532629013, "learning_rate": 7.360772373048338e-05, "loss": 0.0716, "step": 1091 }, { "epoch": 0.8192048012003, "grad_norm": 0.15144562721252441, "learning_rate": 7.354998437896565e-05, "loss": 0.058, "step": 1092 }, { "epoch": 0.8199549887471868, "grad_norm": 0.16863825917243958, "learning_rate": 7.349220464154371e-05, "loss": 0.0696, "step": 1093 }, { "epoch": 0.8207051762940735, "grad_norm": 0.11625709384679794, "learning_rate": 7.343438461730411e-05, "loss": 0.0497, "step": 1094 }, { "epoch": 0.8214553638409603, "grad_norm": 0.19467462599277496, "learning_rate": 7.337652440540252e-05, "loss": 0.0994, "step": 1095 }, { "epoch": 0.822205551387847, "grad_norm": 0.24418720602989197, "learning_rate": 7.331862410506353e-05, "loss": 0.0856, "step": 1096 }, { "epoch": 0.8229557389347337, "grad_norm": 0.14274224638938904, "learning_rate": 7.32606838155804e-05, "loss": 0.0516, "step": 1097 }, { "epoch": 0.8237059264816204, "grad_norm": 0.14197169244289398, "learning_rate": 7.320270363631505e-05, "loss": 0.0686, "step": 1098 }, { "epoch": 0.8244561140285072, "grad_norm": 0.17585918307304382, "learning_rate": 7.314468366669777e-05, "loss": 0.0778, "step": 1099 }, { "epoch": 0.8252063015753939, "grad_norm": 0.1533975601196289, "learning_rate": 7.30866240062271e-05, "loss": 0.0554, "step": 1100 }, { "epoch": 0.8259564891222806, "grad_norm": 0.21065160632133484, "learning_rate": 7.302852475446963e-05, "loss": 0.0919, "step": 1101 }, { "epoch": 0.8267066766691673, "grad_norm": 0.1751699149608612, "learning_rate": 7.297038601105988e-05, "loss": 0.0765, "step": 1102 }, { "epoch": 0.827456864216054, "grad_norm": 0.16724807024002075, "learning_rate": 7.291220787570005e-05, "loss": 0.0703, "step": 1103 }, { "epoch": 0.8282070517629407, "grad_norm": 0.16113238036632538, "learning_rate": 7.28539904481599e-05, "loss": 0.0718, "step": 1104 }, { "epoch": 0.8289572393098275, "grad_norm": 0.12530259788036346, "learning_rate": 7.279573382827662e-05, "loss": 0.0498, "step": 1105 }, { "epoch": 0.8297074268567142, "grad_norm": 0.13468191027641296, "learning_rate": 7.273743811595454e-05, "loss": 0.0514, "step": 1106 }, { "epoch": 0.8304576144036009, "grad_norm": 0.13877612352371216, "learning_rate": 7.267910341116512e-05, "loss": 0.0711, "step": 1107 }, { "epoch": 0.8312078019504876, "grad_norm": 0.11012528836727142, "learning_rate": 7.262072981394656e-05, "loss": 0.0458, "step": 1108 }, { "epoch": 0.8319579894973743, "grad_norm": 0.22262486815452576, "learning_rate": 7.256231742440389e-05, "loss": 0.1182, "step": 1109 }, { "epoch": 0.832708177044261, "grad_norm": 0.15494659543037415, "learning_rate": 7.25038663427086e-05, "loss": 0.0766, "step": 1110 }, { "epoch": 0.8334583645911477, "grad_norm": 0.14093343913555145, "learning_rate": 7.24453766690985e-05, "loss": 0.056, "step": 1111 }, { "epoch": 0.8342085521380345, "grad_norm": 0.15565334260463715, "learning_rate": 7.238684850387765e-05, "loss": 0.0565, "step": 1112 }, { "epoch": 0.8349587396849212, "grad_norm": 0.13291604816913605, "learning_rate": 7.232828194741611e-05, "loss": 0.0615, "step": 1113 }, { "epoch": 0.8357089272318079, "grad_norm": 0.1696081906557083, "learning_rate": 7.226967710014971e-05, "loss": 0.0601, "step": 1114 }, { "epoch": 0.8364591147786947, "grad_norm": 0.1792924702167511, "learning_rate": 7.221103406258003e-05, "loss": 0.0558, "step": 1115 }, { "epoch": 0.8372093023255814, "grad_norm": 0.11232032626867294, "learning_rate": 7.215235293527409e-05, "loss": 0.0473, "step": 1116 }, { "epoch": 0.8379594898724682, "grad_norm": 0.15998299419879913, "learning_rate": 7.209363381886423e-05, "loss": 0.0799, "step": 1117 }, { "epoch": 0.8387096774193549, "grad_norm": 0.12312288582324982, "learning_rate": 7.203487681404798e-05, "loss": 0.0385, "step": 1118 }, { "epoch": 0.8394598649662416, "grad_norm": 0.14762242138385773, "learning_rate": 7.19760820215878e-05, "loss": 0.0788, "step": 1119 }, { "epoch": 0.8402100525131283, "grad_norm": 0.15391835570335388, "learning_rate": 7.191724954231098e-05, "loss": 0.0556, "step": 1120 }, { "epoch": 0.840960240060015, "grad_norm": 0.1032281443476677, "learning_rate": 7.185837947710943e-05, "loss": 0.0601, "step": 1121 }, { "epoch": 0.8417104276069017, "grad_norm": 0.11759169399738312, "learning_rate": 7.17994719269395e-05, "loss": 0.0548, "step": 1122 }, { "epoch": 0.8424606151537885, "grad_norm": 0.15175266563892365, "learning_rate": 7.174052699282183e-05, "loss": 0.0829, "step": 1123 }, { "epoch": 0.8432108027006752, "grad_norm": 0.26938396692276, "learning_rate": 7.168154477584123e-05, "loss": 0.077, "step": 1124 }, { "epoch": 0.8439609902475619, "grad_norm": 0.13639144599437714, "learning_rate": 7.162252537714633e-05, "loss": 0.0516, "step": 1125 }, { "epoch": 0.8447111777944486, "grad_norm": 0.14507851004600525, "learning_rate": 7.156346889794962e-05, "loss": 0.0916, "step": 1126 }, { "epoch": 0.8454613653413353, "grad_norm": 0.21296165883541107, "learning_rate": 7.150437543952715e-05, "loss": 0.0742, "step": 1127 }, { "epoch": 0.846211552888222, "grad_norm": 0.09235338866710663, "learning_rate": 7.144524510321837e-05, "loss": 0.0501, "step": 1128 }, { "epoch": 0.8469617404351087, "grad_norm": 0.19731169939041138, "learning_rate": 7.138607799042598e-05, "loss": 0.0772, "step": 1129 }, { "epoch": 0.8477119279819955, "grad_norm": 0.1795894056558609, "learning_rate": 7.132687420261576e-05, "loss": 0.0757, "step": 1130 }, { "epoch": 0.8484621155288822, "grad_norm": 0.14649488031864166, "learning_rate": 7.126763384131638e-05, "loss": 0.055, "step": 1131 }, { "epoch": 0.8492123030757689, "grad_norm": 0.12544019520282745, "learning_rate": 7.120835700811923e-05, "loss": 0.0734, "step": 1132 }, { "epoch": 0.8499624906226556, "grad_norm": 0.1415441930294037, "learning_rate": 7.114904380467823e-05, "loss": 0.0542, "step": 1133 }, { "epoch": 0.8507126781695424, "grad_norm": 0.12028104066848755, "learning_rate": 7.108969433270968e-05, "loss": 0.0491, "step": 1134 }, { "epoch": 0.8514628657164292, "grad_norm": 0.12645025551319122, "learning_rate": 7.10303086939921e-05, "loss": 0.0677, "step": 1135 }, { "epoch": 0.8522130532633159, "grad_norm": 0.12939628958702087, "learning_rate": 7.097088699036598e-05, "loss": 0.089, "step": 1136 }, { "epoch": 0.8529632408102026, "grad_norm": 0.14228011667728424, "learning_rate": 7.091142932373371e-05, "loss": 0.066, "step": 1137 }, { "epoch": 0.8537134283570893, "grad_norm": 0.14560295641422272, "learning_rate": 7.085193579605935e-05, "loss": 0.0475, "step": 1138 }, { "epoch": 0.854463615903976, "grad_norm": 0.09960725158452988, "learning_rate": 7.079240650936843e-05, "loss": 0.0587, "step": 1139 }, { "epoch": 0.8552138034508627, "grad_norm": 0.13683639466762543, "learning_rate": 7.07328415657478e-05, "loss": 0.0502, "step": 1140 }, { "epoch": 0.8559639909977494, "grad_norm": 0.17216654121875763, "learning_rate": 7.067324106734548e-05, "loss": 0.0559, "step": 1141 }, { "epoch": 0.8567141785446362, "grad_norm": 0.16215908527374268, "learning_rate": 7.061360511637045e-05, "loss": 0.0453, "step": 1142 }, { "epoch": 0.8574643660915229, "grad_norm": 0.13577274978160858, "learning_rate": 7.055393381509253e-05, "loss": 0.0711, "step": 1143 }, { "epoch": 0.8582145536384096, "grad_norm": 0.14832454919815063, "learning_rate": 7.049422726584206e-05, "loss": 0.0644, "step": 1144 }, { "epoch": 0.8589647411852963, "grad_norm": 0.17870298027992249, "learning_rate": 7.043448557100995e-05, "loss": 0.0892, "step": 1145 }, { "epoch": 0.859714928732183, "grad_norm": 0.13199222087860107, "learning_rate": 7.037470883304731e-05, "loss": 0.0417, "step": 1146 }, { "epoch": 0.8604651162790697, "grad_norm": 0.11214780807495117, "learning_rate": 7.031489715446535e-05, "loss": 0.0474, "step": 1147 }, { "epoch": 0.8612153038259565, "grad_norm": 0.179098442196846, "learning_rate": 7.02550506378352e-05, "loss": 0.0788, "step": 1148 }, { "epoch": 0.8619654913728432, "grad_norm": 0.1804032027721405, "learning_rate": 7.019516938578777e-05, "loss": 0.0571, "step": 1149 }, { "epoch": 0.8627156789197299, "grad_norm": 0.1622643768787384, "learning_rate": 7.013525350101348e-05, "loss": 0.0626, "step": 1150 }, { "epoch": 0.8634658664666166, "grad_norm": 0.12790507078170776, "learning_rate": 7.00753030862622e-05, "loss": 0.0634, "step": 1151 }, { "epoch": 0.8642160540135033, "grad_norm": 0.11260277032852173, "learning_rate": 7.001531824434299e-05, "loss": 0.0523, "step": 1152 }, { "epoch": 0.8649662415603901, "grad_norm": 0.1972770243883133, "learning_rate": 6.995529907812391e-05, "loss": 0.0786, "step": 1153 }, { "epoch": 0.8657164291072769, "grad_norm": 0.14673151075839996, "learning_rate": 6.989524569053196e-05, "loss": 0.0658, "step": 1154 }, { "epoch": 0.8664666166541636, "grad_norm": 0.1285582333803177, "learning_rate": 6.983515818455275e-05, "loss": 0.0459, "step": 1155 }, { "epoch": 0.8672168042010503, "grad_norm": 0.1427638828754425, "learning_rate": 6.977503666323048e-05, "loss": 0.0809, "step": 1156 }, { "epoch": 0.867966991747937, "grad_norm": 0.19241690635681152, "learning_rate": 6.971488122966758e-05, "loss": 0.084, "step": 1157 }, { "epoch": 0.8687171792948237, "grad_norm": 0.1290270984172821, "learning_rate": 6.965469198702475e-05, "loss": 0.0541, "step": 1158 }, { "epoch": 0.8694673668417104, "grad_norm": 0.12479504197835922, "learning_rate": 6.95944690385206e-05, "loss": 0.0617, "step": 1159 }, { "epoch": 0.8702175543885972, "grad_norm": 0.157056525349617, "learning_rate": 6.953421248743154e-05, "loss": 0.0871, "step": 1160 }, { "epoch": 0.8709677419354839, "grad_norm": 0.14428332448005676, "learning_rate": 6.947392243709163e-05, "loss": 0.0512, "step": 1161 }, { "epoch": 0.8717179294823706, "grad_norm": 0.13996249437332153, "learning_rate": 6.941359899089238e-05, "loss": 0.0559, "step": 1162 }, { "epoch": 0.8724681170292573, "grad_norm": 0.17473511397838593, "learning_rate": 6.935324225228254e-05, "loss": 0.0899, "step": 1163 }, { "epoch": 0.873218304576144, "grad_norm": 0.17250919342041016, "learning_rate": 6.929285232476797e-05, "loss": 0.0793, "step": 1164 }, { "epoch": 0.8739684921230307, "grad_norm": 0.16943222284317017, "learning_rate": 6.923242931191148e-05, "loss": 0.0668, "step": 1165 }, { "epoch": 0.8747186796699175, "grad_norm": 0.175174281001091, "learning_rate": 6.917197331733257e-05, "loss": 0.0795, "step": 1166 }, { "epoch": 0.8754688672168042, "grad_norm": 0.16245006024837494, "learning_rate": 6.911148444470731e-05, "loss": 0.0641, "step": 1167 }, { "epoch": 0.8762190547636909, "grad_norm": 0.15257513523101807, "learning_rate": 6.905096279776819e-05, "loss": 0.0574, "step": 1168 }, { "epoch": 0.8769692423105776, "grad_norm": 0.10353401303291321, "learning_rate": 6.899040848030384e-05, "loss": 0.0571, "step": 1169 }, { "epoch": 0.8777194298574643, "grad_norm": 0.11673977226018906, "learning_rate": 6.892982159615895e-05, "loss": 0.076, "step": 1170 }, { "epoch": 0.878469617404351, "grad_norm": 0.17134547233581543, "learning_rate": 6.886920224923408e-05, "loss": 0.0907, "step": 1171 }, { "epoch": 0.8792198049512379, "grad_norm": 0.10142813622951508, "learning_rate": 6.880855054348543e-05, "loss": 0.0355, "step": 1172 }, { "epoch": 0.8799699924981246, "grad_norm": 0.12058088928461075, "learning_rate": 6.874786658292472e-05, "loss": 0.0499, "step": 1173 }, { "epoch": 0.8807201800450113, "grad_norm": 0.14051474630832672, "learning_rate": 6.868715047161896e-05, "loss": 0.0645, "step": 1174 }, { "epoch": 0.881470367591898, "grad_norm": 0.20736153423786163, "learning_rate": 6.862640231369029e-05, "loss": 0.0882, "step": 1175 }, { "epoch": 0.8822205551387847, "grad_norm": 0.13780175149440765, "learning_rate": 6.856562221331581e-05, "loss": 0.0587, "step": 1176 }, { "epoch": 0.8829707426856714, "grad_norm": 0.1332911252975464, "learning_rate": 6.850481027472743e-05, "loss": 0.071, "step": 1177 }, { "epoch": 0.8837209302325582, "grad_norm": 0.09639962762594223, "learning_rate": 6.844396660221164e-05, "loss": 0.0465, "step": 1178 }, { "epoch": 0.8844711177794449, "grad_norm": 0.124673031270504, "learning_rate": 6.838309130010933e-05, "loss": 0.0518, "step": 1179 }, { "epoch": 0.8852213053263316, "grad_norm": 0.12697821855545044, "learning_rate": 6.832218447281566e-05, "loss": 0.0516, "step": 1180 }, { "epoch": 0.8859714928732183, "grad_norm": 0.12598365545272827, "learning_rate": 6.826124622477981e-05, "loss": 0.072, "step": 1181 }, { "epoch": 0.886721680420105, "grad_norm": 0.18293868005275726, "learning_rate": 6.820027666050493e-05, "loss": 0.0527, "step": 1182 }, { "epoch": 0.8874718679669917, "grad_norm": 0.18333552777767181, "learning_rate": 6.813927588454778e-05, "loss": 0.0624, "step": 1183 }, { "epoch": 0.8882220555138785, "grad_norm": 0.14375266432762146, "learning_rate": 6.80782440015187e-05, "loss": 0.0648, "step": 1184 }, { "epoch": 0.8889722430607652, "grad_norm": 0.16927683353424072, "learning_rate": 6.801718111608133e-05, "loss": 0.0919, "step": 1185 }, { "epoch": 0.8897224306076519, "grad_norm": 0.1668524146080017, "learning_rate": 6.795608733295254e-05, "loss": 0.0633, "step": 1186 }, { "epoch": 0.8904726181545386, "grad_norm": 0.16813072562217712, "learning_rate": 6.789496275690215e-05, "loss": 0.0599, "step": 1187 }, { "epoch": 0.8912228057014253, "grad_norm": 0.1353646218776703, "learning_rate": 6.783380749275277e-05, "loss": 0.0529, "step": 1188 }, { "epoch": 0.891972993248312, "grad_norm": 0.12498775124549866, "learning_rate": 6.777262164537966e-05, "loss": 0.0537, "step": 1189 }, { "epoch": 0.8927231807951987, "grad_norm": 0.13221831619739532, "learning_rate": 6.771140531971054e-05, "loss": 0.0529, "step": 1190 }, { "epoch": 0.8934733683420856, "grad_norm": 0.16475389897823334, "learning_rate": 6.765015862072536e-05, "loss": 0.0509, "step": 1191 }, { "epoch": 0.8942235558889723, "grad_norm": 0.12393538653850555, "learning_rate": 6.758888165345619e-05, "loss": 0.048, "step": 1192 }, { "epoch": 0.894973743435859, "grad_norm": 0.1635979413986206, "learning_rate": 6.7527574522987e-05, "loss": 0.0661, "step": 1193 }, { "epoch": 0.8957239309827457, "grad_norm": 0.18363012373447418, "learning_rate": 6.746623733445346e-05, "loss": 0.0491, "step": 1194 }, { "epoch": 0.8964741185296324, "grad_norm": 0.11310746520757675, "learning_rate": 6.740487019304282e-05, "loss": 0.034, "step": 1195 }, { "epoch": 0.8972243060765192, "grad_norm": 0.09733211249113083, "learning_rate": 6.734347320399369e-05, "loss": 0.0337, "step": 1196 }, { "epoch": 0.8979744936234059, "grad_norm": 0.13894496858119965, "learning_rate": 6.728204647259586e-05, "loss": 0.0884, "step": 1197 }, { "epoch": 0.8987246811702926, "grad_norm": 0.11892880499362946, "learning_rate": 6.722059010419013e-05, "loss": 0.0565, "step": 1198 }, { "epoch": 0.8994748687171793, "grad_norm": 0.1761048585176468, "learning_rate": 6.715910420416809e-05, "loss": 0.0649, "step": 1199 }, { "epoch": 0.900225056264066, "grad_norm": 0.14857003092765808, "learning_rate": 6.709758887797205e-05, "loss": 0.0632, "step": 1200 }, { "epoch": 0.900225056264066, "eval_loss": 0.06695682555437088, "eval_runtime": 2.6653, "eval_samples_per_second": 20.26, "eval_steps_per_second": 5.253, "step": 1200 }, { "epoch": 0.9009752438109527, "grad_norm": 0.16598834097385406, "learning_rate": 6.703604423109468e-05, "loss": 0.0695, "step": 1201 }, { "epoch": 0.9017254313578394, "grad_norm": 0.13511797785758972, "learning_rate": 6.697447036907904e-05, "loss": 0.0508, "step": 1202 }, { "epoch": 0.9024756189047262, "grad_norm": 0.08580167591571808, "learning_rate": 6.691286739751824e-05, "loss": 0.0356, "step": 1203 }, { "epoch": 0.9032258064516129, "grad_norm": 0.12339236587285995, "learning_rate": 6.685123542205526e-05, "loss": 0.0414, "step": 1204 }, { "epoch": 0.9039759939984996, "grad_norm": 0.15861302614212036, "learning_rate": 6.678957454838292e-05, "loss": 0.0902, "step": 1205 }, { "epoch": 0.9047261815453863, "grad_norm": 0.15118162333965302, "learning_rate": 6.672788488224352e-05, "loss": 0.0665, "step": 1206 }, { "epoch": 0.905476369092273, "grad_norm": 0.17424209415912628, "learning_rate": 6.666616652942878e-05, "loss": 0.057, "step": 1207 }, { "epoch": 0.9062265566391597, "grad_norm": 0.1301327645778656, "learning_rate": 6.660441959577958e-05, "loss": 0.0744, "step": 1208 }, { "epoch": 0.9069767441860465, "grad_norm": 0.157297745347023, "learning_rate": 6.654264418718584e-05, "loss": 0.0746, "step": 1209 }, { "epoch": 0.9077269317329333, "grad_norm": 0.15477484464645386, "learning_rate": 6.64808404095863e-05, "loss": 0.069, "step": 1210 }, { "epoch": 0.90847711927982, "grad_norm": 0.20592740178108215, "learning_rate": 6.641900836896835e-05, "loss": 0.0669, "step": 1211 }, { "epoch": 0.9092273068267067, "grad_norm": 0.15215907990932465, "learning_rate": 6.635714817136785e-05, "loss": 0.0547, "step": 1212 }, { "epoch": 0.9099774943735934, "grad_norm": 0.13565507531166077, "learning_rate": 6.629525992286898e-05, "loss": 0.0609, "step": 1213 }, { "epoch": 0.9107276819204801, "grad_norm": 0.15176476538181305, "learning_rate": 6.623334372960393e-05, "loss": 0.0746, "step": 1214 }, { "epoch": 0.9114778694673669, "grad_norm": 0.14856791496276855, "learning_rate": 6.617139969775292e-05, "loss": 0.0648, "step": 1215 }, { "epoch": 0.9122280570142536, "grad_norm": 0.17555858194828033, "learning_rate": 6.610942793354387e-05, "loss": 0.0796, "step": 1216 }, { "epoch": 0.9129782445611403, "grad_norm": 0.1357504427433014, "learning_rate": 6.604742854325222e-05, "loss": 0.0612, "step": 1217 }, { "epoch": 0.913728432108027, "grad_norm": 0.14152507483959198, "learning_rate": 6.598540163320084e-05, "loss": 0.0517, "step": 1218 }, { "epoch": 0.9144786196549137, "grad_norm": 0.1403784304857254, "learning_rate": 6.592334730975975e-05, "loss": 0.0838, "step": 1219 }, { "epoch": 0.9152288072018004, "grad_norm": 0.1756121814250946, "learning_rate": 6.586126567934605e-05, "loss": 0.0949, "step": 1220 }, { "epoch": 0.9159789947486872, "grad_norm": 0.12650136649608612, "learning_rate": 6.57991568484236e-05, "loss": 0.0559, "step": 1221 }, { "epoch": 0.9167291822955739, "grad_norm": 0.11986692994832993, "learning_rate": 6.573702092350292e-05, "loss": 0.0527, "step": 1222 }, { "epoch": 0.9174793698424606, "grad_norm": 0.16555757820606232, "learning_rate": 6.567485801114099e-05, "loss": 0.0614, "step": 1223 }, { "epoch": 0.9182295573893473, "grad_norm": 0.08092650026082993, "learning_rate": 6.561266821794111e-05, "loss": 0.0388, "step": 1224 }, { "epoch": 0.918979744936234, "grad_norm": 0.1932453215122223, "learning_rate": 6.555045165055263e-05, "loss": 0.0803, "step": 1225 }, { "epoch": 0.9197299324831207, "grad_norm": 0.16399742662906647, "learning_rate": 6.548820841567086e-05, "loss": 0.0747, "step": 1226 }, { "epoch": 0.9204801200300075, "grad_norm": 0.13225746154785156, "learning_rate": 6.54259386200368e-05, "loss": 0.0602, "step": 1227 }, { "epoch": 0.9212303075768942, "grad_norm": 0.11068938672542572, "learning_rate": 6.536364237043703e-05, "loss": 0.0504, "step": 1228 }, { "epoch": 0.921980495123781, "grad_norm": 0.1256033480167389, "learning_rate": 6.530131977370348e-05, "loss": 0.05, "step": 1229 }, { "epoch": 0.9227306826706677, "grad_norm": 0.13208834826946259, "learning_rate": 6.523897093671326e-05, "loss": 0.0513, "step": 1230 }, { "epoch": 0.9234808702175544, "grad_norm": 0.12550483644008636, "learning_rate": 6.51765959663885e-05, "loss": 0.0597, "step": 1231 }, { "epoch": 0.9242310577644411, "grad_norm": 0.19324685633182526, "learning_rate": 6.511419496969612e-05, "loss": 0.0752, "step": 1232 }, { "epoch": 0.9249812453113279, "grad_norm": 0.1724669337272644, "learning_rate": 6.505176805364767e-05, "loss": 0.0825, "step": 1233 }, { "epoch": 0.9257314328582146, "grad_norm": 0.10835179686546326, "learning_rate": 6.498931532529921e-05, "loss": 0.0633, "step": 1234 }, { "epoch": 0.9264816204051013, "grad_norm": 0.15429191291332245, "learning_rate": 6.492683689175098e-05, "loss": 0.0492, "step": 1235 }, { "epoch": 0.927231807951988, "grad_norm": 0.1609717160463333, "learning_rate": 6.486433286014734e-05, "loss": 0.0638, "step": 1236 }, { "epoch": 0.9279819954988747, "grad_norm": 0.17381221055984497, "learning_rate": 6.480180333767658e-05, "loss": 0.0784, "step": 1237 }, { "epoch": 0.9287321830457614, "grad_norm": 0.14453348517417908, "learning_rate": 6.473924843157065e-05, "loss": 0.0557, "step": 1238 }, { "epoch": 0.9294823705926482, "grad_norm": 0.12073624134063721, "learning_rate": 6.467666824910505e-05, "loss": 0.0625, "step": 1239 }, { "epoch": 0.9302325581395349, "grad_norm": 0.16522017121315002, "learning_rate": 6.461406289759862e-05, "loss": 0.0746, "step": 1240 }, { "epoch": 0.9309827456864216, "grad_norm": 0.17048360407352448, "learning_rate": 6.455143248441342e-05, "loss": 0.0855, "step": 1241 }, { "epoch": 0.9317329332333083, "grad_norm": 0.12252648919820786, "learning_rate": 6.44887771169544e-05, "loss": 0.0524, "step": 1242 }, { "epoch": 0.932483120780195, "grad_norm": 0.14796991646289825, "learning_rate": 6.442609690266937e-05, "loss": 0.068, "step": 1243 }, { "epoch": 0.9332333083270817, "grad_norm": 0.1838654726743698, "learning_rate": 6.436339194904872e-05, "loss": 0.0478, "step": 1244 }, { "epoch": 0.9339834958739685, "grad_norm": 0.12232690304517746, "learning_rate": 6.430066236362524e-05, "loss": 0.0537, "step": 1245 }, { "epoch": 0.9347336834208552, "grad_norm": 0.18929807841777802, "learning_rate": 6.423790825397404e-05, "loss": 0.0735, "step": 1246 }, { "epoch": 0.9354838709677419, "grad_norm": 0.12835638225078583, "learning_rate": 6.417512972771219e-05, "loss": 0.0472, "step": 1247 }, { "epoch": 0.9362340585146287, "grad_norm": 0.3227969706058502, "learning_rate": 6.411232689249873e-05, "loss": 0.0836, "step": 1248 }, { "epoch": 0.9369842460615154, "grad_norm": 0.13637347519397736, "learning_rate": 6.40494998560343e-05, "loss": 0.0417, "step": 1249 }, { "epoch": 0.9377344336084021, "grad_norm": 0.18338750302791595, "learning_rate": 6.39866487260611e-05, "loss": 0.063, "step": 1250 }, { "epoch": 0.9384846211552889, "grad_norm": 0.08587335050106049, "learning_rate": 6.392377361036262e-05, "loss": 0.0332, "step": 1251 }, { "epoch": 0.9392348087021756, "grad_norm": 0.1209770143032074, "learning_rate": 6.386087461676351e-05, "loss": 0.0523, "step": 1252 }, { "epoch": 0.9399849962490623, "grad_norm": 0.11411428451538086, "learning_rate": 6.379795185312933e-05, "loss": 0.0397, "step": 1253 }, { "epoch": 0.940735183795949, "grad_norm": 0.21214881539344788, "learning_rate": 6.373500542736643e-05, "loss": 0.0745, "step": 1254 }, { "epoch": 0.9414853713428357, "grad_norm": 0.1349548101425171, "learning_rate": 6.367203544742171e-05, "loss": 0.0468, "step": 1255 }, { "epoch": 0.9422355588897224, "grad_norm": 0.188161700963974, "learning_rate": 6.360904202128252e-05, "loss": 0.0771, "step": 1256 }, { "epoch": 0.9429857464366092, "grad_norm": 0.2221253365278244, "learning_rate": 6.354602525697638e-05, "loss": 0.0785, "step": 1257 }, { "epoch": 0.9437359339834959, "grad_norm": 0.19272452592849731, "learning_rate": 6.348298526257082e-05, "loss": 0.0738, "step": 1258 }, { "epoch": 0.9444861215303826, "grad_norm": 0.1681603491306305, "learning_rate": 6.341992214617323e-05, "loss": 0.0609, "step": 1259 }, { "epoch": 0.9452363090772693, "grad_norm": 0.1713087409734726, "learning_rate": 6.335683601593062e-05, "loss": 0.0962, "step": 1260 }, { "epoch": 0.945986496624156, "grad_norm": 0.13711421191692352, "learning_rate": 6.329372698002954e-05, "loss": 0.0569, "step": 1261 }, { "epoch": 0.9467366841710427, "grad_norm": 0.3579675555229187, "learning_rate": 6.323059514669571e-05, "loss": 0.1269, "step": 1262 }, { "epoch": 0.9474868717179294, "grad_norm": 0.20477552711963654, "learning_rate": 6.316744062419409e-05, "loss": 0.0785, "step": 1263 }, { "epoch": 0.9482370592648162, "grad_norm": 0.166034996509552, "learning_rate": 6.310426352082838e-05, "loss": 0.0651, "step": 1264 }, { "epoch": 0.9489872468117029, "grad_norm": 0.1207178607583046, "learning_rate": 6.304106394494116e-05, "loss": 0.0461, "step": 1265 }, { "epoch": 0.9497374343585896, "grad_norm": 0.21342095732688904, "learning_rate": 6.297784200491343e-05, "loss": 0.0815, "step": 1266 }, { "epoch": 0.9504876219054764, "grad_norm": 0.11947363615036011, "learning_rate": 6.291459780916463e-05, "loss": 0.0342, "step": 1267 }, { "epoch": 0.9512378094523631, "grad_norm": 0.15550976991653442, "learning_rate": 6.285133146615228e-05, "loss": 0.0526, "step": 1268 }, { "epoch": 0.9519879969992499, "grad_norm": 0.14392493665218353, "learning_rate": 6.278804308437198e-05, "loss": 0.0523, "step": 1269 }, { "epoch": 0.9527381845461366, "grad_norm": 0.13637441396713257, "learning_rate": 6.272473277235703e-05, "loss": 0.0539, "step": 1270 }, { "epoch": 0.9534883720930233, "grad_norm": 0.20482169091701508, "learning_rate": 6.266140063867843e-05, "loss": 0.0784, "step": 1271 }, { "epoch": 0.95423855963991, "grad_norm": 0.19494204223155975, "learning_rate": 6.25980467919445e-05, "loss": 0.115, "step": 1272 }, { "epoch": 0.9549887471867967, "grad_norm": 0.16727079451084137, "learning_rate": 6.253467134080088e-05, "loss": 0.0623, "step": 1273 }, { "epoch": 0.9557389347336834, "grad_norm": 0.13359226286411285, "learning_rate": 6.247127439393023e-05, "loss": 0.0501, "step": 1274 }, { "epoch": 0.9564891222805701, "grad_norm": 0.20405220985412598, "learning_rate": 6.240785606005206e-05, "loss": 0.0644, "step": 1275 }, { "epoch": 0.9572393098274569, "grad_norm": 0.19779805839061737, "learning_rate": 6.234441644792256e-05, "loss": 0.0424, "step": 1276 }, { "epoch": 0.9579894973743436, "grad_norm": 0.1623539924621582, "learning_rate": 6.228095566633443e-05, "loss": 0.0743, "step": 1277 }, { "epoch": 0.9587396849212303, "grad_norm": 0.1125299483537674, "learning_rate": 6.221747382411667e-05, "loss": 0.0498, "step": 1278 }, { "epoch": 0.959489872468117, "grad_norm": 0.15883377194404602, "learning_rate": 6.215397103013436e-05, "loss": 0.056, "step": 1279 }, { "epoch": 0.9602400600150037, "grad_norm": 0.13185837864875793, "learning_rate": 6.209044739328858e-05, "loss": 0.0581, "step": 1280 }, { "epoch": 0.9609902475618904, "grad_norm": 0.14542366564273834, "learning_rate": 6.202690302251606e-05, "loss": 0.0692, "step": 1281 }, { "epoch": 0.9617404351087772, "grad_norm": 0.12652628123760223, "learning_rate": 6.196333802678918e-05, "loss": 0.0656, "step": 1282 }, { "epoch": 0.9624906226556639, "grad_norm": 0.26526379585266113, "learning_rate": 6.189975251511562e-05, "loss": 0.0915, "step": 1283 }, { "epoch": 0.9632408102025506, "grad_norm": 0.18798717856407166, "learning_rate": 6.18361465965383e-05, "loss": 0.0685, "step": 1284 }, { "epoch": 0.9639909977494373, "grad_norm": 0.12187514454126358, "learning_rate": 6.177252038013509e-05, "loss": 0.0455, "step": 1285 }, { "epoch": 0.9647411852963241, "grad_norm": 0.09327728301286697, "learning_rate": 6.170887397501868e-05, "loss": 0.0389, "step": 1286 }, { "epoch": 0.9654913728432108, "grad_norm": 0.2176682949066162, "learning_rate": 6.16452074903364e-05, "loss": 0.0718, "step": 1287 }, { "epoch": 0.9662415603900976, "grad_norm": 0.24573443830013275, "learning_rate": 6.158152103527e-05, "loss": 0.1097, "step": 1288 }, { "epoch": 0.9669917479369843, "grad_norm": 0.17032325267791748, "learning_rate": 6.151781471903548e-05, "loss": 0.076, "step": 1289 }, { "epoch": 0.967741935483871, "grad_norm": 0.22234180569648743, "learning_rate": 6.14540886508829e-05, "loss": 0.1093, "step": 1290 }, { "epoch": 0.9684921230307577, "grad_norm": 0.15120472013950348, "learning_rate": 6.139034294009617e-05, "loss": 0.0637, "step": 1291 }, { "epoch": 0.9692423105776444, "grad_norm": 0.14784853160381317, "learning_rate": 6.132657769599293e-05, "loss": 0.0476, "step": 1292 }, { "epoch": 0.9699924981245311, "grad_norm": 0.20848771929740906, "learning_rate": 6.126279302792429e-05, "loss": 0.086, "step": 1293 }, { "epoch": 0.9707426856714179, "grad_norm": 0.12628328800201416, "learning_rate": 6.119898904527468e-05, "loss": 0.0324, "step": 1294 }, { "epoch": 0.9714928732183046, "grad_norm": 0.16405990719795227, "learning_rate": 6.113516585746164e-05, "loss": 0.1027, "step": 1295 }, { "epoch": 0.9722430607651913, "grad_norm": 0.1743471920490265, "learning_rate": 6.107132357393563e-05, "loss": 0.0805, "step": 1296 }, { "epoch": 0.972993248312078, "grad_norm": 0.12977734208106995, "learning_rate": 6.100746230417993e-05, "loss": 0.0467, "step": 1297 }, { "epoch": 0.9737434358589647, "grad_norm": 0.1675756424665451, "learning_rate": 6.0943582157710285e-05, "loss": 0.0446, "step": 1298 }, { "epoch": 0.9744936234058514, "grad_norm": 0.23262391984462738, "learning_rate": 6.0879683244074894e-05, "loss": 0.0699, "step": 1299 }, { "epoch": 0.9752438109527382, "grad_norm": 0.12139663100242615, "learning_rate": 6.0815765672854065e-05, "loss": 0.0521, "step": 1300 }, { "epoch": 0.9759939984996249, "grad_norm": 0.12815728783607483, "learning_rate": 6.0751829553660155e-05, "loss": 0.044, "step": 1301 }, { "epoch": 0.9767441860465116, "grad_norm": 0.1259387880563736, "learning_rate": 6.06878749961373e-05, "loss": 0.0634, "step": 1302 }, { "epoch": 0.9774943735933983, "grad_norm": 0.1548275500535965, "learning_rate": 6.0623902109961295e-05, "loss": 0.0569, "step": 1303 }, { "epoch": 0.978244561140285, "grad_norm": 0.25065237283706665, "learning_rate": 6.055991100483932e-05, "loss": 0.0915, "step": 1304 }, { "epoch": 0.9789947486871718, "grad_norm": 0.18947431445121765, "learning_rate": 6.0495901790509836e-05, "loss": 0.08, "step": 1305 }, { "epoch": 0.9797449362340586, "grad_norm": 0.1380397528409958, "learning_rate": 6.043187457674231e-05, "loss": 0.0528, "step": 1306 }, { "epoch": 0.9804951237809453, "grad_norm": 0.1665130853652954, "learning_rate": 6.0367829473337136e-05, "loss": 0.0752, "step": 1307 }, { "epoch": 0.981245311327832, "grad_norm": 0.11333315819501877, "learning_rate": 6.0303766590125365e-05, "loss": 0.0368, "step": 1308 }, { "epoch": 0.9819954988747187, "grad_norm": 0.1431434005498886, "learning_rate": 6.02396860369685e-05, "loss": 0.0669, "step": 1309 }, { "epoch": 0.9827456864216054, "grad_norm": 0.1723565310239792, "learning_rate": 6.0175587923758416e-05, "loss": 0.0724, "step": 1310 }, { "epoch": 0.9834958739684921, "grad_norm": 0.15027496218681335, "learning_rate": 6.0111472360417044e-05, "loss": 0.0457, "step": 1311 }, { "epoch": 0.9842460615153789, "grad_norm": 0.169467493891716, "learning_rate": 6.004733945689628e-05, "loss": 0.0643, "step": 1312 }, { "epoch": 0.9849962490622656, "grad_norm": 0.14806106686592102, "learning_rate": 5.998318932317771e-05, "loss": 0.0707, "step": 1313 }, { "epoch": 0.9857464366091523, "grad_norm": 0.15697737038135529, "learning_rate": 5.991902206927252e-05, "loss": 0.0622, "step": 1314 }, { "epoch": 0.986496624156039, "grad_norm": 0.10275126993656158, "learning_rate": 5.985483780522122e-05, "loss": 0.0465, "step": 1315 }, { "epoch": 0.9872468117029257, "grad_norm": 0.12026581913232803, "learning_rate": 5.9790636641093523e-05, "loss": 0.0379, "step": 1316 }, { "epoch": 0.9879969992498124, "grad_norm": 0.16514074802398682, "learning_rate": 5.972641868698805e-05, "loss": 0.0525, "step": 1317 }, { "epoch": 0.9887471867966992, "grad_norm": 0.18575632572174072, "learning_rate": 5.966218405303234e-05, "loss": 0.073, "step": 1318 }, { "epoch": 0.9894973743435859, "grad_norm": 0.12170587480068207, "learning_rate": 5.959793284938242e-05, "loss": 0.0568, "step": 1319 }, { "epoch": 0.9902475618904726, "grad_norm": 0.18904197216033936, "learning_rate": 5.953366518622279e-05, "loss": 0.0771, "step": 1320 }, { "epoch": 0.9909977494373593, "grad_norm": 0.13215796649456024, "learning_rate": 5.946938117376616e-05, "loss": 0.0524, "step": 1321 }, { "epoch": 0.991747936984246, "grad_norm": 0.13645458221435547, "learning_rate": 5.940508092225328e-05, "loss": 0.0735, "step": 1322 }, { "epoch": 0.9924981245311327, "grad_norm": 0.16823965311050415, "learning_rate": 5.9340764541952755e-05, "loss": 0.0525, "step": 1323 }, { "epoch": 0.9932483120780196, "grad_norm": 0.13981732726097107, "learning_rate": 5.9276432143160835e-05, "loss": 0.0487, "step": 1324 }, { "epoch": 0.9939984996249063, "grad_norm": 0.1347687840461731, "learning_rate": 5.921208383620126e-05, "loss": 0.0375, "step": 1325 }, { "epoch": 0.994748687171793, "grad_norm": 0.2246084362268448, "learning_rate": 5.9147719731425034e-05, "loss": 0.0788, "step": 1326 }, { "epoch": 0.9954988747186797, "grad_norm": 0.13317465782165527, "learning_rate": 5.908333993921027e-05, "loss": 0.0718, "step": 1327 }, { "epoch": 0.9962490622655664, "grad_norm": 0.11800669133663177, "learning_rate": 5.901894456996196e-05, "loss": 0.053, "step": 1328 }, { "epoch": 0.9969992498124531, "grad_norm": 0.23010219633579254, "learning_rate": 5.895453373411182e-05, "loss": 0.1, "step": 1329 }, { "epoch": 0.9977494373593399, "grad_norm": 0.2343241274356842, "learning_rate": 5.889010754211809e-05, "loss": 0.0774, "step": 1330 }, { "epoch": 0.9984996249062266, "grad_norm": 0.16504386067390442, "learning_rate": 5.882566610446534e-05, "loss": 0.0441, "step": 1331 }, { "epoch": 0.9992498124531133, "grad_norm": 0.18011681735515594, "learning_rate": 5.8761209531664306e-05, "loss": 0.0792, "step": 1332 }, { "epoch": 1.0, "grad_norm": 0.1674385368824005, "learning_rate": 5.869673793425168e-05, "loss": 0.067, "step": 1333 }, { "epoch": 1.0007501875468867, "grad_norm": 0.15250438451766968, "learning_rate": 5.863225142278985e-05, "loss": 0.0647, "step": 1334 }, { "epoch": 1.0015003750937734, "grad_norm": 0.11759188026189804, "learning_rate": 5.856775010786687e-05, "loss": 0.0414, "step": 1335 }, { "epoch": 1.0022505626406601, "grad_norm": 0.13916079699993134, "learning_rate": 5.850323410009614e-05, "loss": 0.0442, "step": 1336 }, { "epoch": 1.0030007501875469, "grad_norm": 0.17896828055381775, "learning_rate": 5.8438703510116256e-05, "loss": 0.0503, "step": 1337 }, { "epoch": 1.0037509377344336, "grad_norm": 0.1747381091117859, "learning_rate": 5.8374158448590823e-05, "loss": 0.0475, "step": 1338 }, { "epoch": 1.0045011252813203, "grad_norm": 0.12167813628911972, "learning_rate": 5.830959902620826e-05, "loss": 0.0425, "step": 1339 }, { "epoch": 1.005251312828207, "grad_norm": 0.13961336016654968, "learning_rate": 5.824502535368164e-05, "loss": 0.048, "step": 1340 }, { "epoch": 1.0060015003750937, "grad_norm": 0.1229967400431633, "learning_rate": 5.818043754174843e-05, "loss": 0.0498, "step": 1341 }, { "epoch": 1.0067516879219804, "grad_norm": 0.20583300292491913, "learning_rate": 5.81158357011704e-05, "loss": 0.0608, "step": 1342 }, { "epoch": 1.0075018754688672, "grad_norm": 0.07669761776924133, "learning_rate": 5.80512199427333e-05, "loss": 0.0243, "step": 1343 }, { "epoch": 1.0082520630157539, "grad_norm": 0.11683350801467896, "learning_rate": 5.798659037724683e-05, "loss": 0.0374, "step": 1344 }, { "epoch": 1.0090022505626406, "grad_norm": 0.137917622923851, "learning_rate": 5.792194711554429e-05, "loss": 0.0536, "step": 1345 }, { "epoch": 1.0097524381095273, "grad_norm": 0.1384626179933548, "learning_rate": 5.7857290268482555e-05, "loss": 0.0779, "step": 1346 }, { "epoch": 1.010502625656414, "grad_norm": 0.12315264344215393, "learning_rate": 5.779261994694173e-05, "loss": 0.0271, "step": 1347 }, { "epoch": 1.0112528132033007, "grad_norm": 0.23275774717330933, "learning_rate": 5.772793626182506e-05, "loss": 0.0513, "step": 1348 }, { "epoch": 1.0120030007501875, "grad_norm": 0.12899671494960785, "learning_rate": 5.766323932405866e-05, "loss": 0.0489, "step": 1349 }, { "epoch": 1.0127531882970742, "grad_norm": 0.1920711249113083, "learning_rate": 5.7598529244591436e-05, "loss": 0.0495, "step": 1350 }, { "epoch": 1.0135033758439609, "grad_norm": 0.20560826361179352, "learning_rate": 5.7533806134394806e-05, "loss": 0.0483, "step": 1351 }, { "epoch": 1.0142535633908478, "grad_norm": 0.11408060789108276, "learning_rate": 5.746907010446252e-05, "loss": 0.0394, "step": 1352 }, { "epoch": 1.0150037509377345, "grad_norm": 0.1268560141324997, "learning_rate": 5.740432126581049e-05, "loss": 0.0371, "step": 1353 }, { "epoch": 1.0157539384846213, "grad_norm": 0.11626963317394257, "learning_rate": 5.73395597294766e-05, "loss": 0.0246, "step": 1354 }, { "epoch": 1.016504126031508, "grad_norm": 0.10146208852529526, "learning_rate": 5.727478560652053e-05, "loss": 0.0364, "step": 1355 }, { "epoch": 1.0172543135783947, "grad_norm": 0.17881205677986145, "learning_rate": 5.7209999008023496e-05, "loss": 0.0805, "step": 1356 }, { "epoch": 1.0180045011252814, "grad_norm": 0.16511453688144684, "learning_rate": 5.7145200045088156e-05, "loss": 0.0626, "step": 1357 }, { "epoch": 1.0187546886721681, "grad_norm": 0.11539970338344574, "learning_rate": 5.7080388828838324e-05, "loss": 0.0403, "step": 1358 }, { "epoch": 1.0195048762190548, "grad_norm": 0.17582516372203827, "learning_rate": 5.701556547041888e-05, "loss": 0.0615, "step": 1359 }, { "epoch": 1.0202550637659416, "grad_norm": 0.2376180738210678, "learning_rate": 5.695073008099547e-05, "loss": 0.0491, "step": 1360 }, { "epoch": 1.0210052513128283, "grad_norm": 0.2256619781255722, "learning_rate": 5.688588277175444e-05, "loss": 0.0403, "step": 1361 }, { "epoch": 1.021755438859715, "grad_norm": 0.14331862330436707, "learning_rate": 5.6821023653902517e-05, "loss": 0.0411, "step": 1362 }, { "epoch": 1.0225056264066017, "grad_norm": 0.12017953395843506, "learning_rate": 5.675615283866671e-05, "loss": 0.0421, "step": 1363 }, { "epoch": 1.0232558139534884, "grad_norm": 0.16904471814632416, "learning_rate": 5.669127043729406e-05, "loss": 0.0608, "step": 1364 }, { "epoch": 1.0240060015003751, "grad_norm": 0.13869859278202057, "learning_rate": 5.662637656105152e-05, "loss": 0.0443, "step": 1365 }, { "epoch": 1.0247561890472618, "grad_norm": 0.1698876917362213, "learning_rate": 5.6561471321225676e-05, "loss": 0.0489, "step": 1366 }, { "epoch": 1.0255063765941486, "grad_norm": 0.2037367969751358, "learning_rate": 5.649655482912265e-05, "loss": 0.0716, "step": 1367 }, { "epoch": 1.0262565641410353, "grad_norm": 0.1325848251581192, "learning_rate": 5.6431627196067816e-05, "loss": 0.0332, "step": 1368 }, { "epoch": 1.027006751687922, "grad_norm": 0.13476696610450745, "learning_rate": 5.636668853340567e-05, "loss": 0.0397, "step": 1369 }, { "epoch": 1.0277569392348087, "grad_norm": 0.0886571854352951, "learning_rate": 5.6301738952499636e-05, "loss": 0.0344, "step": 1370 }, { "epoch": 1.0285071267816954, "grad_norm": 0.1936550736427307, "learning_rate": 5.623677856473183e-05, "loss": 0.0761, "step": 1371 }, { "epoch": 1.0292573143285821, "grad_norm": 0.16046838462352753, "learning_rate": 5.617180748150295e-05, "loss": 0.0409, "step": 1372 }, { "epoch": 1.0300075018754689, "grad_norm": 0.10154321044683456, "learning_rate": 5.6106825814231953e-05, "loss": 0.0335, "step": 1373 }, { "epoch": 1.0307576894223556, "grad_norm": 0.2043825089931488, "learning_rate": 5.604183367435606e-05, "loss": 0.0494, "step": 1374 }, { "epoch": 1.0315078769692423, "grad_norm": 0.21420907974243164, "learning_rate": 5.597683117333036e-05, "loss": 0.0599, "step": 1375 }, { "epoch": 1.032258064516129, "grad_norm": 0.16880439221858978, "learning_rate": 5.591181842262776e-05, "loss": 0.0421, "step": 1376 }, { "epoch": 1.0330082520630157, "grad_norm": 0.2568267285823822, "learning_rate": 5.584679553373869e-05, "loss": 0.0533, "step": 1377 }, { "epoch": 1.0337584396099024, "grad_norm": 0.12662667036056519, "learning_rate": 5.578176261817104e-05, "loss": 0.0405, "step": 1378 }, { "epoch": 1.0345086271567892, "grad_norm": 0.17100553214550018, "learning_rate": 5.571671978744983e-05, "loss": 0.0481, "step": 1379 }, { "epoch": 1.0352588147036759, "grad_norm": 0.13645890355110168, "learning_rate": 5.565166715311711e-05, "loss": 0.0419, "step": 1380 }, { "epoch": 1.0360090022505626, "grad_norm": 0.10729946196079254, "learning_rate": 5.558660482673177e-05, "loss": 0.0393, "step": 1381 }, { "epoch": 1.0367591897974493, "grad_norm": 0.15839707851409912, "learning_rate": 5.552153291986927e-05, "loss": 0.0487, "step": 1382 }, { "epoch": 1.037509377344336, "grad_norm": 0.12748658657073975, "learning_rate": 5.5456451544121523e-05, "loss": 0.0492, "step": 1383 }, { "epoch": 1.0382595648912227, "grad_norm": 0.19807954132556915, "learning_rate": 5.5391360811096684e-05, "loss": 0.0418, "step": 1384 }, { "epoch": 1.0390097524381094, "grad_norm": 0.12736938893795013, "learning_rate": 5.5326260832418955e-05, "loss": 0.0441, "step": 1385 }, { "epoch": 1.0397599399849962, "grad_norm": 0.16423308849334717, "learning_rate": 5.526115171972838e-05, "loss": 0.0501, "step": 1386 }, { "epoch": 1.0405101275318829, "grad_norm": 0.15221011638641357, "learning_rate": 5.5196033584680675e-05, "loss": 0.0548, "step": 1387 }, { "epoch": 1.0412603150787696, "grad_norm": 0.14308512210845947, "learning_rate": 5.5130906538947034e-05, "loss": 0.0416, "step": 1388 }, { "epoch": 1.0420105026256563, "grad_norm": 0.17685441672801971, "learning_rate": 5.506577069421395e-05, "loss": 0.0358, "step": 1389 }, { "epoch": 1.042760690172543, "grad_norm": 0.20115597546100616, "learning_rate": 5.5000626162182944e-05, "loss": 0.043, "step": 1390 }, { "epoch": 1.04351087771943, "grad_norm": 0.17785103619098663, "learning_rate": 5.49354730545705e-05, "loss": 0.0536, "step": 1391 }, { "epoch": 1.0442610652663167, "grad_norm": 0.1535765528678894, "learning_rate": 5.487031148310775e-05, "loss": 0.0333, "step": 1392 }, { "epoch": 1.0450112528132034, "grad_norm": 0.14303374290466309, "learning_rate": 5.480514155954042e-05, "loss": 0.0469, "step": 1393 }, { "epoch": 1.04576144036009, "grad_norm": 0.11257001012563705, "learning_rate": 5.4739963395628456e-05, "loss": 0.0307, "step": 1394 }, { "epoch": 1.0465116279069768, "grad_norm": 0.16565249860286713, "learning_rate": 5.4674777103146045e-05, "loss": 0.0516, "step": 1395 }, { "epoch": 1.0472618154538635, "grad_norm": 0.16822345554828644, "learning_rate": 5.460958279388122e-05, "loss": 0.0518, "step": 1396 }, { "epoch": 1.0480120030007503, "grad_norm": 0.1288178563117981, "learning_rate": 5.4544380579635824e-05, "loss": 0.0395, "step": 1397 }, { "epoch": 1.048762190547637, "grad_norm": 0.1471712738275528, "learning_rate": 5.447917057222523e-05, "loss": 0.0543, "step": 1398 }, { "epoch": 1.0495123780945237, "grad_norm": 0.13822641968727112, "learning_rate": 5.441395288347818e-05, "loss": 0.044, "step": 1399 }, { "epoch": 1.0502625656414104, "grad_norm": 0.113441601395607, "learning_rate": 5.434872762523658e-05, "loss": 0.0446, "step": 1400 }, { "epoch": 1.0502625656414104, "eval_loss": 0.0680478885769844, "eval_runtime": 2.6592, "eval_samples_per_second": 20.307, "eval_steps_per_second": 5.265, "step": 1400 }, { "epoch": 1.0510127531882971, "grad_norm": 0.1047082468867302, "learning_rate": 5.4283494909355314e-05, "loss": 0.0316, "step": 1401 }, { "epoch": 1.0517629407351838, "grad_norm": 0.2028200924396515, "learning_rate": 5.42182548477021e-05, "loss": 0.0337, "step": 1402 }, { "epoch": 1.0525131282820706, "grad_norm": 0.13617637753486633, "learning_rate": 5.41530075521572e-05, "loss": 0.0345, "step": 1403 }, { "epoch": 1.0532633158289573, "grad_norm": 0.18374083936214447, "learning_rate": 5.4087753134613294e-05, "loss": 0.0575, "step": 1404 }, { "epoch": 1.054013503375844, "grad_norm": 0.18506267666816711, "learning_rate": 5.40224917069753e-05, "loss": 0.0466, "step": 1405 }, { "epoch": 1.0547636909227307, "grad_norm": 0.15255096554756165, "learning_rate": 5.3957223381160126e-05, "loss": 0.0337, "step": 1406 }, { "epoch": 1.0555138784696174, "grad_norm": 0.18384674191474915, "learning_rate": 5.389194826909653e-05, "loss": 0.0491, "step": 1407 }, { "epoch": 1.0562640660165041, "grad_norm": 0.1939430981874466, "learning_rate": 5.382666648272489e-05, "loss": 0.0941, "step": 1408 }, { "epoch": 1.0570142535633908, "grad_norm": 0.1425727754831314, "learning_rate": 5.3761378133997044e-05, "loss": 0.0483, "step": 1409 }, { "epoch": 1.0577644411102776, "grad_norm": 0.1924968659877777, "learning_rate": 5.3696083334876105e-05, "loss": 0.0747, "step": 1410 }, { "epoch": 1.0585146286571643, "grad_norm": 0.1328912377357483, "learning_rate": 5.363078219733619e-05, "loss": 0.0401, "step": 1411 }, { "epoch": 1.059264816204051, "grad_norm": 0.16654489934444427, "learning_rate": 5.3565474833362353e-05, "loss": 0.0504, "step": 1412 }, { "epoch": 1.0600150037509377, "grad_norm": 0.11094582080841064, "learning_rate": 5.3500161354950274e-05, "loss": 0.0375, "step": 1413 }, { "epoch": 1.0607651912978244, "grad_norm": 0.20550234615802765, "learning_rate": 5.3434841874106124e-05, "loss": 0.0512, "step": 1414 }, { "epoch": 1.0615153788447111, "grad_norm": 0.12731848657131195, "learning_rate": 5.3369516502846396e-05, "loss": 0.0648, "step": 1415 }, { "epoch": 1.0622655663915979, "grad_norm": 0.14043542742729187, "learning_rate": 5.330418535319768e-05, "loss": 0.0588, "step": 1416 }, { "epoch": 1.0630157539384846, "grad_norm": 0.1502482295036316, "learning_rate": 5.323884853719645e-05, "loss": 0.0488, "step": 1417 }, { "epoch": 1.0637659414853713, "grad_norm": 0.11024313420057297, "learning_rate": 5.31735061668889e-05, "loss": 0.0261, "step": 1418 }, { "epoch": 1.064516129032258, "grad_norm": 0.15195825695991516, "learning_rate": 5.3108158354330795e-05, "loss": 0.0403, "step": 1419 }, { "epoch": 1.0652663165791447, "grad_norm": 0.11762233078479767, "learning_rate": 5.304280521158716e-05, "loss": 0.0276, "step": 1420 }, { "epoch": 1.0660165041260314, "grad_norm": 0.23322537541389465, "learning_rate": 5.2977446850732236e-05, "loss": 0.0384, "step": 1421 }, { "epoch": 1.0667666916729182, "grad_norm": 0.18611235916614532, "learning_rate": 5.291208338384913e-05, "loss": 0.0606, "step": 1422 }, { "epoch": 1.0675168792198049, "grad_norm": 0.20848964154720306, "learning_rate": 5.2846714923029795e-05, "loss": 0.0468, "step": 1423 }, { "epoch": 1.0682670667666916, "grad_norm": 0.24529072642326355, "learning_rate": 5.278134158037469e-05, "loss": 0.0634, "step": 1424 }, { "epoch": 1.0690172543135783, "grad_norm": 0.10874413698911667, "learning_rate": 5.2715963467992656e-05, "loss": 0.0273, "step": 1425 }, { "epoch": 1.069767441860465, "grad_norm": 0.18923908472061157, "learning_rate": 5.265058069800072e-05, "loss": 0.0757, "step": 1426 }, { "epoch": 1.0705176294073517, "grad_norm": 0.12763912975788116, "learning_rate": 5.258519338252389e-05, "loss": 0.049, "step": 1427 }, { "epoch": 1.0712678169542387, "grad_norm": 0.13909712433815002, "learning_rate": 5.251980163369499e-05, "loss": 0.0453, "step": 1428 }, { "epoch": 1.0720180045011252, "grad_norm": 0.1828707605600357, "learning_rate": 5.24544055636544e-05, "loss": 0.0451, "step": 1429 }, { "epoch": 1.072768192048012, "grad_norm": 0.14683784544467926, "learning_rate": 5.2389005284549954e-05, "loss": 0.048, "step": 1430 }, { "epoch": 1.0735183795948988, "grad_norm": 0.19041985273361206, "learning_rate": 5.232360090853671e-05, "loss": 0.0724, "step": 1431 }, { "epoch": 1.0742685671417855, "grad_norm": 0.2441704124212265, "learning_rate": 5.225819254777671e-05, "loss": 0.0564, "step": 1432 }, { "epoch": 1.0750187546886723, "grad_norm": 0.12937527894973755, "learning_rate": 5.219278031443886e-05, "loss": 0.0267, "step": 1433 }, { "epoch": 1.075768942235559, "grad_norm": 0.2045406699180603, "learning_rate": 5.21273643206987e-05, "loss": 0.0451, "step": 1434 }, { "epoch": 1.0765191297824457, "grad_norm": 0.18630985915660858, "learning_rate": 5.206194467873822e-05, "loss": 0.0597, "step": 1435 }, { "epoch": 1.0772693173293324, "grad_norm": 0.17303740978240967, "learning_rate": 5.1996521500745645e-05, "loss": 0.0718, "step": 1436 }, { "epoch": 1.0780195048762191, "grad_norm": 0.17732733488082886, "learning_rate": 5.19310948989153e-05, "loss": 0.041, "step": 1437 }, { "epoch": 1.0787696924231058, "grad_norm": 0.12811049818992615, "learning_rate": 5.186566498544737e-05, "loss": 0.0331, "step": 1438 }, { "epoch": 1.0795198799699925, "grad_norm": 0.15231534838676453, "learning_rate": 5.18002318725477e-05, "loss": 0.0295, "step": 1439 }, { "epoch": 1.0802700675168793, "grad_norm": 0.16985756158828735, "learning_rate": 5.173479567242765e-05, "loss": 0.0426, "step": 1440 }, { "epoch": 1.081020255063766, "grad_norm": 0.19819964468479156, "learning_rate": 5.1669356497303835e-05, "loss": 0.062, "step": 1441 }, { "epoch": 1.0817704426106527, "grad_norm": 0.15997275710105896, "learning_rate": 5.1603914459398016e-05, "loss": 0.0445, "step": 1442 }, { "epoch": 1.0825206301575394, "grad_norm": 0.1522863507270813, "learning_rate": 5.153846967093684e-05, "loss": 0.0602, "step": 1443 }, { "epoch": 1.0832708177044261, "grad_norm": 0.16248387098312378, "learning_rate": 5.1473022244151684e-05, "loss": 0.0386, "step": 1444 }, { "epoch": 1.0840210052513128, "grad_norm": 0.16288204491138458, "learning_rate": 5.140757229127842e-05, "loss": 0.0442, "step": 1445 }, { "epoch": 1.0847711927981996, "grad_norm": 0.150307759642601, "learning_rate": 5.1342119924557275e-05, "loss": 0.0484, "step": 1446 }, { "epoch": 1.0855213803450863, "grad_norm": 0.19405823945999146, "learning_rate": 5.127666525623264e-05, "loss": 0.0516, "step": 1447 }, { "epoch": 1.086271567891973, "grad_norm": 0.20875540375709534, "learning_rate": 5.121120839855279e-05, "loss": 0.0476, "step": 1448 }, { "epoch": 1.0870217554388597, "grad_norm": 0.24014073610305786, "learning_rate": 5.114574946376982e-05, "loss": 0.0563, "step": 1449 }, { "epoch": 1.0877719429857464, "grad_norm": 0.15809394419193268, "learning_rate": 5.1080288564139325e-05, "loss": 0.0455, "step": 1450 }, { "epoch": 1.0885221305326331, "grad_norm": 0.17998574674129486, "learning_rate": 5.101482581192033e-05, "loss": 0.0366, "step": 1451 }, { "epoch": 1.0892723180795199, "grad_norm": 0.16414082050323486, "learning_rate": 5.0949361319374996e-05, "loss": 0.0471, "step": 1452 }, { "epoch": 1.0900225056264066, "grad_norm": 0.18297888338565826, "learning_rate": 5.0883895198768494e-05, "loss": 0.0468, "step": 1453 }, { "epoch": 1.0907726931732933, "grad_norm": 0.11977231502532959, "learning_rate": 5.0818427562368764e-05, "loss": 0.0339, "step": 1454 }, { "epoch": 1.09152288072018, "grad_norm": 0.13561047613620758, "learning_rate": 5.0752958522446356e-05, "loss": 0.0643, "step": 1455 }, { "epoch": 1.0922730682670667, "grad_norm": 0.1655491441488266, "learning_rate": 5.0687488191274215e-05, "loss": 0.0632, "step": 1456 }, { "epoch": 1.0930232558139534, "grad_norm": 0.2185972034931183, "learning_rate": 5.0622016681127526e-05, "loss": 0.0395, "step": 1457 }, { "epoch": 1.0937734433608401, "grad_norm": 0.13363471627235413, "learning_rate": 5.055654410428349e-05, "loss": 0.0413, "step": 1458 }, { "epoch": 1.0945236309077269, "grad_norm": 0.17551277577877045, "learning_rate": 5.0491070573021116e-05, "loss": 0.051, "step": 1459 }, { "epoch": 1.0952738184546136, "grad_norm": 0.16505476832389832, "learning_rate": 5.0425596199621064e-05, "loss": 0.0614, "step": 1460 }, { "epoch": 1.0960240060015003, "grad_norm": 0.12644611299037933, "learning_rate": 5.036012109636543e-05, "loss": 0.0346, "step": 1461 }, { "epoch": 1.096774193548387, "grad_norm": 0.10962353646755219, "learning_rate": 5.0294645375537594e-05, "loss": 0.0376, "step": 1462 }, { "epoch": 1.0975243810952737, "grad_norm": 0.18371117115020752, "learning_rate": 5.022916914942195e-05, "loss": 0.0449, "step": 1463 }, { "epoch": 1.0982745686421604, "grad_norm": 0.22092092037200928, "learning_rate": 5.0163692530303774e-05, "loss": 0.0473, "step": 1464 }, { "epoch": 1.0990247561890472, "grad_norm": 0.18737155199050903, "learning_rate": 5.009821563046903e-05, "loss": 0.0371, "step": 1465 }, { "epoch": 1.0997749437359339, "grad_norm": 0.1893552988767624, "learning_rate": 5.003273856220415e-05, "loss": 0.0457, "step": 1466 }, { "epoch": 1.1005251312828208, "grad_norm": 0.14333701133728027, "learning_rate": 4.996726143779586e-05, "loss": 0.0648, "step": 1467 }, { "epoch": 1.1012753188297075, "grad_norm": 0.17025060951709747, "learning_rate": 4.990178436953099e-05, "loss": 0.0396, "step": 1468 }, { "epoch": 1.1020255063765942, "grad_norm": 0.14343062043190002, "learning_rate": 4.9836307469696244e-05, "loss": 0.0333, "step": 1469 }, { "epoch": 1.102775693923481, "grad_norm": 0.13673539459705353, "learning_rate": 4.9770830850578075e-05, "loss": 0.0391, "step": 1470 }, { "epoch": 1.1035258814703677, "grad_norm": 0.1872038096189499, "learning_rate": 4.9705354624462424e-05, "loss": 0.0645, "step": 1471 }, { "epoch": 1.1042760690172544, "grad_norm": 0.1406128704547882, "learning_rate": 4.963987890363458e-05, "loss": 0.0367, "step": 1472 }, { "epoch": 1.105026256564141, "grad_norm": 0.12920933961868286, "learning_rate": 4.957440380037896e-05, "loss": 0.0391, "step": 1473 }, { "epoch": 1.1057764441110278, "grad_norm": 0.15628013014793396, "learning_rate": 4.9508929426978896e-05, "loss": 0.0442, "step": 1474 }, { "epoch": 1.1065266316579145, "grad_norm": 0.13842900097370148, "learning_rate": 4.944345589571651e-05, "loss": 0.0298, "step": 1475 }, { "epoch": 1.1072768192048013, "grad_norm": 0.2180444896221161, "learning_rate": 4.937798331887248e-05, "loss": 0.0593, "step": 1476 }, { "epoch": 1.108027006751688, "grad_norm": 0.1547778993844986, "learning_rate": 4.931251180872579e-05, "loss": 0.0493, "step": 1477 }, { "epoch": 1.1087771942985747, "grad_norm": 0.2607737183570862, "learning_rate": 4.9247041477553656e-05, "loss": 0.0712, "step": 1478 }, { "epoch": 1.1095273818454614, "grad_norm": 0.17707867920398712, "learning_rate": 4.9181572437631255e-05, "loss": 0.07, "step": 1479 }, { "epoch": 1.1102775693923481, "grad_norm": 0.17139020562171936, "learning_rate": 4.911610480123151e-05, "loss": 0.061, "step": 1480 }, { "epoch": 1.1110277569392348, "grad_norm": 0.19931213557720184, "learning_rate": 4.905063868062501e-05, "loss": 0.0607, "step": 1481 }, { "epoch": 1.1117779444861215, "grad_norm": 0.17375808954238892, "learning_rate": 4.898517418807968e-05, "loss": 0.0444, "step": 1482 }, { "epoch": 1.1125281320330083, "grad_norm": 0.12767089903354645, "learning_rate": 4.891971143586069e-05, "loss": 0.0486, "step": 1483 }, { "epoch": 1.113278319579895, "grad_norm": 0.1534682959318161, "learning_rate": 4.88542505362302e-05, "loss": 0.0458, "step": 1484 }, { "epoch": 1.1140285071267817, "grad_norm": 0.17411962151527405, "learning_rate": 4.878879160144723e-05, "loss": 0.0464, "step": 1485 }, { "epoch": 1.1147786946736684, "grad_norm": 0.12393703311681747, "learning_rate": 4.872333474376739e-05, "loss": 0.0543, "step": 1486 }, { "epoch": 1.1155288822205551, "grad_norm": 0.11996886134147644, "learning_rate": 4.865788007544274e-05, "loss": 0.0248, "step": 1487 }, { "epoch": 1.1162790697674418, "grad_norm": 0.14848671853542328, "learning_rate": 4.859242770872158e-05, "loss": 0.0312, "step": 1488 }, { "epoch": 1.1170292573143286, "grad_norm": 0.16110281646251678, "learning_rate": 4.852697775584833e-05, "loss": 0.04, "step": 1489 }, { "epoch": 1.1177794448612153, "grad_norm": 0.1755913347005844, "learning_rate": 4.846153032906316e-05, "loss": 0.0422, "step": 1490 }, { "epoch": 1.118529632408102, "grad_norm": 0.15683995187282562, "learning_rate": 4.8396085540601995e-05, "loss": 0.0575, "step": 1491 }, { "epoch": 1.1192798199549887, "grad_norm": 0.15512287616729736, "learning_rate": 4.833064350269617e-05, "loss": 0.0551, "step": 1492 }, { "epoch": 1.1200300075018754, "grad_norm": 0.16375933587551117, "learning_rate": 4.826520432757236e-05, "loss": 0.05, "step": 1493 }, { "epoch": 1.1207801950487621, "grad_norm": 0.19421043992042542, "learning_rate": 4.8199768127452314e-05, "loss": 0.0733, "step": 1494 }, { "epoch": 1.1215303825956489, "grad_norm": 0.2077593207359314, "learning_rate": 4.813433501455264e-05, "loss": 0.0483, "step": 1495 }, { "epoch": 1.1222805701425356, "grad_norm": 0.19140911102294922, "learning_rate": 4.806890510108471e-05, "loss": 0.0659, "step": 1496 }, { "epoch": 1.1230307576894223, "grad_norm": 0.23248641192913055, "learning_rate": 4.800347849925437e-05, "loss": 0.0512, "step": 1497 }, { "epoch": 1.123780945236309, "grad_norm": 0.20153175294399261, "learning_rate": 4.793805532126181e-05, "loss": 0.0604, "step": 1498 }, { "epoch": 1.1245311327831957, "grad_norm": 0.17858366668224335, "learning_rate": 4.787263567930132e-05, "loss": 0.0291, "step": 1499 }, { "epoch": 1.1252813203300824, "grad_norm": 0.16331790387630463, "learning_rate": 4.780721968556115e-05, "loss": 0.0485, "step": 1500 }, { "epoch": 1.1260315078769692, "grad_norm": 0.15837393701076508, "learning_rate": 4.774180745222331e-05, "loss": 0.0385, "step": 1501 }, { "epoch": 1.1267816954238559, "grad_norm": 0.13282233476638794, "learning_rate": 4.7676399091463296e-05, "loss": 0.0324, "step": 1502 }, { "epoch": 1.1275318829707426, "grad_norm": 0.26786598563194275, "learning_rate": 4.7610994715450044e-05, "loss": 0.0394, "step": 1503 }, { "epoch": 1.1282820705176295, "grad_norm": 0.19126728177070618, "learning_rate": 4.754559443634561e-05, "loss": 0.0477, "step": 1504 }, { "epoch": 1.129032258064516, "grad_norm": 0.1592726856470108, "learning_rate": 4.748019836630503e-05, "loss": 0.0593, "step": 1505 }, { "epoch": 1.129782445611403, "grad_norm": 0.19169463217258453, "learning_rate": 4.7414806617476124e-05, "loss": 0.0602, "step": 1506 }, { "epoch": 1.1305326331582897, "grad_norm": 0.2752186357975006, "learning_rate": 4.7349419301999294e-05, "loss": 0.071, "step": 1507 }, { "epoch": 1.1312828207051764, "grad_norm": 0.22131672501564026, "learning_rate": 4.7284036532007356e-05, "loss": 0.082, "step": 1508 }, { "epoch": 1.132033008252063, "grad_norm": 0.15666484832763672, "learning_rate": 4.721865841962533e-05, "loss": 0.0487, "step": 1509 }, { "epoch": 1.1327831957989498, "grad_norm": 0.11834803223609924, "learning_rate": 4.715328507697021e-05, "loss": 0.0524, "step": 1510 }, { "epoch": 1.1335333833458365, "grad_norm": 0.19373413920402527, "learning_rate": 4.7087916616150886e-05, "loss": 0.0618, "step": 1511 }, { "epoch": 1.1342835708927232, "grad_norm": 0.15369850397109985, "learning_rate": 4.702255314926779e-05, "loss": 0.0386, "step": 1512 }, { "epoch": 1.13503375843961, "grad_norm": 0.2039160132408142, "learning_rate": 4.695719478841286e-05, "loss": 0.0641, "step": 1513 }, { "epoch": 1.1357839459864967, "grad_norm": 0.1577986180782318, "learning_rate": 4.6891841645669224e-05, "loss": 0.0466, "step": 1514 }, { "epoch": 1.1365341335333834, "grad_norm": 0.17635010182857513, "learning_rate": 4.6826493833111104e-05, "loss": 0.0432, "step": 1515 }, { "epoch": 1.13728432108027, "grad_norm": 0.1743796020746231, "learning_rate": 4.676115146280356e-05, "loss": 0.0405, "step": 1516 }, { "epoch": 1.1380345086271568, "grad_norm": 0.1372276097536087, "learning_rate": 4.669581464680233e-05, "loss": 0.0345, "step": 1517 }, { "epoch": 1.1387846961740435, "grad_norm": 0.1724540740251541, "learning_rate": 4.66304834971536e-05, "loss": 0.0515, "step": 1518 }, { "epoch": 1.1395348837209303, "grad_norm": 0.10708152502775192, "learning_rate": 4.656515812589389e-05, "loss": 0.0317, "step": 1519 }, { "epoch": 1.140285071267817, "grad_norm": 0.1259574443101883, "learning_rate": 4.6499838645049744e-05, "loss": 0.0436, "step": 1520 }, { "epoch": 1.1410352588147037, "grad_norm": 0.1755182147026062, "learning_rate": 4.643452516663766e-05, "loss": 0.0411, "step": 1521 }, { "epoch": 1.1417854463615904, "grad_norm": 0.1899542510509491, "learning_rate": 4.636921780266381e-05, "loss": 0.0617, "step": 1522 }, { "epoch": 1.1425356339084771, "grad_norm": 0.1305563747882843, "learning_rate": 4.63039166651239e-05, "loss": 0.0475, "step": 1523 }, { "epoch": 1.1432858214553638, "grad_norm": 0.11840610206127167, "learning_rate": 4.623862186600297e-05, "loss": 0.0378, "step": 1524 }, { "epoch": 1.1440360090022506, "grad_norm": 0.13866670429706573, "learning_rate": 4.617333351727513e-05, "loss": 0.0361, "step": 1525 }, { "epoch": 1.1447861965491373, "grad_norm": 0.2099863588809967, "learning_rate": 4.61080517309035e-05, "loss": 0.0741, "step": 1526 }, { "epoch": 1.145536384096024, "grad_norm": 0.21586652100086212, "learning_rate": 4.604277661883989e-05, "loss": 0.058, "step": 1527 }, { "epoch": 1.1462865716429107, "grad_norm": 0.09848996251821518, "learning_rate": 4.5977508293024726e-05, "loss": 0.0238, "step": 1528 }, { "epoch": 1.1470367591897974, "grad_norm": 0.14757753908634186, "learning_rate": 4.591224686538672e-05, "loss": 0.0321, "step": 1529 }, { "epoch": 1.1477869467366841, "grad_norm": 0.18732796609401703, "learning_rate": 4.584699244784281e-05, "loss": 0.0584, "step": 1530 }, { "epoch": 1.1485371342835708, "grad_norm": 0.21146154403686523, "learning_rate": 4.578174515229789e-05, "loss": 0.0973, "step": 1531 }, { "epoch": 1.1492873218304576, "grad_norm": 0.1362430453300476, "learning_rate": 4.5716505090644684e-05, "loss": 0.0449, "step": 1532 }, { "epoch": 1.1500375093773443, "grad_norm": 0.1803388148546219, "learning_rate": 4.5651272374763423e-05, "loss": 0.0354, "step": 1533 }, { "epoch": 1.150787696924231, "grad_norm": 0.1937418282032013, "learning_rate": 4.558604711652183e-05, "loss": 0.0531, "step": 1534 }, { "epoch": 1.1515378844711177, "grad_norm": 0.15416602790355682, "learning_rate": 4.552082942777478e-05, "loss": 0.04, "step": 1535 }, { "epoch": 1.1522880720180044, "grad_norm": 0.1575358361005783, "learning_rate": 4.545561942036418e-05, "loss": 0.0378, "step": 1536 }, { "epoch": 1.1530382595648911, "grad_norm": 0.171066552400589, "learning_rate": 4.5390417206118784e-05, "loss": 0.054, "step": 1537 }, { "epoch": 1.1537884471117779, "grad_norm": 0.15054123103618622, "learning_rate": 4.5325222896853966e-05, "loss": 0.0647, "step": 1538 }, { "epoch": 1.1545386346586646, "grad_norm": 0.23045825958251953, "learning_rate": 4.5260036604371556e-05, "loss": 0.076, "step": 1539 }, { "epoch": 1.1552888222055513, "grad_norm": 0.17275682091712952, "learning_rate": 4.51948584404596e-05, "loss": 0.042, "step": 1540 }, { "epoch": 1.1560390097524382, "grad_norm": 0.15596450865268707, "learning_rate": 4.5129688516892264e-05, "loss": 0.0429, "step": 1541 }, { "epoch": 1.1567891972993247, "grad_norm": 0.16053304076194763, "learning_rate": 4.506452694542953e-05, "loss": 0.0463, "step": 1542 }, { "epoch": 1.1575393848462117, "grad_norm": 0.1733517348766327, "learning_rate": 4.499937383781708e-05, "loss": 0.0432, "step": 1543 }, { "epoch": 1.1582895723930982, "grad_norm": 0.14224635064601898, "learning_rate": 4.493422930578605e-05, "loss": 0.0334, "step": 1544 }, { "epoch": 1.159039759939985, "grad_norm": 0.1554994285106659, "learning_rate": 4.486909346105296e-05, "loss": 0.0477, "step": 1545 }, { "epoch": 1.1597899474868718, "grad_norm": 0.1568094938993454, "learning_rate": 4.480396641531932e-05, "loss": 0.0415, "step": 1546 }, { "epoch": 1.1605401350337585, "grad_norm": 0.15656742453575134, "learning_rate": 4.4738848280271626e-05, "loss": 0.0466, "step": 1547 }, { "epoch": 1.1612903225806452, "grad_norm": 0.10462535917758942, "learning_rate": 4.467373916758105e-05, "loss": 0.0191, "step": 1548 }, { "epoch": 1.162040510127532, "grad_norm": 0.2560919225215912, "learning_rate": 4.460863918890333e-05, "loss": 0.0794, "step": 1549 }, { "epoch": 1.1627906976744187, "grad_norm": 0.1778658926486969, "learning_rate": 4.454354845587849e-05, "loss": 0.0448, "step": 1550 }, { "epoch": 1.1635408852213054, "grad_norm": 0.13617105782032013, "learning_rate": 4.4478467080130734e-05, "loss": 0.0437, "step": 1551 }, { "epoch": 1.164291072768192, "grad_norm": 0.22036074101924896, "learning_rate": 4.4413395173268243e-05, "loss": 0.0498, "step": 1552 }, { "epoch": 1.1650412603150788, "grad_norm": 0.23211778700351715, "learning_rate": 4.43483328468829e-05, "loss": 0.0546, "step": 1553 }, { "epoch": 1.1657914478619655, "grad_norm": 0.18650752305984497, "learning_rate": 4.4283280212550194e-05, "loss": 0.0471, "step": 1554 }, { "epoch": 1.1665416354088523, "grad_norm": 0.14625145494937897, "learning_rate": 4.421823738182898e-05, "loss": 0.0341, "step": 1555 }, { "epoch": 1.167291822955739, "grad_norm": 0.17568133771419525, "learning_rate": 4.4153204466261334e-05, "loss": 0.0489, "step": 1556 }, { "epoch": 1.1680420105026257, "grad_norm": 0.15340299904346466, "learning_rate": 4.408818157737227e-05, "loss": 0.0426, "step": 1557 }, { "epoch": 1.1687921980495124, "grad_norm": 0.17339184880256653, "learning_rate": 4.402316882666964e-05, "loss": 0.0457, "step": 1558 }, { "epoch": 1.1695423855963991, "grad_norm": 0.25148463249206543, "learning_rate": 4.395816632564393e-05, "loss": 0.0479, "step": 1559 }, { "epoch": 1.1702925731432858, "grad_norm": 0.1664632111787796, "learning_rate": 4.3893174185768045e-05, "loss": 0.0466, "step": 1560 }, { "epoch": 1.1710427606901725, "grad_norm": 0.14164632558822632, "learning_rate": 4.382819251849707e-05, "loss": 0.0288, "step": 1561 }, { "epoch": 1.1717929482370593, "grad_norm": 0.11925794184207916, "learning_rate": 4.376322143526818e-05, "loss": 0.0349, "step": 1562 }, { "epoch": 1.172543135783946, "grad_norm": 0.1766175925731659, "learning_rate": 4.3698261047500376e-05, "loss": 0.0427, "step": 1563 }, { "epoch": 1.1732933233308327, "grad_norm": 0.18459202349185944, "learning_rate": 4.3633311466594345e-05, "loss": 0.0401, "step": 1564 }, { "epoch": 1.1740435108777194, "grad_norm": 0.20463117957115173, "learning_rate": 4.3568372803932195e-05, "loss": 0.0382, "step": 1565 }, { "epoch": 1.1747936984246061, "grad_norm": 0.20283189415931702, "learning_rate": 4.3503445170877354e-05, "loss": 0.0423, "step": 1566 }, { "epoch": 1.1755438859714928, "grad_norm": 0.1398739516735077, "learning_rate": 4.343852867877433e-05, "loss": 0.0371, "step": 1567 }, { "epoch": 1.1762940735183796, "grad_norm": 0.19692690670490265, "learning_rate": 4.3373623438948496e-05, "loss": 0.044, "step": 1568 }, { "epoch": 1.1770442610652663, "grad_norm": 0.11689583212137222, "learning_rate": 4.330872956270596e-05, "loss": 0.0393, "step": 1569 }, { "epoch": 1.177794448612153, "grad_norm": 0.17780661582946777, "learning_rate": 4.324384716133332e-05, "loss": 0.0449, "step": 1570 }, { "epoch": 1.1785446361590397, "grad_norm": 0.2003779113292694, "learning_rate": 4.317897634609751e-05, "loss": 0.0537, "step": 1571 }, { "epoch": 1.1792948237059264, "grad_norm": 0.20001736283302307, "learning_rate": 4.3114117228245565e-05, "loss": 0.0445, "step": 1572 }, { "epoch": 1.1800450112528131, "grad_norm": 0.19485583901405334, "learning_rate": 4.304926991900453e-05, "loss": 0.0503, "step": 1573 }, { "epoch": 1.1807951987996999, "grad_norm": 0.20719118416309357, "learning_rate": 4.298443452958113e-05, "loss": 0.0432, "step": 1574 }, { "epoch": 1.1815453863465866, "grad_norm": 0.21073326468467712, "learning_rate": 4.291961117116168e-05, "loss": 0.0429, "step": 1575 }, { "epoch": 1.1822955738934733, "grad_norm": 0.11755035817623138, "learning_rate": 4.285479995491185e-05, "loss": 0.03, "step": 1576 }, { "epoch": 1.18304576144036, "grad_norm": 0.1673261970281601, "learning_rate": 4.279000099197651e-05, "loss": 0.0404, "step": 1577 }, { "epoch": 1.1837959489872467, "grad_norm": 0.13132010400295258, "learning_rate": 4.272521439347947e-05, "loss": 0.0326, "step": 1578 }, { "epoch": 1.1845461365341334, "grad_norm": 0.18214571475982666, "learning_rate": 4.26604402705234e-05, "loss": 0.0395, "step": 1579 }, { "epoch": 1.1852963240810204, "grad_norm": 0.15369313955307007, "learning_rate": 4.259567873418952e-05, "loss": 0.0404, "step": 1580 }, { "epoch": 1.1860465116279069, "grad_norm": 0.21026532351970673, "learning_rate": 4.25309298955375e-05, "loss": 0.0619, "step": 1581 }, { "epoch": 1.1867966991747938, "grad_norm": 0.2855604290962219, "learning_rate": 4.246619386560521e-05, "loss": 0.0477, "step": 1582 }, { "epoch": 1.1875468867216805, "grad_norm": 0.10382580757141113, "learning_rate": 4.240147075540858e-05, "loss": 0.0271, "step": 1583 }, { "epoch": 1.1882970742685672, "grad_norm": 0.11604207754135132, "learning_rate": 4.233676067594137e-05, "loss": 0.0383, "step": 1584 }, { "epoch": 1.189047261815454, "grad_norm": 0.18044514954090118, "learning_rate": 4.227206373817497e-05, "loss": 0.0517, "step": 1585 }, { "epoch": 1.1897974493623407, "grad_norm": 0.1902741938829422, "learning_rate": 4.220738005305827e-05, "loss": 0.0564, "step": 1586 }, { "epoch": 1.1905476369092274, "grad_norm": 0.1472162902355194, "learning_rate": 4.214270973151745e-05, "loss": 0.0502, "step": 1587 }, { "epoch": 1.191297824456114, "grad_norm": 0.17314302921295166, "learning_rate": 4.207805288445571e-05, "loss": 0.0493, "step": 1588 }, { "epoch": 1.1920480120030008, "grad_norm": 0.17650650441646576, "learning_rate": 4.201340962275318e-05, "loss": 0.0666, "step": 1589 }, { "epoch": 1.1927981995498875, "grad_norm": 0.12752214074134827, "learning_rate": 4.194878005726671e-05, "loss": 0.0296, "step": 1590 }, { "epoch": 1.1935483870967742, "grad_norm": 0.16013704240322113, "learning_rate": 4.1884164298829615e-05, "loss": 0.0344, "step": 1591 }, { "epoch": 1.194298574643661, "grad_norm": 0.1943400651216507, "learning_rate": 4.181956245825158e-05, "loss": 0.0401, "step": 1592 }, { "epoch": 1.1950487621905477, "grad_norm": 0.1719129979610443, "learning_rate": 4.1754974646318365e-05, "loss": 0.0461, "step": 1593 }, { "epoch": 1.1957989497374344, "grad_norm": 0.1805904060602188, "learning_rate": 4.1690400973791756e-05, "loss": 0.0496, "step": 1594 }, { "epoch": 1.196549137284321, "grad_norm": 0.1407364308834076, "learning_rate": 4.1625841551409195e-05, "loss": 0.045, "step": 1595 }, { "epoch": 1.1972993248312078, "grad_norm": 0.21922746300697327, "learning_rate": 4.156129648988376e-05, "loss": 0.0575, "step": 1596 }, { "epoch": 1.1980495123780945, "grad_norm": 0.08665847778320312, "learning_rate": 4.149676589990388e-05, "loss": 0.0275, "step": 1597 }, { "epoch": 1.1987996999249813, "grad_norm": 0.30365556478500366, "learning_rate": 4.143224989213315e-05, "loss": 0.0697, "step": 1598 }, { "epoch": 1.199549887471868, "grad_norm": 0.1910962462425232, "learning_rate": 4.136774857721017e-05, "loss": 0.0534, "step": 1599 }, { "epoch": 1.2003000750187547, "grad_norm": 0.14763104915618896, "learning_rate": 4.130326206574834e-05, "loss": 0.0486, "step": 1600 }, { "epoch": 1.2003000750187547, "eval_loss": 0.06795958429574966, "eval_runtime": 2.6514, "eval_samples_per_second": 20.366, "eval_steps_per_second": 5.28, "step": 1600 }, { "epoch": 1.2010502625656414, "grad_norm": 0.14768727123737335, "learning_rate": 4.1238790468335685e-05, "loss": 0.0339, "step": 1601 }, { "epoch": 1.2018004501125281, "grad_norm": 0.17933112382888794, "learning_rate": 4.117433389553466e-05, "loss": 0.041, "step": 1602 }, { "epoch": 1.2025506376594148, "grad_norm": 0.1807253360748291, "learning_rate": 4.1109892457881924e-05, "loss": 0.049, "step": 1603 }, { "epoch": 1.2033008252063015, "grad_norm": 0.13722018897533417, "learning_rate": 4.1045466265888195e-05, "loss": 0.0468, "step": 1604 }, { "epoch": 1.2040510127531883, "grad_norm": 0.18306781351566315, "learning_rate": 4.0981055430038055e-05, "loss": 0.0366, "step": 1605 }, { "epoch": 1.204801200300075, "grad_norm": 0.17573761940002441, "learning_rate": 4.091666006078974e-05, "loss": 0.0466, "step": 1606 }, { "epoch": 1.2055513878469617, "grad_norm": 0.15146800875663757, "learning_rate": 4.085228026857498e-05, "loss": 0.0456, "step": 1607 }, { "epoch": 1.2063015753938484, "grad_norm": 0.18410256505012512, "learning_rate": 4.0787916163798743e-05, "loss": 0.0438, "step": 1608 }, { "epoch": 1.2070517629407351, "grad_norm": 0.24622641503810883, "learning_rate": 4.0723567856839184e-05, "loss": 0.073, "step": 1609 }, { "epoch": 1.2078019504876218, "grad_norm": 0.15817995369434357, "learning_rate": 4.0659235458047264e-05, "loss": 0.0362, "step": 1610 }, { "epoch": 1.2085521380345086, "grad_norm": 0.1328483670949936, "learning_rate": 4.0594919077746734e-05, "loss": 0.0281, "step": 1611 }, { "epoch": 1.2093023255813953, "grad_norm": 0.2026902437210083, "learning_rate": 4.053061882623386e-05, "loss": 0.0549, "step": 1612 }, { "epoch": 1.210052513128282, "grad_norm": 0.15139414370059967, "learning_rate": 4.0466334813777216e-05, "loss": 0.0439, "step": 1613 }, { "epoch": 1.2108027006751687, "grad_norm": 0.11512304097414017, "learning_rate": 4.040206715061758e-05, "loss": 0.0335, "step": 1614 }, { "epoch": 1.2115528882220554, "grad_norm": 0.1752549260854721, "learning_rate": 4.033781594696767e-05, "loss": 0.0481, "step": 1615 }, { "epoch": 1.2123030757689421, "grad_norm": 0.13294178247451782, "learning_rate": 4.027358131301194e-05, "loss": 0.03, "step": 1616 }, { "epoch": 1.213053263315829, "grad_norm": 0.3617671728134155, "learning_rate": 4.0209363358906495e-05, "loss": 0.0489, "step": 1617 }, { "epoch": 1.2138034508627156, "grad_norm": 0.20684273540973663, "learning_rate": 4.014516219477878e-05, "loss": 0.0548, "step": 1618 }, { "epoch": 1.2145536384096025, "grad_norm": 0.16169099509716034, "learning_rate": 4.008097793072749e-05, "loss": 0.0528, "step": 1619 }, { "epoch": 1.215303825956489, "grad_norm": 0.16483749449253082, "learning_rate": 4.00168106768223e-05, "loss": 0.055, "step": 1620 }, { "epoch": 1.216054013503376, "grad_norm": 0.11685378104448318, "learning_rate": 3.9952660543103734e-05, "loss": 0.0384, "step": 1621 }, { "epoch": 1.2168042010502627, "grad_norm": 0.2155417501926422, "learning_rate": 3.988852763958297e-05, "loss": 0.0437, "step": 1622 }, { "epoch": 1.2175543885971494, "grad_norm": 0.21236859261989594, "learning_rate": 3.9824412076241595e-05, "loss": 0.0675, "step": 1623 }, { "epoch": 1.218304576144036, "grad_norm": 0.18504956364631653, "learning_rate": 3.9760313963031516e-05, "loss": 0.0522, "step": 1624 }, { "epoch": 1.2190547636909228, "grad_norm": 0.12959396839141846, "learning_rate": 3.9696233409874654e-05, "loss": 0.0456, "step": 1625 }, { "epoch": 1.2198049512378095, "grad_norm": 0.1511463224887848, "learning_rate": 3.963217052666287e-05, "loss": 0.0311, "step": 1626 }, { "epoch": 1.2205551387846962, "grad_norm": 0.1537894457578659, "learning_rate": 3.956812542325769e-05, "loss": 0.0418, "step": 1627 }, { "epoch": 1.221305326331583, "grad_norm": 0.19139213860034943, "learning_rate": 3.950409820949018e-05, "loss": 0.0635, "step": 1628 }, { "epoch": 1.2220555138784697, "grad_norm": 0.28957399725914, "learning_rate": 3.9440088995160676e-05, "loss": 0.0867, "step": 1629 }, { "epoch": 1.2228057014253564, "grad_norm": 0.15784677863121033, "learning_rate": 3.937609789003871e-05, "loss": 0.0456, "step": 1630 }, { "epoch": 1.223555888972243, "grad_norm": 0.11543436348438263, "learning_rate": 3.93121250038627e-05, "loss": 0.0415, "step": 1631 }, { "epoch": 1.2243060765191298, "grad_norm": 0.12811176478862762, "learning_rate": 3.924817044633985e-05, "loss": 0.0303, "step": 1632 }, { "epoch": 1.2250562640660165, "grad_norm": 0.1936388462781906, "learning_rate": 3.9184234327145954e-05, "loss": 0.0455, "step": 1633 }, { "epoch": 1.2258064516129032, "grad_norm": 0.14532004296779633, "learning_rate": 3.912031675592512e-05, "loss": 0.0456, "step": 1634 }, { "epoch": 1.22655663915979, "grad_norm": 0.1502491980791092, "learning_rate": 3.905641784228972e-05, "loss": 0.029, "step": 1635 }, { "epoch": 1.2273068267066767, "grad_norm": 0.17692214250564575, "learning_rate": 3.899253769582008e-05, "loss": 0.0432, "step": 1636 }, { "epoch": 1.2280570142535634, "grad_norm": 0.18477126955986023, "learning_rate": 3.8928676426064376e-05, "loss": 0.0387, "step": 1637 }, { "epoch": 1.22880720180045, "grad_norm": 0.2677137851715088, "learning_rate": 3.886483414253838e-05, "loss": 0.0412, "step": 1638 }, { "epoch": 1.2295573893473368, "grad_norm": 0.1280679851770401, "learning_rate": 3.880101095472535e-05, "loss": 0.0333, "step": 1639 }, { "epoch": 1.2303075768942235, "grad_norm": 0.18574625253677368, "learning_rate": 3.873720697207572e-05, "loss": 0.0414, "step": 1640 }, { "epoch": 1.2310577644411103, "grad_norm": 0.13856203854084015, "learning_rate": 3.867342230400707e-05, "loss": 0.0311, "step": 1641 }, { "epoch": 1.231807951987997, "grad_norm": 0.13447070121765137, "learning_rate": 3.860965705990383e-05, "loss": 0.0324, "step": 1642 }, { "epoch": 1.2325581395348837, "grad_norm": 0.24870045483112335, "learning_rate": 3.8545911349117114e-05, "loss": 0.0619, "step": 1643 }, { "epoch": 1.2333083270817704, "grad_norm": 0.1902378350496292, "learning_rate": 3.848218528096452e-05, "loss": 0.0339, "step": 1644 }, { "epoch": 1.2340585146286571, "grad_norm": 0.18414323031902313, "learning_rate": 3.841847896473001e-05, "loss": 0.0388, "step": 1645 }, { "epoch": 1.2348087021755438, "grad_norm": 0.14399470388889313, "learning_rate": 3.83547925096636e-05, "loss": 0.0362, "step": 1646 }, { "epoch": 1.2355588897224306, "grad_norm": 0.1719599962234497, "learning_rate": 3.829112602498132e-05, "loss": 0.0626, "step": 1647 }, { "epoch": 1.2363090772693173, "grad_norm": 0.1415622979402542, "learning_rate": 3.822747961986493e-05, "loss": 0.0371, "step": 1648 }, { "epoch": 1.237059264816204, "grad_norm": 0.15044745802879333, "learning_rate": 3.816385340346171e-05, "loss": 0.0311, "step": 1649 }, { "epoch": 1.2378094523630907, "grad_norm": 0.18542815744876862, "learning_rate": 3.81002474848844e-05, "loss": 0.0399, "step": 1650 }, { "epoch": 1.2385596399099774, "grad_norm": 0.17152507603168488, "learning_rate": 3.803666197321084e-05, "loss": 0.0436, "step": 1651 }, { "epoch": 1.2393098274568641, "grad_norm": 0.19231060147285461, "learning_rate": 3.797309697748396e-05, "loss": 0.0457, "step": 1652 }, { "epoch": 1.2400600150037508, "grad_norm": 0.1155783087015152, "learning_rate": 3.7909552606711454e-05, "loss": 0.0245, "step": 1653 }, { "epoch": 1.2408102025506376, "grad_norm": 0.14751829206943512, "learning_rate": 3.784602896986566e-05, "loss": 0.0289, "step": 1654 }, { "epoch": 1.2415603900975243, "grad_norm": 0.17149440944194794, "learning_rate": 3.778252617588334e-05, "loss": 0.0265, "step": 1655 }, { "epoch": 1.2423105776444112, "grad_norm": 0.16274116933345795, "learning_rate": 3.771904433366557e-05, "loss": 0.0379, "step": 1656 }, { "epoch": 1.2430607651912977, "grad_norm": 0.17734788358211517, "learning_rate": 3.7655583552077446e-05, "loss": 0.0609, "step": 1657 }, { "epoch": 1.2438109527381846, "grad_norm": 0.16764874756336212, "learning_rate": 3.7592143939947955e-05, "loss": 0.0342, "step": 1658 }, { "epoch": 1.2445611402850711, "grad_norm": 0.14420807361602783, "learning_rate": 3.7528725606069774e-05, "loss": 0.0497, "step": 1659 }, { "epoch": 1.245311327831958, "grad_norm": 0.17944395542144775, "learning_rate": 3.746532865919913e-05, "loss": 0.0358, "step": 1660 }, { "epoch": 1.2460615153788448, "grad_norm": 0.18467660248279572, "learning_rate": 3.740195320805551e-05, "loss": 0.0375, "step": 1661 }, { "epoch": 1.2468117029257315, "grad_norm": 0.23316818475723267, "learning_rate": 3.733859936132158e-05, "loss": 0.0444, "step": 1662 }, { "epoch": 1.2475618904726182, "grad_norm": 0.21662329137325287, "learning_rate": 3.727526722764297e-05, "loss": 0.0542, "step": 1663 }, { "epoch": 1.248312078019505, "grad_norm": 0.15543553233146667, "learning_rate": 3.7211956915628035e-05, "loss": 0.0433, "step": 1664 }, { "epoch": 1.2490622655663917, "grad_norm": 0.1937057077884674, "learning_rate": 3.7148668533847744e-05, "loss": 0.05, "step": 1665 }, { "epoch": 1.2498124531132784, "grad_norm": 0.18624050915241241, "learning_rate": 3.7085402190835406e-05, "loss": 0.0468, "step": 1666 }, { "epoch": 1.250562640660165, "grad_norm": 0.2042441964149475, "learning_rate": 3.702215799508659e-05, "loss": 0.0623, "step": 1667 }, { "epoch": 1.2513128282070518, "grad_norm": 0.15845929086208344, "learning_rate": 3.695893605505887e-05, "loss": 0.0487, "step": 1668 }, { "epoch": 1.2520630157539385, "grad_norm": 0.13065217435359955, "learning_rate": 3.689573647917162e-05, "loss": 0.039, "step": 1669 }, { "epoch": 1.2528132033008252, "grad_norm": 0.2207261621952057, "learning_rate": 3.683255937580592e-05, "loss": 0.0476, "step": 1670 }, { "epoch": 1.253563390847712, "grad_norm": 0.18256017565727234, "learning_rate": 3.6769404853304276e-05, "loss": 0.0377, "step": 1671 }, { "epoch": 1.2543135783945987, "grad_norm": 0.15684552490711212, "learning_rate": 3.670627301997047e-05, "loss": 0.0527, "step": 1672 }, { "epoch": 1.2550637659414854, "grad_norm": 0.13574464619159698, "learning_rate": 3.664316398406939e-05, "loss": 0.0327, "step": 1673 }, { "epoch": 1.255813953488372, "grad_norm": 0.14744171500205994, "learning_rate": 3.658007785382679e-05, "loss": 0.0409, "step": 1674 }, { "epoch": 1.2565641410352588, "grad_norm": 0.1688900589942932, "learning_rate": 3.65170147374292e-05, "loss": 0.0461, "step": 1675 }, { "epoch": 1.2573143285821455, "grad_norm": 0.14750975370407104, "learning_rate": 3.645397474302363e-05, "loss": 0.039, "step": 1676 }, { "epoch": 1.2580645161290323, "grad_norm": 0.18498747050762177, "learning_rate": 3.639095797871748e-05, "loss": 0.0432, "step": 1677 }, { "epoch": 1.258814703675919, "grad_norm": 0.19537626206874847, "learning_rate": 3.63279645525783e-05, "loss": 0.0564, "step": 1678 }, { "epoch": 1.2595648912228057, "grad_norm": 0.11299839615821838, "learning_rate": 3.626499457263359e-05, "loss": 0.0265, "step": 1679 }, { "epoch": 1.2603150787696924, "grad_norm": 0.20593377947807312, "learning_rate": 3.620204814687069e-05, "loss": 0.0464, "step": 1680 }, { "epoch": 1.2610652663165791, "grad_norm": 0.16698966920375824, "learning_rate": 3.61391253832365e-05, "loss": 0.0484, "step": 1681 }, { "epoch": 1.2618154538634658, "grad_norm": 0.18022559583187103, "learning_rate": 3.607622638963739e-05, "loss": 0.0477, "step": 1682 }, { "epoch": 1.2625656414103525, "grad_norm": 0.18286746740341187, "learning_rate": 3.601335127393889e-05, "loss": 0.0473, "step": 1683 }, { "epoch": 1.2633158289572393, "grad_norm": 0.20090457797050476, "learning_rate": 3.59505001439657e-05, "loss": 0.0599, "step": 1684 }, { "epoch": 1.264066016504126, "grad_norm": 0.2767641544342041, "learning_rate": 3.588767310750127e-05, "loss": 0.0922, "step": 1685 }, { "epoch": 1.2648162040510127, "grad_norm": 0.2000061422586441, "learning_rate": 3.5824870272287815e-05, "loss": 0.0663, "step": 1686 }, { "epoch": 1.2655663915978994, "grad_norm": 0.2778478264808655, "learning_rate": 3.576209174602597e-05, "loss": 0.0472, "step": 1687 }, { "epoch": 1.2663165791447861, "grad_norm": 0.19717861711978912, "learning_rate": 3.569933763637477e-05, "loss": 0.0452, "step": 1688 }, { "epoch": 1.2670667666916728, "grad_norm": 0.11952456831932068, "learning_rate": 3.56366080509513e-05, "loss": 0.0273, "step": 1689 }, { "epoch": 1.2678169542385596, "grad_norm": 0.22591383755207062, "learning_rate": 3.557390309733065e-05, "loss": 0.0721, "step": 1690 }, { "epoch": 1.2685671417854465, "grad_norm": 0.1697269082069397, "learning_rate": 3.551122288304561e-05, "loss": 0.0456, "step": 1691 }, { "epoch": 1.269317329332333, "grad_norm": 0.1867348998785019, "learning_rate": 3.544856751558659e-05, "loss": 0.0837, "step": 1692 }, { "epoch": 1.27006751687922, "grad_norm": 0.16158032417297363, "learning_rate": 3.538593710240139e-05, "loss": 0.0304, "step": 1693 }, { "epoch": 1.2708177044261064, "grad_norm": 0.15402421355247498, "learning_rate": 3.532333175089498e-05, "loss": 0.0466, "step": 1694 }, { "epoch": 1.2715678919729934, "grad_norm": 0.18709032237529755, "learning_rate": 3.526075156842938e-05, "loss": 0.0432, "step": 1695 }, { "epoch": 1.2723180795198799, "grad_norm": 0.23994222283363342, "learning_rate": 3.519819666232345e-05, "loss": 0.0497, "step": 1696 }, { "epoch": 1.2730682670667668, "grad_norm": 0.12393349409103394, "learning_rate": 3.5135667139852654e-05, "loss": 0.0319, "step": 1697 }, { "epoch": 1.2738184546136533, "grad_norm": 0.17412930727005005, "learning_rate": 3.507316310824902e-05, "loss": 0.0488, "step": 1698 }, { "epoch": 1.2745686421605402, "grad_norm": 0.1547398716211319, "learning_rate": 3.50106846747008e-05, "loss": 0.045, "step": 1699 }, { "epoch": 1.275318829707427, "grad_norm": 0.18785391747951508, "learning_rate": 3.4948231946352314e-05, "loss": 0.0541, "step": 1700 }, { "epoch": 1.2760690172543137, "grad_norm": 0.11669635027647018, "learning_rate": 3.488580503030389e-05, "loss": 0.0396, "step": 1701 }, { "epoch": 1.2768192048012004, "grad_norm": 0.16712819039821625, "learning_rate": 3.482340403361151e-05, "loss": 0.0411, "step": 1702 }, { "epoch": 1.277569392348087, "grad_norm": 0.17171481251716614, "learning_rate": 3.4761029063286745e-05, "loss": 0.0426, "step": 1703 }, { "epoch": 1.2783195798949738, "grad_norm": 0.14547479152679443, "learning_rate": 3.4698680226296526e-05, "loss": 0.0384, "step": 1704 }, { "epoch": 1.2790697674418605, "grad_norm": 0.18368588387966156, "learning_rate": 3.4636357629562986e-05, "loss": 0.05, "step": 1705 }, { "epoch": 1.2798199549887472, "grad_norm": 0.17419405281543732, "learning_rate": 3.457406137996321e-05, "loss": 0.052, "step": 1706 }, { "epoch": 1.280570142535634, "grad_norm": 0.13105162978172302, "learning_rate": 3.4511791584329154e-05, "loss": 0.0368, "step": 1707 }, { "epoch": 1.2813203300825207, "grad_norm": 0.1408979743719101, "learning_rate": 3.4449548349447394e-05, "loss": 0.0459, "step": 1708 }, { "epoch": 1.2820705176294074, "grad_norm": 0.15759265422821045, "learning_rate": 3.438733178205892e-05, "loss": 0.0293, "step": 1709 }, { "epoch": 1.282820705176294, "grad_norm": 0.18341770768165588, "learning_rate": 3.4325141988859046e-05, "loss": 0.0572, "step": 1710 }, { "epoch": 1.2835708927231808, "grad_norm": 0.1435495913028717, "learning_rate": 3.426297907649711e-05, "loss": 0.028, "step": 1711 }, { "epoch": 1.2843210802700675, "grad_norm": 0.1692110002040863, "learning_rate": 3.4200843151576414e-05, "loss": 0.0424, "step": 1712 }, { "epoch": 1.2850712678169542, "grad_norm": 0.13991141319274902, "learning_rate": 3.413873432065394e-05, "loss": 0.0439, "step": 1713 }, { "epoch": 1.285821455363841, "grad_norm": 0.1332203447818756, "learning_rate": 3.407665269024024e-05, "loss": 0.0324, "step": 1714 }, { "epoch": 1.2865716429107277, "grad_norm": 0.26455023884773254, "learning_rate": 3.401459836679917e-05, "loss": 0.0664, "step": 1715 }, { "epoch": 1.2873218304576144, "grad_norm": 0.14125743508338928, "learning_rate": 3.39525714567478e-05, "loss": 0.0454, "step": 1716 }, { "epoch": 1.288072018004501, "grad_norm": 0.17687004804611206, "learning_rate": 3.389057206645614e-05, "loss": 0.0413, "step": 1717 }, { "epoch": 1.2888222055513878, "grad_norm": 0.21190932393074036, "learning_rate": 3.382860030224708e-05, "loss": 0.0421, "step": 1718 }, { "epoch": 1.2895723930982745, "grad_norm": 0.16498279571533203, "learning_rate": 3.3766656270396074e-05, "loss": 0.0589, "step": 1719 }, { "epoch": 1.2903225806451613, "grad_norm": 0.20356465876102448, "learning_rate": 3.3704740077131036e-05, "loss": 0.0408, "step": 1720 }, { "epoch": 1.291072768192048, "grad_norm": 0.12303636223077774, "learning_rate": 3.3642851828632155e-05, "loss": 0.0296, "step": 1721 }, { "epoch": 1.2918229557389347, "grad_norm": 0.1438593566417694, "learning_rate": 3.3580991631031656e-05, "loss": 0.0349, "step": 1722 }, { "epoch": 1.2925731432858214, "grad_norm": 0.13478383421897888, "learning_rate": 3.3519159590413715e-05, "loss": 0.0339, "step": 1723 }, { "epoch": 1.2933233308327081, "grad_norm": 0.20928816497325897, "learning_rate": 3.345735581281417e-05, "loss": 0.0705, "step": 1724 }, { "epoch": 1.2940735183795948, "grad_norm": 0.15648962557315826, "learning_rate": 3.339558040422042e-05, "loss": 0.0361, "step": 1725 }, { "epoch": 1.2948237059264815, "grad_norm": 0.14362314343452454, "learning_rate": 3.333383347057123e-05, "loss": 0.0453, "step": 1726 }, { "epoch": 1.2955738934733683, "grad_norm": 0.2117234170436859, "learning_rate": 3.3272115117756476e-05, "loss": 0.0385, "step": 1727 }, { "epoch": 1.296324081020255, "grad_norm": 0.1841832548379898, "learning_rate": 3.3210425451617074e-05, "loss": 0.0396, "step": 1728 }, { "epoch": 1.2970742685671417, "grad_norm": 0.13548365235328674, "learning_rate": 3.314876457794474e-05, "loss": 0.0256, "step": 1729 }, { "epoch": 1.2978244561140286, "grad_norm": 0.1532963514328003, "learning_rate": 3.3087132602481774e-05, "loss": 0.0306, "step": 1730 }, { "epoch": 1.2985746436609151, "grad_norm": 0.14116926491260529, "learning_rate": 3.302552963092096e-05, "loss": 0.0312, "step": 1731 }, { "epoch": 1.299324831207802, "grad_norm": 0.1805046647787094, "learning_rate": 3.296395576890532e-05, "loss": 0.0597, "step": 1732 }, { "epoch": 1.3000750187546886, "grad_norm": 0.20268777012825012, "learning_rate": 3.290241112202797e-05, "loss": 0.051, "step": 1733 }, { "epoch": 1.3008252063015755, "grad_norm": 0.45598918199539185, "learning_rate": 3.284089579583192e-05, "loss": 0.0613, "step": 1734 }, { "epoch": 1.301575393848462, "grad_norm": 0.15112002193927765, "learning_rate": 3.2779409895809886e-05, "loss": 0.0344, "step": 1735 }, { "epoch": 1.302325581395349, "grad_norm": 0.13411392271518707, "learning_rate": 3.2717953527404155e-05, "loss": 0.0368, "step": 1736 }, { "epoch": 1.3030757689422354, "grad_norm": 0.2052137404680252, "learning_rate": 3.265652679600631e-05, "loss": 0.0556, "step": 1737 }, { "epoch": 1.3038259564891224, "grad_norm": 0.21926912665367126, "learning_rate": 3.25951298069572e-05, "loss": 0.0439, "step": 1738 }, { "epoch": 1.304576144036009, "grad_norm": 0.18303723633289337, "learning_rate": 3.253376266554655e-05, "loss": 0.0514, "step": 1739 }, { "epoch": 1.3053263315828958, "grad_norm": 0.188846617937088, "learning_rate": 3.247242547701301e-05, "loss": 0.0406, "step": 1740 }, { "epoch": 1.3060765191297825, "grad_norm": 0.22483086585998535, "learning_rate": 3.241111834654382e-05, "loss": 0.0608, "step": 1741 }, { "epoch": 1.3068267066766692, "grad_norm": 0.19677475094795227, "learning_rate": 3.234984137927464e-05, "loss": 0.0471, "step": 1742 }, { "epoch": 1.307576894223556, "grad_norm": 0.20563624799251556, "learning_rate": 3.228859468028946e-05, "loss": 0.0508, "step": 1743 }, { "epoch": 1.3083270817704427, "grad_norm": 0.3104844391345978, "learning_rate": 3.222737835462034e-05, "loss": 0.0721, "step": 1744 }, { "epoch": 1.3090772693173294, "grad_norm": 0.12385188043117523, "learning_rate": 3.216619250724724e-05, "loss": 0.0274, "step": 1745 }, { "epoch": 1.309827456864216, "grad_norm": 0.16119906306266785, "learning_rate": 3.2105037243097866e-05, "loss": 0.0411, "step": 1746 }, { "epoch": 1.3105776444111028, "grad_norm": 0.2001246064901352, "learning_rate": 3.2043912667047465e-05, "loss": 0.0504, "step": 1747 }, { "epoch": 1.3113278319579895, "grad_norm": 0.14651797711849213, "learning_rate": 3.198281888391869e-05, "loss": 0.033, "step": 1748 }, { "epoch": 1.3120780195048762, "grad_norm": 0.18450623750686646, "learning_rate": 3.192175599848133e-05, "loss": 0.0469, "step": 1749 }, { "epoch": 1.312828207051763, "grad_norm": 0.1982034295797348, "learning_rate": 3.1860724115452234e-05, "loss": 0.0459, "step": 1750 }, { "epoch": 1.3135783945986497, "grad_norm": 0.12932223081588745, "learning_rate": 3.179972333949509e-05, "loss": 0.0352, "step": 1751 }, { "epoch": 1.3143285821455364, "grad_norm": 0.18193656206130981, "learning_rate": 3.173875377522019e-05, "loss": 0.0422, "step": 1752 }, { "epoch": 1.315078769692423, "grad_norm": 0.18322080373764038, "learning_rate": 3.167781552718435e-05, "loss": 0.0373, "step": 1753 }, { "epoch": 1.3158289572393098, "grad_norm": 0.16063223779201508, "learning_rate": 3.161690869989068e-05, "loss": 0.0458, "step": 1754 }, { "epoch": 1.3165791447861965, "grad_norm": 0.1955423504114151, "learning_rate": 3.155603339778837e-05, "loss": 0.0442, "step": 1755 }, { "epoch": 1.3173293323330832, "grad_norm": 0.18707959353923798, "learning_rate": 3.149518972527257e-05, "loss": 0.038, "step": 1756 }, { "epoch": 1.31807951987997, "grad_norm": 0.19283926486968994, "learning_rate": 3.1434377786684197e-05, "loss": 0.0521, "step": 1757 }, { "epoch": 1.3188297074268567, "grad_norm": 0.14124998450279236, "learning_rate": 3.137359768630972e-05, "loss": 0.0356, "step": 1758 }, { "epoch": 1.3195798949737434, "grad_norm": 0.17933954298496246, "learning_rate": 3.131284952838106e-05, "loss": 0.039, "step": 1759 }, { "epoch": 1.32033008252063, "grad_norm": 0.13622942566871643, "learning_rate": 3.125213341707528e-05, "loss": 0.0468, "step": 1760 }, { "epoch": 1.3210802700675168, "grad_norm": 0.19437870383262634, "learning_rate": 3.1191449456514575e-05, "loss": 0.0451, "step": 1761 }, { "epoch": 1.3218304576144035, "grad_norm": 0.14863534271717072, "learning_rate": 3.113079775076593e-05, "loss": 0.0361, "step": 1762 }, { "epoch": 1.3225806451612903, "grad_norm": 0.19207067787647247, "learning_rate": 3.107017840384107e-05, "loss": 0.053, "step": 1763 }, { "epoch": 1.323330832708177, "grad_norm": 0.1651337742805481, "learning_rate": 3.100959151969619e-05, "loss": 0.0484, "step": 1764 }, { "epoch": 1.3240810202550637, "grad_norm": 0.1622065007686615, "learning_rate": 3.0949037202231826e-05, "loss": 0.0361, "step": 1765 }, { "epoch": 1.3248312078019504, "grad_norm": 0.20351789891719818, "learning_rate": 3.08885155552927e-05, "loss": 0.0384, "step": 1766 }, { "epoch": 1.3255813953488373, "grad_norm": 0.1585976481437683, "learning_rate": 3.082802668266743e-05, "loss": 0.0434, "step": 1767 }, { "epoch": 1.3263315828957238, "grad_norm": 0.1370503306388855, "learning_rate": 3.076757068808852e-05, "loss": 0.0394, "step": 1768 }, { "epoch": 1.3270817704426108, "grad_norm": 0.2939653694629669, "learning_rate": 3.070714767523203e-05, "loss": 0.0864, "step": 1769 }, { "epoch": 1.3278319579894973, "grad_norm": 0.19903366267681122, "learning_rate": 3.0646757747717475e-05, "loss": 0.0481, "step": 1770 }, { "epoch": 1.3285821455363842, "grad_norm": 0.1912355273962021, "learning_rate": 3.0586401009107636e-05, "loss": 0.0426, "step": 1771 }, { "epoch": 1.3293323330832707, "grad_norm": 0.2867732048034668, "learning_rate": 3.0526077562908386e-05, "loss": 0.0628, "step": 1772 }, { "epoch": 1.3300825206301576, "grad_norm": 0.20162878930568695, "learning_rate": 3.0465787512568466e-05, "loss": 0.0463, "step": 1773 }, { "epoch": 1.3308327081770441, "grad_norm": 0.15980598330497742, "learning_rate": 3.040553096147942e-05, "loss": 0.0481, "step": 1774 }, { "epoch": 1.331582895723931, "grad_norm": 0.20228786766529083, "learning_rate": 3.0345308012975255e-05, "loss": 0.05, "step": 1775 }, { "epoch": 1.3323330832708178, "grad_norm": 0.1600971817970276, "learning_rate": 3.0285118770332428e-05, "loss": 0.0336, "step": 1776 }, { "epoch": 1.3330832708177045, "grad_norm": 0.18647384643554688, "learning_rate": 3.022496333676954e-05, "loss": 0.048, "step": 1777 }, { "epoch": 1.3338334583645912, "grad_norm": 0.18941383063793182, "learning_rate": 3.0164841815447263e-05, "loss": 0.0525, "step": 1778 }, { "epoch": 1.334583645911478, "grad_norm": 0.1617792248725891, "learning_rate": 3.0104754309468066e-05, "loss": 0.036, "step": 1779 }, { "epoch": 1.3353338334583646, "grad_norm": 0.26556825637817383, "learning_rate": 3.00447009218761e-05, "loss": 0.0658, "step": 1780 }, { "epoch": 1.3360840210052514, "grad_norm": 0.17816261947155, "learning_rate": 2.9984681755657017e-05, "loss": 0.0447, "step": 1781 }, { "epoch": 1.336834208552138, "grad_norm": 0.17538125813007355, "learning_rate": 2.9924696913737792e-05, "loss": 0.0626, "step": 1782 }, { "epoch": 1.3375843960990248, "grad_norm": 0.18380333483219147, "learning_rate": 2.986474649898651e-05, "loss": 0.0449, "step": 1783 }, { "epoch": 1.3383345836459115, "grad_norm": 0.17858687043190002, "learning_rate": 2.9804830614212242e-05, "loss": 0.0371, "step": 1784 }, { "epoch": 1.3390847711927982, "grad_norm": 0.19844569265842438, "learning_rate": 2.9744949362164798e-05, "loss": 0.0414, "step": 1785 }, { "epoch": 1.339834958739685, "grad_norm": 0.1734311580657959, "learning_rate": 2.9685102845534658e-05, "loss": 0.042, "step": 1786 }, { "epoch": 1.3405851462865717, "grad_norm": 0.1407657116651535, "learning_rate": 2.9625291166952702e-05, "loss": 0.0615, "step": 1787 }, { "epoch": 1.3413353338334584, "grad_norm": 0.20041944086551666, "learning_rate": 2.956551442899005e-05, "loss": 0.0642, "step": 1788 }, { "epoch": 1.342085521380345, "grad_norm": 0.18350687623023987, "learning_rate": 2.9505772734157948e-05, "loss": 0.0562, "step": 1789 }, { "epoch": 1.3428357089272318, "grad_norm": 0.20079989731311798, "learning_rate": 2.9446066184907495e-05, "loss": 0.0646, "step": 1790 }, { "epoch": 1.3435858964741185, "grad_norm": 0.17485657334327698, "learning_rate": 2.9386394883629565e-05, "loss": 0.0414, "step": 1791 }, { "epoch": 1.3443360840210052, "grad_norm": 0.2188062071800232, "learning_rate": 2.932675893265454e-05, "loss": 0.0834, "step": 1792 }, { "epoch": 1.345086271567892, "grad_norm": 0.1565270721912384, "learning_rate": 2.926715843425223e-05, "loss": 0.0452, "step": 1793 }, { "epoch": 1.3458364591147787, "grad_norm": 0.16900846362113953, "learning_rate": 2.9207593490631592e-05, "loss": 0.0454, "step": 1794 }, { "epoch": 1.3465866466616654, "grad_norm": 0.18272441625595093, "learning_rate": 2.914806420394064e-05, "loss": 0.0546, "step": 1795 }, { "epoch": 1.347336834208552, "grad_norm": 0.18571123480796814, "learning_rate": 2.908857067626629e-05, "loss": 0.0303, "step": 1796 }, { "epoch": 1.3480870217554388, "grad_norm": 0.17477184534072876, "learning_rate": 2.902911300963403e-05, "loss": 0.0569, "step": 1797 }, { "epoch": 1.3488372093023255, "grad_norm": 0.16123870015144348, "learning_rate": 2.8969691306007918e-05, "loss": 0.0316, "step": 1798 }, { "epoch": 1.3495873968492123, "grad_norm": 0.14804422855377197, "learning_rate": 2.891030566729032e-05, "loss": 0.0489, "step": 1799 }, { "epoch": 1.350337584396099, "grad_norm": 0.16255329549312592, "learning_rate": 2.8850956195321795e-05, "loss": 0.0481, "step": 1800 }, { "epoch": 1.350337584396099, "eval_loss": 0.06747418642044067, "eval_runtime": 2.6568, "eval_samples_per_second": 20.325, "eval_steps_per_second": 5.27, "step": 1800 }, { "epoch": 1.3510877719429857, "grad_norm": 0.2039135992527008, "learning_rate": 2.8791642991880784e-05, "loss": 0.0394, "step": 1801 }, { "epoch": 1.3518379594898724, "grad_norm": 0.1728277951478958, "learning_rate": 2.873236615868362e-05, "loss": 0.0559, "step": 1802 }, { "epoch": 1.3525881470367591, "grad_norm": 0.1896568089723587, "learning_rate": 2.8673125797384243e-05, "loss": 0.0436, "step": 1803 }, { "epoch": 1.3533383345836458, "grad_norm": 0.13357765972614288, "learning_rate": 2.8613922009574024e-05, "loss": 0.0338, "step": 1804 }, { "epoch": 1.3540885221305325, "grad_norm": 0.1681651473045349, "learning_rate": 2.8554754896781656e-05, "loss": 0.0479, "step": 1805 }, { "epoch": 1.3548387096774195, "grad_norm": 0.11592215299606323, "learning_rate": 2.8495624560472866e-05, "loss": 0.0266, "step": 1806 }, { "epoch": 1.355588897224306, "grad_norm": 0.22524559497833252, "learning_rate": 2.843653110205039e-05, "loss": 0.0572, "step": 1807 }, { "epoch": 1.356339084771193, "grad_norm": 0.21835164725780487, "learning_rate": 2.8377474622853683e-05, "loss": 0.0618, "step": 1808 }, { "epoch": 1.3570892723180794, "grad_norm": 0.16861413419246674, "learning_rate": 2.8318455224158786e-05, "loss": 0.0376, "step": 1809 }, { "epoch": 1.3578394598649663, "grad_norm": 0.1737951785326004, "learning_rate": 2.8259473007178163e-05, "loss": 0.0469, "step": 1810 }, { "epoch": 1.3585896474118528, "grad_norm": 0.1801527738571167, "learning_rate": 2.8200528073060507e-05, "loss": 0.0421, "step": 1811 }, { "epoch": 1.3593398349587398, "grad_norm": 0.203287273645401, "learning_rate": 2.814162052289058e-05, "loss": 0.0465, "step": 1812 }, { "epoch": 1.3600900225056263, "grad_norm": 0.14048676192760468, "learning_rate": 2.8082750457689033e-05, "loss": 0.0327, "step": 1813 }, { "epoch": 1.3608402100525132, "grad_norm": 0.16412879526615143, "learning_rate": 2.8023917978412207e-05, "loss": 0.0349, "step": 1814 }, { "epoch": 1.3615903975994, "grad_norm": 0.18040567636489868, "learning_rate": 2.7965123185952023e-05, "loss": 0.0466, "step": 1815 }, { "epoch": 1.3623405851462866, "grad_norm": 0.19568736851215363, "learning_rate": 2.7906366181135775e-05, "loss": 0.0449, "step": 1816 }, { "epoch": 1.3630907726931734, "grad_norm": 0.1617717146873474, "learning_rate": 2.7847647064725924e-05, "loss": 0.0372, "step": 1817 }, { "epoch": 1.36384096024006, "grad_norm": 0.14724799990653992, "learning_rate": 2.778896593741999e-05, "loss": 0.0315, "step": 1818 }, { "epoch": 1.3645911477869468, "grad_norm": 0.09599991142749786, "learning_rate": 2.77303228998503e-05, "loss": 0.0195, "step": 1819 }, { "epoch": 1.3653413353338335, "grad_norm": 0.2500494718551636, "learning_rate": 2.7671718052583908e-05, "loss": 0.0723, "step": 1820 }, { "epoch": 1.3660915228807202, "grad_norm": 0.1894386112689972, "learning_rate": 2.7613151496122347e-05, "loss": 0.051, "step": 1821 }, { "epoch": 1.366841710427607, "grad_norm": 0.14874202013015747, "learning_rate": 2.7554623330901524e-05, "loss": 0.0496, "step": 1822 }, { "epoch": 1.3675918979744937, "grad_norm": 0.24037110805511475, "learning_rate": 2.749613365729141e-05, "loss": 0.0723, "step": 1823 }, { "epoch": 1.3683420855213804, "grad_norm": 0.19027547538280487, "learning_rate": 2.7437682575596104e-05, "loss": 0.0434, "step": 1824 }, { "epoch": 1.369092273068267, "grad_norm": 0.23864826560020447, "learning_rate": 2.7379270186053428e-05, "loss": 0.0655, "step": 1825 }, { "epoch": 1.3698424606151538, "grad_norm": 0.1366167664527893, "learning_rate": 2.7320896588834903e-05, "loss": 0.0398, "step": 1826 }, { "epoch": 1.3705926481620405, "grad_norm": 0.17698831856250763, "learning_rate": 2.7262561884045457e-05, "loss": 0.0354, "step": 1827 }, { "epoch": 1.3713428357089272, "grad_norm": 0.15502697229385376, "learning_rate": 2.720426617172339e-05, "loss": 0.0398, "step": 1828 }, { "epoch": 1.372093023255814, "grad_norm": 0.2184450775384903, "learning_rate": 2.71460095518401e-05, "loss": 0.0645, "step": 1829 }, { "epoch": 1.3728432108027007, "grad_norm": 0.16089852154254913, "learning_rate": 2.708779212429996e-05, "loss": 0.0406, "step": 1830 }, { "epoch": 1.3735933983495874, "grad_norm": 0.13291966915130615, "learning_rate": 2.702961398894014e-05, "loss": 0.032, "step": 1831 }, { "epoch": 1.374343585896474, "grad_norm": 0.2512020766735077, "learning_rate": 2.6971475245530375e-05, "loss": 0.0538, "step": 1832 }, { "epoch": 1.3750937734433608, "grad_norm": 0.18554271757602692, "learning_rate": 2.6913375993772915e-05, "loss": 0.0419, "step": 1833 }, { "epoch": 1.3758439609902475, "grad_norm": 0.18187479674816132, "learning_rate": 2.6855316333302237e-05, "loss": 0.0516, "step": 1834 }, { "epoch": 1.3765941485371342, "grad_norm": 0.2124125361442566, "learning_rate": 2.6797296363684977e-05, "loss": 0.0442, "step": 1835 }, { "epoch": 1.377344336084021, "grad_norm": 0.27118852734565735, "learning_rate": 2.6739316184419622e-05, "loss": 0.0541, "step": 1836 }, { "epoch": 1.3780945236309077, "grad_norm": 0.1586301028728485, "learning_rate": 2.6681375894936472e-05, "loss": 0.0331, "step": 1837 }, { "epoch": 1.3788447111777944, "grad_norm": 0.1749737709760666, "learning_rate": 2.662347559459746e-05, "loss": 0.0369, "step": 1838 }, { "epoch": 1.379594898724681, "grad_norm": 0.10476631671190262, "learning_rate": 2.6565615382695896e-05, "loss": 0.0264, "step": 1839 }, { "epoch": 1.3803450862715678, "grad_norm": 0.1476007103919983, "learning_rate": 2.6507795358456307e-05, "loss": 0.0586, "step": 1840 }, { "epoch": 1.3810952738184545, "grad_norm": 0.11646662652492523, "learning_rate": 2.6450015621034362e-05, "loss": 0.0252, "step": 1841 }, { "epoch": 1.3818454613653413, "grad_norm": 0.15108223259449005, "learning_rate": 2.6392276269516613e-05, "loss": 0.0314, "step": 1842 }, { "epoch": 1.3825956489122282, "grad_norm": 0.1876702457666397, "learning_rate": 2.63345774029204e-05, "loss": 0.0603, "step": 1843 }, { "epoch": 1.3833458364591147, "grad_norm": 0.15735001862049103, "learning_rate": 2.6276919120193543e-05, "loss": 0.0452, "step": 1844 }, { "epoch": 1.3840960240060016, "grad_norm": 0.16730938851833344, "learning_rate": 2.621930152021434e-05, "loss": 0.042, "step": 1845 }, { "epoch": 1.3848462115528881, "grad_norm": 0.12933415174484253, "learning_rate": 2.6161724701791306e-05, "loss": 0.0246, "step": 1846 }, { "epoch": 1.385596399099775, "grad_norm": 0.14777973294258118, "learning_rate": 2.6104188763663018e-05, "loss": 0.0299, "step": 1847 }, { "epoch": 1.3863465866466615, "grad_norm": 0.14805126190185547, "learning_rate": 2.604669380449795e-05, "loss": 0.0303, "step": 1848 }, { "epoch": 1.3870967741935485, "grad_norm": 0.1918485164642334, "learning_rate": 2.598923992289427e-05, "loss": 0.0364, "step": 1849 }, { "epoch": 1.387846961740435, "grad_norm": 0.16532783210277557, "learning_rate": 2.5931827217379746e-05, "loss": 0.0371, "step": 1850 }, { "epoch": 1.388597149287322, "grad_norm": 0.14476117491722107, "learning_rate": 2.5874455786411505e-05, "loss": 0.0367, "step": 1851 }, { "epoch": 1.3893473368342086, "grad_norm": 0.12052533775568008, "learning_rate": 2.5817125728375912e-05, "loss": 0.023, "step": 1852 }, { "epoch": 1.3900975243810954, "grad_norm": 0.13191531598567963, "learning_rate": 2.5759837141588362e-05, "loss": 0.0259, "step": 1853 }, { "epoch": 1.390847711927982, "grad_norm": 0.27699872851371765, "learning_rate": 2.5702590124293147e-05, "loss": 0.045, "step": 1854 }, { "epoch": 1.3915978994748688, "grad_norm": 0.16797631978988647, "learning_rate": 2.5645384774663262e-05, "loss": 0.0402, "step": 1855 }, { "epoch": 1.3923480870217555, "grad_norm": 0.11576727032661438, "learning_rate": 2.5588221190800264e-05, "loss": 0.0383, "step": 1856 }, { "epoch": 1.3930982745686422, "grad_norm": 0.3373456597328186, "learning_rate": 2.5531099470734038e-05, "loss": 0.0621, "step": 1857 }, { "epoch": 1.393848462115529, "grad_norm": 0.17893192172050476, "learning_rate": 2.5474019712422724e-05, "loss": 0.0275, "step": 1858 }, { "epoch": 1.3945986496624156, "grad_norm": 0.2578313648700714, "learning_rate": 2.541698201375249e-05, "loss": 0.0579, "step": 1859 }, { "epoch": 1.3953488372093024, "grad_norm": 0.15810662508010864, "learning_rate": 2.5359986472537373e-05, "loss": 0.0391, "step": 1860 }, { "epoch": 1.396099024756189, "grad_norm": 0.15665064752101898, "learning_rate": 2.530303318651913e-05, "loss": 0.0357, "step": 1861 }, { "epoch": 1.3968492123030758, "grad_norm": 0.187716543674469, "learning_rate": 2.5246122253366998e-05, "loss": 0.0397, "step": 1862 }, { "epoch": 1.3975993998499625, "grad_norm": 0.1098187267780304, "learning_rate": 2.5189253770677644e-05, "loss": 0.0242, "step": 1863 }, { "epoch": 1.3983495873968492, "grad_norm": 0.13694116473197937, "learning_rate": 2.5132427835974926e-05, "loss": 0.0208, "step": 1864 }, { "epoch": 1.399099774943736, "grad_norm": 0.12738709151744843, "learning_rate": 2.507564454670971e-05, "loss": 0.0238, "step": 1865 }, { "epoch": 1.3998499624906227, "grad_norm": 0.13771750032901764, "learning_rate": 2.5018904000259757e-05, "loss": 0.0372, "step": 1866 }, { "epoch": 1.4006001500375094, "grad_norm": 0.14189371466636658, "learning_rate": 2.4962206293929512e-05, "loss": 0.0328, "step": 1867 }, { "epoch": 1.401350337584396, "grad_norm": 0.1648484170436859, "learning_rate": 2.490555152494996e-05, "loss": 0.0369, "step": 1868 }, { "epoch": 1.4021005251312828, "grad_norm": 0.1412844955921173, "learning_rate": 2.4848939790478463e-05, "loss": 0.024, "step": 1869 }, { "epoch": 1.4028507126781695, "grad_norm": 0.12301353365182877, "learning_rate": 2.4792371187598544e-05, "loss": 0.0243, "step": 1870 }, { "epoch": 1.4036009002250562, "grad_norm": 0.1523589789867401, "learning_rate": 2.4735845813319804e-05, "loss": 0.0341, "step": 1871 }, { "epoch": 1.404351087771943, "grad_norm": 0.2967745065689087, "learning_rate": 2.4679363764577683e-05, "loss": 0.0538, "step": 1872 }, { "epoch": 1.4051012753188297, "grad_norm": 0.19742630422115326, "learning_rate": 2.462292513823336e-05, "loss": 0.054, "step": 1873 }, { "epoch": 1.4058514628657164, "grad_norm": 0.19836518168449402, "learning_rate": 2.4566530031073486e-05, "loss": 0.0372, "step": 1874 }, { "epoch": 1.406601650412603, "grad_norm": 0.2233537882566452, "learning_rate": 2.451017853981013e-05, "loss": 0.0294, "step": 1875 }, { "epoch": 1.4073518379594898, "grad_norm": 0.19185113906860352, "learning_rate": 2.4453870761080554e-05, "loss": 0.0606, "step": 1876 }, { "epoch": 1.4081020255063765, "grad_norm": 0.17896531522274017, "learning_rate": 2.4397606791447052e-05, "loss": 0.0479, "step": 1877 }, { "epoch": 1.4088522130532632, "grad_norm": 0.19202962517738342, "learning_rate": 2.4341386727396793e-05, "loss": 0.0554, "step": 1878 }, { "epoch": 1.40960240060015, "grad_norm": 0.15726254880428314, "learning_rate": 2.4285210665341646e-05, "loss": 0.0357, "step": 1879 }, { "epoch": 1.4103525881470367, "grad_norm": 0.18670770525932312, "learning_rate": 2.422907870161803e-05, "loss": 0.0381, "step": 1880 }, { "epoch": 1.4111027756939234, "grad_norm": 0.3125583231449127, "learning_rate": 2.4172990932486733e-05, "loss": 0.0461, "step": 1881 }, { "epoch": 1.4118529632408103, "grad_norm": 0.19309872388839722, "learning_rate": 2.4116947454132782e-05, "loss": 0.0294, "step": 1882 }, { "epoch": 1.4126031507876968, "grad_norm": 0.17205388844013214, "learning_rate": 2.4060948362665176e-05, "loss": 0.0387, "step": 1883 }, { "epoch": 1.4133533383345838, "grad_norm": 0.17235881090164185, "learning_rate": 2.4004993754116867e-05, "loss": 0.0367, "step": 1884 }, { "epoch": 1.4141035258814703, "grad_norm": 0.180396169424057, "learning_rate": 2.39490837244445e-05, "loss": 0.0409, "step": 1885 }, { "epoch": 1.4148537134283572, "grad_norm": 0.16136233508586884, "learning_rate": 2.389321836952828e-05, "loss": 0.0439, "step": 1886 }, { "epoch": 1.4156039009752437, "grad_norm": 0.23657025396823883, "learning_rate": 2.383739778517176e-05, "loss": 0.0392, "step": 1887 }, { "epoch": 1.4163540885221306, "grad_norm": 0.25855350494384766, "learning_rate": 2.3781622067101767e-05, "loss": 0.0626, "step": 1888 }, { "epoch": 1.4171042760690171, "grad_norm": 0.19857187569141388, "learning_rate": 2.372589131096816e-05, "loss": 0.0534, "step": 1889 }, { "epoch": 1.417854463615904, "grad_norm": 0.1421574205160141, "learning_rate": 2.36702056123437e-05, "loss": 0.0361, "step": 1890 }, { "epoch": 1.4186046511627908, "grad_norm": 0.1884581595659256, "learning_rate": 2.3614565066723892e-05, "loss": 0.0413, "step": 1891 }, { "epoch": 1.4193548387096775, "grad_norm": 0.21691051125526428, "learning_rate": 2.355896976952674e-05, "loss": 0.0492, "step": 1892 }, { "epoch": 1.4201050262565642, "grad_norm": 0.2113209068775177, "learning_rate": 2.350341981609276e-05, "loss": 0.0581, "step": 1893 }, { "epoch": 1.420855213803451, "grad_norm": 0.21820048987865448, "learning_rate": 2.344791530168465e-05, "loss": 0.0495, "step": 1894 }, { "epoch": 1.4216054013503376, "grad_norm": 0.15237504243850708, "learning_rate": 2.339245632148715e-05, "loss": 0.0431, "step": 1895 }, { "epoch": 1.4223555888972244, "grad_norm": 0.18294626474380493, "learning_rate": 2.3337042970606965e-05, "loss": 0.0436, "step": 1896 }, { "epoch": 1.423105776444111, "grad_norm": 0.1634780317544937, "learning_rate": 2.3281675344072545e-05, "loss": 0.0402, "step": 1897 }, { "epoch": 1.4238559639909978, "grad_norm": 0.2057954967021942, "learning_rate": 2.3226353536833907e-05, "loss": 0.054, "step": 1898 }, { "epoch": 1.4246061515378845, "grad_norm": 0.160613551735878, "learning_rate": 2.317107764376253e-05, "loss": 0.0464, "step": 1899 }, { "epoch": 1.4253563390847712, "grad_norm": 0.18292781710624695, "learning_rate": 2.3115847759651082e-05, "loss": 0.0353, "step": 1900 }, { "epoch": 1.426106526631658, "grad_norm": 0.16484901309013367, "learning_rate": 2.3060663979213404e-05, "loss": 0.0401, "step": 1901 }, { "epoch": 1.4268567141785446, "grad_norm": 0.20094150304794312, "learning_rate": 2.300552639708423e-05, "loss": 0.0437, "step": 1902 }, { "epoch": 1.4276069017254314, "grad_norm": 0.2095361202955246, "learning_rate": 2.2950435107819124e-05, "loss": 0.0466, "step": 1903 }, { "epoch": 1.428357089272318, "grad_norm": 0.2117299735546112, "learning_rate": 2.2895390205894164e-05, "loss": 0.0448, "step": 1904 }, { "epoch": 1.4291072768192048, "grad_norm": 0.3316866159439087, "learning_rate": 2.2840391785705967e-05, "loss": 0.0653, "step": 1905 }, { "epoch": 1.4298574643660915, "grad_norm": 0.19901429116725922, "learning_rate": 2.278543994157139e-05, "loss": 0.0507, "step": 1906 }, { "epoch": 1.4306076519129782, "grad_norm": 0.22417782247066498, "learning_rate": 2.2730534767727483e-05, "loss": 0.0465, "step": 1907 }, { "epoch": 1.431357839459865, "grad_norm": 0.16476863622665405, "learning_rate": 2.267567635833116e-05, "loss": 0.0706, "step": 1908 }, { "epoch": 1.4321080270067517, "grad_norm": 0.18171042203903198, "learning_rate": 2.2620864807459213e-05, "loss": 0.0417, "step": 1909 }, { "epoch": 1.4328582145536384, "grad_norm": 0.16148392856121063, "learning_rate": 2.2566100209108048e-05, "loss": 0.0473, "step": 1910 }, { "epoch": 1.433608402100525, "grad_norm": 0.20606650412082672, "learning_rate": 2.2511382657193565e-05, "loss": 0.0524, "step": 1911 }, { "epoch": 1.4343585896474118, "grad_norm": 0.18486949801445007, "learning_rate": 2.2456712245550993e-05, "loss": 0.0378, "step": 1912 }, { "epoch": 1.4351087771942985, "grad_norm": 0.1495230346918106, "learning_rate": 2.2402089067934668e-05, "loss": 0.0458, "step": 1913 }, { "epoch": 1.4358589647411852, "grad_norm": 0.1609950214624405, "learning_rate": 2.2347513218017974e-05, "loss": 0.0404, "step": 1914 }, { "epoch": 1.436609152288072, "grad_norm": 0.20721952617168427, "learning_rate": 2.2292984789393122e-05, "loss": 0.0455, "step": 1915 }, { "epoch": 1.4373593398349587, "grad_norm": 0.10836555808782578, "learning_rate": 2.2238503875571028e-05, "loss": 0.0248, "step": 1916 }, { "epoch": 1.4381095273818454, "grad_norm": 0.2047933042049408, "learning_rate": 2.218407056998104e-05, "loss": 0.0429, "step": 1917 }, { "epoch": 1.438859714928732, "grad_norm": 0.1827002912759781, "learning_rate": 2.2129684965970948e-05, "loss": 0.0248, "step": 1918 }, { "epoch": 1.439609902475619, "grad_norm": 0.2153414636850357, "learning_rate": 2.2075347156806697e-05, "loss": 0.0475, "step": 1919 }, { "epoch": 1.4403600900225055, "grad_norm": 0.19228680431842804, "learning_rate": 2.2021057235672288e-05, "loss": 0.0578, "step": 1920 }, { "epoch": 1.4411102775693925, "grad_norm": 0.17686350643634796, "learning_rate": 2.1966815295669585e-05, "loss": 0.042, "step": 1921 }, { "epoch": 1.441860465116279, "grad_norm": 0.1612776815891266, "learning_rate": 2.1912621429818177e-05, "loss": 0.0412, "step": 1922 }, { "epoch": 1.442610652663166, "grad_norm": 0.16681015491485596, "learning_rate": 2.18584757310552e-05, "loss": 0.0335, "step": 1923 }, { "epoch": 1.4433608402100524, "grad_norm": 0.16631953418254852, "learning_rate": 2.1804378292235224e-05, "loss": 0.0422, "step": 1924 }, { "epoch": 1.4441110277569393, "grad_norm": 0.1537289172410965, "learning_rate": 2.1750329206129988e-05, "loss": 0.0484, "step": 1925 }, { "epoch": 1.4448612153038258, "grad_norm": 0.1737208366394043, "learning_rate": 2.1696328565428364e-05, "loss": 0.0377, "step": 1926 }, { "epoch": 1.4456114028507128, "grad_norm": 0.14561980962753296, "learning_rate": 2.1642376462736148e-05, "loss": 0.0326, "step": 1927 }, { "epoch": 1.4463615903975993, "grad_norm": 0.3279992938041687, "learning_rate": 2.158847299057587e-05, "loss": 0.0619, "step": 1928 }, { "epoch": 1.4471117779444862, "grad_norm": 0.14300592243671417, "learning_rate": 2.1534618241386705e-05, "loss": 0.0441, "step": 1929 }, { "epoch": 1.447861965491373, "grad_norm": 0.19687087833881378, "learning_rate": 2.14808123075242e-05, "loss": 0.0643, "step": 1930 }, { "epoch": 1.4486121530382596, "grad_norm": 0.15415306389331818, "learning_rate": 2.1427055281260255e-05, "loss": 0.0371, "step": 1931 }, { "epoch": 1.4493623405851463, "grad_norm": 0.10696108639240265, "learning_rate": 2.1373347254782882e-05, "loss": 0.0289, "step": 1932 }, { "epoch": 1.450112528132033, "grad_norm": 0.17456509172916412, "learning_rate": 2.1319688320196048e-05, "loss": 0.0342, "step": 1933 }, { "epoch": 1.4508627156789198, "grad_norm": 0.18844592571258545, "learning_rate": 2.1266078569519542e-05, "loss": 0.0404, "step": 1934 }, { "epoch": 1.4516129032258065, "grad_norm": 0.1761346310377121, "learning_rate": 2.121251809468882e-05, "loss": 0.0444, "step": 1935 }, { "epoch": 1.4523630907726932, "grad_norm": 0.14905095100402832, "learning_rate": 2.1159006987554807e-05, "loss": 0.0456, "step": 1936 }, { "epoch": 1.45311327831958, "grad_norm": 0.1570758819580078, "learning_rate": 2.1105545339883808e-05, "loss": 0.0377, "step": 1937 }, { "epoch": 1.4538634658664666, "grad_norm": 0.20766982436180115, "learning_rate": 2.1052133243357253e-05, "loss": 0.0483, "step": 1938 }, { "epoch": 1.4546136534133534, "grad_norm": 0.1821688413619995, "learning_rate": 2.0998770789571636e-05, "loss": 0.0325, "step": 1939 }, { "epoch": 1.45536384096024, "grad_norm": 0.17206883430480957, "learning_rate": 2.0945458070038315e-05, "loss": 0.0523, "step": 1940 }, { "epoch": 1.4561140285071268, "grad_norm": 0.19073475897312164, "learning_rate": 2.0892195176183354e-05, "loss": 0.0393, "step": 1941 }, { "epoch": 1.4568642160540135, "grad_norm": 0.1695142388343811, "learning_rate": 2.083898219934739e-05, "loss": 0.0387, "step": 1942 }, { "epoch": 1.4576144036009002, "grad_norm": 0.1988031268119812, "learning_rate": 2.0785819230785398e-05, "loss": 0.0461, "step": 1943 }, { "epoch": 1.458364591147787, "grad_norm": 0.14169101417064667, "learning_rate": 2.073270636166666e-05, "loss": 0.0552, "step": 1944 }, { "epoch": 1.4591147786946737, "grad_norm": 0.1895540952682495, "learning_rate": 2.0679643683074513e-05, "loss": 0.0404, "step": 1945 }, { "epoch": 1.4598649662415604, "grad_norm": 0.15737725794315338, "learning_rate": 2.0626631286006236e-05, "loss": 0.0355, "step": 1946 }, { "epoch": 1.460615153788447, "grad_norm": 0.2160894274711609, "learning_rate": 2.0573669261372847e-05, "loss": 0.041, "step": 1947 }, { "epoch": 1.4613653413353338, "grad_norm": 0.17199784517288208, "learning_rate": 2.052075769999899e-05, "loss": 0.0443, "step": 1948 }, { "epoch": 1.4621155288822205, "grad_norm": 0.1754937618970871, "learning_rate": 2.046789669262283e-05, "loss": 0.0391, "step": 1949 }, { "epoch": 1.4628657164291072, "grad_norm": 0.15710797905921936, "learning_rate": 2.0415086329895784e-05, "loss": 0.0345, "step": 1950 }, { "epoch": 1.463615903975994, "grad_norm": 0.15425771474838257, "learning_rate": 2.0362326702382384e-05, "loss": 0.0294, "step": 1951 }, { "epoch": 1.4643660915228807, "grad_norm": 0.15144193172454834, "learning_rate": 2.0309617900560218e-05, "loss": 0.0364, "step": 1952 }, { "epoch": 1.4651162790697674, "grad_norm": 0.20072755217552185, "learning_rate": 2.0256960014819692e-05, "loss": 0.05, "step": 1953 }, { "epoch": 1.465866466616654, "grad_norm": 0.17849725484848022, "learning_rate": 2.020435313546391e-05, "loss": 0.0433, "step": 1954 }, { "epoch": 1.4666166541635408, "grad_norm": 0.30021312832832336, "learning_rate": 2.0151797352708457e-05, "loss": 0.0862, "step": 1955 }, { "epoch": 1.4673668417104275, "grad_norm": 0.211557537317276, "learning_rate": 2.0099292756681343e-05, "loss": 0.0625, "step": 1956 }, { "epoch": 1.4681170292573142, "grad_norm": 0.18450140953063965, "learning_rate": 2.0046839437422772e-05, "loss": 0.0407, "step": 1957 }, { "epoch": 1.4688672168042012, "grad_norm": 0.19659367203712463, "learning_rate": 1.999443748488503e-05, "loss": 0.0356, "step": 1958 }, { "epoch": 1.4696174043510877, "grad_norm": 0.18041273951530457, "learning_rate": 1.9942086988932323e-05, "loss": 0.0465, "step": 1959 }, { "epoch": 1.4703675918979746, "grad_norm": 0.13432402908802032, "learning_rate": 1.9889788039340558e-05, "loss": 0.0424, "step": 1960 }, { "epoch": 1.471117779444861, "grad_norm": 0.16820655763149261, "learning_rate": 1.9837540725797305e-05, "loss": 0.0405, "step": 1961 }, { "epoch": 1.471867966991748, "grad_norm": 0.18088965117931366, "learning_rate": 1.9785345137901533e-05, "loss": 0.0462, "step": 1962 }, { "epoch": 1.4726181545386345, "grad_norm": 0.14395685493946075, "learning_rate": 1.9733201365163607e-05, "loss": 0.0365, "step": 1963 }, { "epoch": 1.4733683420855215, "grad_norm": 0.2003966122865677, "learning_rate": 1.968110949700489e-05, "loss": 0.0589, "step": 1964 }, { "epoch": 1.474118529632408, "grad_norm": 0.15150992572307587, "learning_rate": 1.962906962275784e-05, "loss": 0.0371, "step": 1965 }, { "epoch": 1.474868717179295, "grad_norm": 0.16001686453819275, "learning_rate": 1.9577081831665707e-05, "loss": 0.0507, "step": 1966 }, { "epoch": 1.4756189047261816, "grad_norm": 0.1944255381822586, "learning_rate": 1.9525146212882456e-05, "loss": 0.0621, "step": 1967 }, { "epoch": 1.4763690922730683, "grad_norm": 0.15253233909606934, "learning_rate": 1.9473262855472517e-05, "loss": 0.0478, "step": 1968 }, { "epoch": 1.477119279819955, "grad_norm": 0.1423439085483551, "learning_rate": 1.942143184841077e-05, "loss": 0.038, "step": 1969 }, { "epoch": 1.4778694673668418, "grad_norm": 0.150668203830719, "learning_rate": 1.9369653280582273e-05, "loss": 0.0412, "step": 1970 }, { "epoch": 1.4786196549137285, "grad_norm": 0.25101804733276367, "learning_rate": 1.931792724078218e-05, "loss": 0.0646, "step": 1971 }, { "epoch": 1.4793698424606152, "grad_norm": 0.17958179116249084, "learning_rate": 1.9266253817715575e-05, "loss": 0.0383, "step": 1972 }, { "epoch": 1.480120030007502, "grad_norm": 0.20443418622016907, "learning_rate": 1.921463309999724e-05, "loss": 0.0373, "step": 1973 }, { "epoch": 1.4808702175543886, "grad_norm": 0.1706647425889969, "learning_rate": 1.9163065176151662e-05, "loss": 0.0426, "step": 1974 }, { "epoch": 1.4816204051012754, "grad_norm": 0.17196975648403168, "learning_rate": 1.9111550134612738e-05, "loss": 0.0405, "step": 1975 }, { "epoch": 1.482370592648162, "grad_norm": 0.2115897536277771, "learning_rate": 1.9060088063723696e-05, "loss": 0.0457, "step": 1976 }, { "epoch": 1.4831207801950488, "grad_norm": 0.1429477035999298, "learning_rate": 1.900867905173692e-05, "loss": 0.0374, "step": 1977 }, { "epoch": 1.4838709677419355, "grad_norm": 0.1279119849205017, "learning_rate": 1.8957323186813803e-05, "loss": 0.0285, "step": 1978 }, { "epoch": 1.4846211552888222, "grad_norm": 0.13002172112464905, "learning_rate": 1.8906020557024597e-05, "loss": 0.0288, "step": 1979 }, { "epoch": 1.485371342835709, "grad_norm": 0.19170233607292175, "learning_rate": 1.885477125034827e-05, "loss": 0.0396, "step": 1980 }, { "epoch": 1.4861215303825956, "grad_norm": 0.278576135635376, "learning_rate": 1.8803575354672315e-05, "loss": 0.0544, "step": 1981 }, { "epoch": 1.4868717179294824, "grad_norm": 0.1751520186662674, "learning_rate": 1.8752432957792654e-05, "loss": 0.0458, "step": 1982 }, { "epoch": 1.487621905476369, "grad_norm": 0.27717238664627075, "learning_rate": 1.8701344147413474e-05, "loss": 0.107, "step": 1983 }, { "epoch": 1.4883720930232558, "grad_norm": 0.14058585464954376, "learning_rate": 1.8650309011147053e-05, "loss": 0.0354, "step": 1984 }, { "epoch": 1.4891222805701425, "grad_norm": 0.1992284208536148, "learning_rate": 1.8599327636513636e-05, "loss": 0.037, "step": 1985 }, { "epoch": 1.4898724681170292, "grad_norm": 0.1688821017742157, "learning_rate": 1.8548400110941228e-05, "loss": 0.0317, "step": 1986 }, { "epoch": 1.490622655663916, "grad_norm": 0.21011146903038025, "learning_rate": 1.8497526521765534e-05, "loss": 0.0434, "step": 1987 }, { "epoch": 1.4913728432108027, "grad_norm": 0.13675890862941742, "learning_rate": 1.844670695622976e-05, "loss": 0.0268, "step": 1988 }, { "epoch": 1.4921230307576894, "grad_norm": 0.15465782582759857, "learning_rate": 1.8395941501484464e-05, "loss": 0.0332, "step": 1989 }, { "epoch": 1.492873218304576, "grad_norm": 0.1339779645204544, "learning_rate": 1.8345230244587354e-05, "loss": 0.0336, "step": 1990 }, { "epoch": 1.4936234058514628, "grad_norm": 0.16057631373405457, "learning_rate": 1.829457327250329e-05, "loss": 0.0392, "step": 1991 }, { "epoch": 1.4943735933983495, "grad_norm": 0.240296870470047, "learning_rate": 1.8243970672103982e-05, "loss": 0.0459, "step": 1992 }, { "epoch": 1.4951237809452362, "grad_norm": 0.2193770408630371, "learning_rate": 1.8193422530167914e-05, "loss": 0.0572, "step": 1993 }, { "epoch": 1.495873968492123, "grad_norm": 0.16321928799152374, "learning_rate": 1.8142928933380142e-05, "loss": 0.0391, "step": 1994 }, { "epoch": 1.49662415603901, "grad_norm": 0.14024445414543152, "learning_rate": 1.8092489968332233e-05, "loss": 0.0336, "step": 1995 }, { "epoch": 1.4973743435858964, "grad_norm": 0.15499421954154968, "learning_rate": 1.804210572152204e-05, "loss": 0.0409, "step": 1996 }, { "epoch": 1.4981245311327833, "grad_norm": 0.1850685328245163, "learning_rate": 1.7991776279353604e-05, "loss": 0.0511, "step": 1997 }, { "epoch": 1.4988747186796698, "grad_norm": 0.17802977561950684, "learning_rate": 1.794150172813693e-05, "loss": 0.0332, "step": 1998 }, { "epoch": 1.4996249062265568, "grad_norm": 0.2455652356147766, "learning_rate": 1.7891282154087934e-05, "loss": 0.0458, "step": 1999 }, { "epoch": 1.5003750937734432, "grad_norm": 0.16653724014759064, "learning_rate": 1.7841117643328246e-05, "loss": 0.045, "step": 2000 }, { "epoch": 1.5003750937734432, "eval_loss": 0.06891622394323349, "eval_runtime": 2.6405, "eval_samples_per_second": 20.451, "eval_steps_per_second": 5.302, "step": 2000 }, { "epoch": 1.5011252813203302, "grad_norm": 0.30463671684265137, "learning_rate": 1.779100828188506e-05, "loss": 0.0654, "step": 2001 }, { "epoch": 1.5018754688672167, "grad_norm": 0.18647217750549316, "learning_rate": 1.774095415569102e-05, "loss": 0.0356, "step": 2002 }, { "epoch": 1.5026256564141036, "grad_norm": 0.17118555307388306, "learning_rate": 1.7690955350583976e-05, "loss": 0.037, "step": 2003 }, { "epoch": 1.50337584396099, "grad_norm": 0.26206356287002563, "learning_rate": 1.764101195230696e-05, "loss": 0.0774, "step": 2004 }, { "epoch": 1.504126031507877, "grad_norm": 0.5694636702537537, "learning_rate": 1.7591124046508045e-05, "loss": 0.1328, "step": 2005 }, { "epoch": 1.5048762190547635, "grad_norm": 0.16807466745376587, "learning_rate": 1.7541291718740012e-05, "loss": 0.0425, "step": 2006 }, { "epoch": 1.5056264066016505, "grad_norm": 0.1880795806646347, "learning_rate": 1.7491515054460418e-05, "loss": 0.0416, "step": 2007 }, { "epoch": 1.506376594148537, "grad_norm": 0.15772967040538788, "learning_rate": 1.7441794139031337e-05, "loss": 0.0561, "step": 2008 }, { "epoch": 1.507126781695424, "grad_norm": 0.26514455676078796, "learning_rate": 1.7392129057719246e-05, "loss": 0.0603, "step": 2009 }, { "epoch": 1.5078769692423106, "grad_norm": 0.17776095867156982, "learning_rate": 1.7342519895694886e-05, "loss": 0.0555, "step": 2010 }, { "epoch": 1.5086271567891973, "grad_norm": 0.24651385843753815, "learning_rate": 1.7292966738033057e-05, "loss": 0.0476, "step": 2011 }, { "epoch": 1.509377344336084, "grad_norm": 0.19671979546546936, "learning_rate": 1.7243469669712546e-05, "loss": 0.0439, "step": 2012 }, { "epoch": 1.5101275318829708, "grad_norm": 0.180504709482193, "learning_rate": 1.7194028775615966e-05, "loss": 0.0494, "step": 2013 }, { "epoch": 1.5108777194298575, "grad_norm": 0.20517569780349731, "learning_rate": 1.714464414052958e-05, "loss": 0.0524, "step": 2014 }, { "epoch": 1.5116279069767442, "grad_norm": 0.1717517226934433, "learning_rate": 1.7095315849143184e-05, "loss": 0.0551, "step": 2015 }, { "epoch": 1.512378094523631, "grad_norm": 0.1254064440727234, "learning_rate": 1.704604398604991e-05, "loss": 0.0306, "step": 2016 }, { "epoch": 1.5131282820705176, "grad_norm": 0.19165663421154022, "learning_rate": 1.6996828635746165e-05, "loss": 0.0387, "step": 2017 }, { "epoch": 1.5138784696174044, "grad_norm": 0.16394397616386414, "learning_rate": 1.6947669882631434e-05, "loss": 0.0382, "step": 2018 }, { "epoch": 1.514628657164291, "grad_norm": 0.17494425177574158, "learning_rate": 1.6898567811008135e-05, "loss": 0.0379, "step": 2019 }, { "epoch": 1.5153788447111778, "grad_norm": 0.15336370468139648, "learning_rate": 1.684952250508149e-05, "loss": 0.0355, "step": 2020 }, { "epoch": 1.5161290322580645, "grad_norm": 0.12618416547775269, "learning_rate": 1.6800534048959364e-05, "loss": 0.026, "step": 2021 }, { "epoch": 1.5168792198049512, "grad_norm": 0.15845142304897308, "learning_rate": 1.6751602526652133e-05, "loss": 0.0362, "step": 2022 }, { "epoch": 1.517629407351838, "grad_norm": 0.2082812488079071, "learning_rate": 1.6702728022072562e-05, "loss": 0.0461, "step": 2023 }, { "epoch": 1.5183795948987246, "grad_norm": 0.12379974126815796, "learning_rate": 1.665391061903558e-05, "loss": 0.0312, "step": 2024 }, { "epoch": 1.5191297824456114, "grad_norm": 0.21030539274215698, "learning_rate": 1.660515040125824e-05, "loss": 0.059, "step": 2025 }, { "epoch": 1.519879969992498, "grad_norm": 0.13408705592155457, "learning_rate": 1.6556447452359512e-05, "loss": 0.0384, "step": 2026 }, { "epoch": 1.5206301575393848, "grad_norm": 0.25092118978500366, "learning_rate": 1.6507801855860177e-05, "loss": 0.0496, "step": 2027 }, { "epoch": 1.5213803450862715, "grad_norm": 0.15542156994342804, "learning_rate": 1.645921369518261e-05, "loss": 0.0322, "step": 2028 }, { "epoch": 1.5221305326331582, "grad_norm": 0.18500564992427826, "learning_rate": 1.6410683053650737e-05, "loss": 0.0442, "step": 2029 }, { "epoch": 1.5228807201800452, "grad_norm": 0.21039170026779175, "learning_rate": 1.636221001448983e-05, "loss": 0.042, "step": 2030 }, { "epoch": 1.5236309077269317, "grad_norm": 0.21939733624458313, "learning_rate": 1.631379466082638e-05, "loss": 0.0555, "step": 2031 }, { "epoch": 1.5243810952738186, "grad_norm": 0.19629713892936707, "learning_rate": 1.626543707568795e-05, "loss": 0.0428, "step": 2032 }, { "epoch": 1.525131282820705, "grad_norm": 0.31679531931877136, "learning_rate": 1.6217137342003036e-05, "loss": 0.0398, "step": 2033 }, { "epoch": 1.525881470367592, "grad_norm": 0.23860694468021393, "learning_rate": 1.616889554260092e-05, "loss": 0.0612, "step": 2034 }, { "epoch": 1.5266316579144785, "grad_norm": 0.16307824850082397, "learning_rate": 1.6120711760211548e-05, "loss": 0.0429, "step": 2035 }, { "epoch": 1.5273818454613655, "grad_norm": 0.13818341493606567, "learning_rate": 1.607258607746537e-05, "loss": 0.0341, "step": 2036 }, { "epoch": 1.528132033008252, "grad_norm": 0.16487646102905273, "learning_rate": 1.602451857689316e-05, "loss": 0.0391, "step": 2037 }, { "epoch": 1.528882220555139, "grad_norm": 0.2172565758228302, "learning_rate": 1.5976509340925977e-05, "loss": 0.0566, "step": 2038 }, { "epoch": 1.5296324081020254, "grad_norm": 0.1681651920080185, "learning_rate": 1.5928558451894914e-05, "loss": 0.0375, "step": 2039 }, { "epoch": 1.5303825956489123, "grad_norm": 0.23445692658424377, "learning_rate": 1.588066599203106e-05, "loss": 0.0397, "step": 2040 }, { "epoch": 1.5311327831957988, "grad_norm": 0.1395978331565857, "learning_rate": 1.583283204346521e-05, "loss": 0.0345, "step": 2041 }, { "epoch": 1.5318829707426858, "grad_norm": 0.20669519901275635, "learning_rate": 1.5785056688227916e-05, "loss": 0.033, "step": 2042 }, { "epoch": 1.5326331582895723, "grad_norm": 0.19717498123645782, "learning_rate": 1.5737340008249202e-05, "loss": 0.0428, "step": 2043 }, { "epoch": 1.5333833458364592, "grad_norm": 0.15539968013763428, "learning_rate": 1.5689682085358465e-05, "loss": 0.0383, "step": 2044 }, { "epoch": 1.5341335333833457, "grad_norm": 0.1587790548801422, "learning_rate": 1.564208300128438e-05, "loss": 0.0326, "step": 2045 }, { "epoch": 1.5348837209302326, "grad_norm": 0.172133207321167, "learning_rate": 1.5594542837654625e-05, "loss": 0.0285, "step": 2046 }, { "epoch": 1.5356339084771191, "grad_norm": 0.17575670778751373, "learning_rate": 1.554706167599596e-05, "loss": 0.0442, "step": 2047 }, { "epoch": 1.536384096024006, "grad_norm": 0.20420925319194794, "learning_rate": 1.5499639597733902e-05, "loss": 0.0458, "step": 2048 }, { "epoch": 1.5371342835708928, "grad_norm": 0.20247098803520203, "learning_rate": 1.54522766841926e-05, "loss": 0.0538, "step": 2049 }, { "epoch": 1.5378844711177795, "grad_norm": 0.21190738677978516, "learning_rate": 1.540497301659482e-05, "loss": 0.0468, "step": 2050 }, { "epoch": 1.5386346586646662, "grad_norm": 0.170917809009552, "learning_rate": 1.5357728676061685e-05, "loss": 0.0456, "step": 2051 }, { "epoch": 1.539384846211553, "grad_norm": 0.30017176270484924, "learning_rate": 1.5310543743612582e-05, "loss": 0.0822, "step": 2052 }, { "epoch": 1.5401350337584396, "grad_norm": 0.17813549935817719, "learning_rate": 1.526341830016505e-05, "loss": 0.0382, "step": 2053 }, { "epoch": 1.5408852213053263, "grad_norm": 0.2268344908952713, "learning_rate": 1.5216352426534548e-05, "loss": 0.0514, "step": 2054 }, { "epoch": 1.541635408852213, "grad_norm": 0.19404159486293793, "learning_rate": 1.5169346203434425e-05, "loss": 0.0608, "step": 2055 }, { "epoch": 1.5423855963990998, "grad_norm": 0.21496349573135376, "learning_rate": 1.5122399711475732e-05, "loss": 0.0615, "step": 2056 }, { "epoch": 1.5431357839459865, "grad_norm": 0.1785394698381424, "learning_rate": 1.50755130311671e-05, "loss": 0.0438, "step": 2057 }, { "epoch": 1.5438859714928732, "grad_norm": 0.21393193304538727, "learning_rate": 1.502868624291452e-05, "loss": 0.0478, "step": 2058 }, { "epoch": 1.54463615903976, "grad_norm": 0.19037163257598877, "learning_rate": 1.4981919427021357e-05, "loss": 0.0493, "step": 2059 }, { "epoch": 1.5453863465866466, "grad_norm": 0.15680454671382904, "learning_rate": 1.493521266368807e-05, "loss": 0.0307, "step": 2060 }, { "epoch": 1.5461365341335334, "grad_norm": 0.19426371157169342, "learning_rate": 1.4888566033012201e-05, "loss": 0.0612, "step": 2061 }, { "epoch": 1.54688672168042, "grad_norm": 0.14693467319011688, "learning_rate": 1.4841979614988094e-05, "loss": 0.0273, "step": 2062 }, { "epoch": 1.5476369092273068, "grad_norm": 0.18175330758094788, "learning_rate": 1.4795453489506878e-05, "loss": 0.0683, "step": 2063 }, { "epoch": 1.5483870967741935, "grad_norm": 0.12993456423282623, "learning_rate": 1.4748987736356273e-05, "loss": 0.0367, "step": 2064 }, { "epoch": 1.5491372843210802, "grad_norm": 0.17994779348373413, "learning_rate": 1.4702582435220475e-05, "loss": 0.0612, "step": 2065 }, { "epoch": 1.549887471867967, "grad_norm": 0.20236118137836456, "learning_rate": 1.4656237665680017e-05, "loss": 0.0523, "step": 2066 }, { "epoch": 1.5506376594148539, "grad_norm": 0.11795151978731155, "learning_rate": 1.4609953507211593e-05, "loss": 0.0331, "step": 2067 }, { "epoch": 1.5513878469617404, "grad_norm": 0.1285896897315979, "learning_rate": 1.4563730039187984e-05, "loss": 0.0342, "step": 2068 }, { "epoch": 1.5521380345086273, "grad_norm": 0.11817178130149841, "learning_rate": 1.4517567340877886e-05, "loss": 0.0279, "step": 2069 }, { "epoch": 1.5528882220555138, "grad_norm": 0.15962854027748108, "learning_rate": 1.4471465491445802e-05, "loss": 0.035, "step": 2070 }, { "epoch": 1.5536384096024007, "grad_norm": 0.19675575196743011, "learning_rate": 1.4425424569951822e-05, "loss": 0.0373, "step": 2071 }, { "epoch": 1.5543885971492872, "grad_norm": 0.23658345639705658, "learning_rate": 1.4379444655351626e-05, "loss": 0.0793, "step": 2072 }, { "epoch": 1.5551387846961742, "grad_norm": 0.15732000768184662, "learning_rate": 1.4333525826496224e-05, "loss": 0.0233, "step": 2073 }, { "epoch": 1.5558889722430607, "grad_norm": 0.21282753348350525, "learning_rate": 1.4287668162131896e-05, "loss": 0.0594, "step": 2074 }, { "epoch": 1.5566391597899476, "grad_norm": 0.19476816058158875, "learning_rate": 1.4241871740900014e-05, "loss": 0.045, "step": 2075 }, { "epoch": 1.557389347336834, "grad_norm": 0.1995515525341034, "learning_rate": 1.4196136641336932e-05, "loss": 0.0501, "step": 2076 }, { "epoch": 1.558139534883721, "grad_norm": 0.18209566175937653, "learning_rate": 1.4150462941873843e-05, "loss": 0.0435, "step": 2077 }, { "epoch": 1.5588897224306075, "grad_norm": 0.16033656895160675, "learning_rate": 1.410485072083666e-05, "loss": 0.0378, "step": 2078 }, { "epoch": 1.5596399099774945, "grad_norm": 0.1421896368265152, "learning_rate": 1.4059300056445823e-05, "loss": 0.0489, "step": 2079 }, { "epoch": 1.560390097524381, "grad_norm": 0.16776643693447113, "learning_rate": 1.4013811026816243e-05, "loss": 0.0466, "step": 2080 }, { "epoch": 1.561140285071268, "grad_norm": 0.20549526810646057, "learning_rate": 1.3968383709957133e-05, "loss": 0.032, "step": 2081 }, { "epoch": 1.5618904726181544, "grad_norm": 0.17907895147800446, "learning_rate": 1.3923018183771868e-05, "loss": 0.0481, "step": 2082 }, { "epoch": 1.5626406601650413, "grad_norm": 0.2195027768611908, "learning_rate": 1.3877714526057872e-05, "loss": 0.0551, "step": 2083 }, { "epoch": 1.5633908477119278, "grad_norm": 0.18014267086982727, "learning_rate": 1.3832472814506425e-05, "loss": 0.0357, "step": 2084 }, { "epoch": 1.5641410352588148, "grad_norm": 0.24690528213977814, "learning_rate": 1.3787293126702622e-05, "loss": 0.0514, "step": 2085 }, { "epoch": 1.5648912228057015, "grad_norm": 0.2089156061410904, "learning_rate": 1.3742175540125179e-05, "loss": 0.0528, "step": 2086 }, { "epoch": 1.5656414103525882, "grad_norm": 0.17088919878005981, "learning_rate": 1.3697120132146318e-05, "loss": 0.0387, "step": 2087 }, { "epoch": 1.566391597899475, "grad_norm": 0.13750265538692474, "learning_rate": 1.3652126980031627e-05, "loss": 0.0311, "step": 2088 }, { "epoch": 1.5671417854463616, "grad_norm": 0.24524101614952087, "learning_rate": 1.3607196160939927e-05, "loss": 0.0619, "step": 2089 }, { "epoch": 1.5678919729932483, "grad_norm": 0.14645838737487793, "learning_rate": 1.3562327751923149e-05, "loss": 0.0429, "step": 2090 }, { "epoch": 1.568642160540135, "grad_norm": 0.21580055356025696, "learning_rate": 1.351752182992621e-05, "loss": 0.0603, "step": 2091 }, { "epoch": 1.5693923480870218, "grad_norm": 0.14457224309444427, "learning_rate": 1.3472778471786829e-05, "loss": 0.0258, "step": 2092 }, { "epoch": 1.5701425356339085, "grad_norm": 0.1384057253599167, "learning_rate": 1.3428097754235475e-05, "loss": 0.0366, "step": 2093 }, { "epoch": 1.5708927231807952, "grad_norm": 0.1908542960882187, "learning_rate": 1.3383479753895174e-05, "loss": 0.0517, "step": 2094 }, { "epoch": 1.571642910727682, "grad_norm": 0.21990883350372314, "learning_rate": 1.33389245472814e-05, "loss": 0.0398, "step": 2095 }, { "epoch": 1.5723930982745686, "grad_norm": 0.17277023196220398, "learning_rate": 1.3294432210801966e-05, "loss": 0.0362, "step": 2096 }, { "epoch": 1.5731432858214554, "grad_norm": 0.17331083118915558, "learning_rate": 1.3250002820756819e-05, "loss": 0.0446, "step": 2097 }, { "epoch": 1.573893473368342, "grad_norm": 0.14335882663726807, "learning_rate": 1.3205636453338e-05, "loss": 0.0268, "step": 2098 }, { "epoch": 1.5746436609152288, "grad_norm": 0.1492685228586197, "learning_rate": 1.316133318462946e-05, "loss": 0.038, "step": 2099 }, { "epoch": 1.5753938484621155, "grad_norm": 0.20548713207244873, "learning_rate": 1.3117093090606958e-05, "loss": 0.047, "step": 2100 }, { "epoch": 1.5761440360090022, "grad_norm": 0.16951590776443481, "learning_rate": 1.3072916247137861e-05, "loss": 0.0266, "step": 2101 }, { "epoch": 1.576894223555889, "grad_norm": 0.1733119636774063, "learning_rate": 1.302880272998112e-05, "loss": 0.0343, "step": 2102 }, { "epoch": 1.5776444111027756, "grad_norm": 0.1762966811656952, "learning_rate": 1.29847526147871e-05, "loss": 0.0336, "step": 2103 }, { "epoch": 1.5783945986496624, "grad_norm": 0.21547256410121918, "learning_rate": 1.2940765977097402e-05, "loss": 0.0429, "step": 2104 }, { "epoch": 1.579144786196549, "grad_norm": 0.14866936206817627, "learning_rate": 1.2896842892344751e-05, "loss": 0.0278, "step": 2105 }, { "epoch": 1.579894973743436, "grad_norm": 0.19238527119159698, "learning_rate": 1.2852983435852928e-05, "loss": 0.0467, "step": 2106 }, { "epoch": 1.5806451612903225, "grad_norm": 0.1957983523607254, "learning_rate": 1.2809187682836588e-05, "loss": 0.0352, "step": 2107 }, { "epoch": 1.5813953488372094, "grad_norm": 0.16528469324111938, "learning_rate": 1.2765455708401142e-05, "loss": 0.0461, "step": 2108 }, { "epoch": 1.582145536384096, "grad_norm": 0.23004306852817535, "learning_rate": 1.2721787587542595e-05, "loss": 0.0634, "step": 2109 }, { "epoch": 1.5828957239309829, "grad_norm": 0.1990731805562973, "learning_rate": 1.2678183395147485e-05, "loss": 0.0564, "step": 2110 }, { "epoch": 1.5836459114778694, "grad_norm": 0.188127338886261, "learning_rate": 1.2634643205992707e-05, "loss": 0.0334, "step": 2111 }, { "epoch": 1.5843960990247563, "grad_norm": 0.2097140997648239, "learning_rate": 1.2591167094745404e-05, "loss": 0.0612, "step": 2112 }, { "epoch": 1.5851462865716428, "grad_norm": 0.12479346990585327, "learning_rate": 1.2547755135962841e-05, "loss": 0.0348, "step": 2113 }, { "epoch": 1.5858964741185297, "grad_norm": 0.17138241231441498, "learning_rate": 1.2504407404092217e-05, "loss": 0.0466, "step": 2114 }, { "epoch": 1.5866466616654162, "grad_norm": 0.17071551084518433, "learning_rate": 1.2461123973470634e-05, "loss": 0.039, "step": 2115 }, { "epoch": 1.5873968492123032, "grad_norm": 0.21277554333209991, "learning_rate": 1.2417904918324913e-05, "loss": 0.0347, "step": 2116 }, { "epoch": 1.5881470367591897, "grad_norm": 0.18354669213294983, "learning_rate": 1.237475031277151e-05, "loss": 0.0344, "step": 2117 }, { "epoch": 1.5888972243060766, "grad_norm": 0.2000560760498047, "learning_rate": 1.2331660230816288e-05, "loss": 0.0502, "step": 2118 }, { "epoch": 1.589647411852963, "grad_norm": 0.18581894040107727, "learning_rate": 1.2288634746354505e-05, "loss": 0.0331, "step": 2119 }, { "epoch": 1.59039759939985, "grad_norm": 0.16207176446914673, "learning_rate": 1.2245673933170626e-05, "loss": 0.0321, "step": 2120 }, { "epoch": 1.5911477869467365, "grad_norm": 0.11233672499656677, "learning_rate": 1.2202777864938236e-05, "loss": 0.0323, "step": 2121 }, { "epoch": 1.5918979744936235, "grad_norm": 0.1832706332206726, "learning_rate": 1.2159946615219836e-05, "loss": 0.028, "step": 2122 }, { "epoch": 1.59264816204051, "grad_norm": 0.20577500760555267, "learning_rate": 1.211718025746682e-05, "loss": 0.0278, "step": 2123 }, { "epoch": 1.593398349587397, "grad_norm": 0.22615359723567963, "learning_rate": 1.2074478865019273e-05, "loss": 0.0455, "step": 2124 }, { "epoch": 1.5941485371342836, "grad_norm": 0.15508152544498444, "learning_rate": 1.2031842511105885e-05, "loss": 0.039, "step": 2125 }, { "epoch": 1.5948987246811703, "grad_norm": 0.1680385321378708, "learning_rate": 1.1989271268843815e-05, "loss": 0.0363, "step": 2126 }, { "epoch": 1.595648912228057, "grad_norm": 0.21748638153076172, "learning_rate": 1.1946765211238526e-05, "loss": 0.05, "step": 2127 }, { "epoch": 1.5963990997749438, "grad_norm": 0.2185010313987732, "learning_rate": 1.1904324411183731e-05, "loss": 0.0454, "step": 2128 }, { "epoch": 1.5971492873218305, "grad_norm": 0.2044178694486618, "learning_rate": 1.1861948941461226e-05, "loss": 0.0399, "step": 2129 }, { "epoch": 1.5978994748687172, "grad_norm": 0.27306026220321655, "learning_rate": 1.1819638874740769e-05, "loss": 0.0459, "step": 2130 }, { "epoch": 1.598649662415604, "grad_norm": 0.17152051627635956, "learning_rate": 1.1777394283579956e-05, "loss": 0.031, "step": 2131 }, { "epoch": 1.5993998499624906, "grad_norm": 0.1820501983165741, "learning_rate": 1.1735215240424102e-05, "loss": 0.0395, "step": 2132 }, { "epoch": 1.6001500375093773, "grad_norm": 0.1924869865179062, "learning_rate": 1.1693101817606117e-05, "loss": 0.064, "step": 2133 }, { "epoch": 1.600900225056264, "grad_norm": 0.1441008746623993, "learning_rate": 1.165105408734638e-05, "loss": 0.0269, "step": 2134 }, { "epoch": 1.6016504126031508, "grad_norm": 0.16650429368019104, "learning_rate": 1.1609072121752584e-05, "loss": 0.0536, "step": 2135 }, { "epoch": 1.6024006001500375, "grad_norm": 0.16469474136829376, "learning_rate": 1.1567155992819678e-05, "loss": 0.0431, "step": 2136 }, { "epoch": 1.6031507876969242, "grad_norm": 0.22901701927185059, "learning_rate": 1.15253057724297e-05, "loss": 0.0562, "step": 2137 }, { "epoch": 1.603900975243811, "grad_norm": 0.26473501324653625, "learning_rate": 1.1483521532351654e-05, "loss": 0.0505, "step": 2138 }, { "epoch": 1.6046511627906976, "grad_norm": 0.14857040345668793, "learning_rate": 1.144180334424141e-05, "loss": 0.0343, "step": 2139 }, { "epoch": 1.6054013503375844, "grad_norm": 0.1499057114124298, "learning_rate": 1.1400151279641525e-05, "loss": 0.0259, "step": 2140 }, { "epoch": 1.606151537884471, "grad_norm": 0.1453072428703308, "learning_rate": 1.1358565409981203e-05, "loss": 0.0292, "step": 2141 }, { "epoch": 1.6069017254313578, "grad_norm": 0.18547675013542175, "learning_rate": 1.1317045806576121e-05, "loss": 0.039, "step": 2142 }, { "epoch": 1.6076519129782447, "grad_norm": 0.2546239197254181, "learning_rate": 1.12755925406283e-05, "loss": 0.0736, "step": 2143 }, { "epoch": 1.6084021005251312, "grad_norm": 0.1635519117116928, "learning_rate": 1.1234205683226012e-05, "loss": 0.0301, "step": 2144 }, { "epoch": 1.6091522880720182, "grad_norm": 0.14328426122665405, "learning_rate": 1.1192885305343648e-05, "loss": 0.0429, "step": 2145 }, { "epoch": 1.6099024756189046, "grad_norm": 0.21794961392879486, "learning_rate": 1.1151631477841584e-05, "loss": 0.0577, "step": 2146 }, { "epoch": 1.6106526631657916, "grad_norm": 0.19154462218284607, "learning_rate": 1.1110444271466086e-05, "loss": 0.046, "step": 2147 }, { "epoch": 1.611402850712678, "grad_norm": 0.16161414980888367, "learning_rate": 1.1069323756849126e-05, "loss": 0.0561, "step": 2148 }, { "epoch": 1.612153038259565, "grad_norm": 0.21912692487239838, "learning_rate": 1.102827000450835e-05, "loss": 0.0428, "step": 2149 }, { "epoch": 1.6129032258064515, "grad_norm": 0.20931370556354523, "learning_rate": 1.0987283084846905e-05, "loss": 0.0545, "step": 2150 }, { "epoch": 1.6136534133533385, "grad_norm": 0.16395579278469086, "learning_rate": 1.0946363068153343e-05, "loss": 0.0344, "step": 2151 }, { "epoch": 1.614403600900225, "grad_norm": 0.24834044277668, "learning_rate": 1.0905510024601423e-05, "loss": 0.0487, "step": 2152 }, { "epoch": 1.6151537884471119, "grad_norm": 0.15264178812503815, "learning_rate": 1.0864724024250106e-05, "loss": 0.0316, "step": 2153 }, { "epoch": 1.6159039759939984, "grad_norm": 0.10665671527385712, "learning_rate": 1.0824005137043375e-05, "loss": 0.0192, "step": 2154 }, { "epoch": 1.6166541635408853, "grad_norm": 0.17089134454727173, "learning_rate": 1.0783353432810106e-05, "loss": 0.0426, "step": 2155 }, { "epoch": 1.6174043510877718, "grad_norm": 0.22492165863513947, "learning_rate": 1.0742768981263984e-05, "loss": 0.0708, "step": 2156 }, { "epoch": 1.6181545386346587, "grad_norm": 0.11420857906341553, "learning_rate": 1.070225185200331e-05, "loss": 0.0254, "step": 2157 }, { "epoch": 1.6189047261815452, "grad_norm": 0.2135169953107834, "learning_rate": 1.0661802114511005e-05, "loss": 0.0719, "step": 2158 }, { "epoch": 1.6196549137284322, "grad_norm": 0.18792471289634705, "learning_rate": 1.062141983815439e-05, "loss": 0.0557, "step": 2159 }, { "epoch": 1.6204051012753187, "grad_norm": 0.17168648540973663, "learning_rate": 1.0581105092185062e-05, "loss": 0.0556, "step": 2160 }, { "epoch": 1.6211552888222056, "grad_norm": 0.15384042263031006, "learning_rate": 1.0540857945738852e-05, "loss": 0.0441, "step": 2161 }, { "epoch": 1.6219054763690923, "grad_norm": 0.11775700747966766, "learning_rate": 1.0500678467835662e-05, "loss": 0.0239, "step": 2162 }, { "epoch": 1.622655663915979, "grad_norm": 0.1715250313282013, "learning_rate": 1.0460566727379335e-05, "loss": 0.0487, "step": 2163 }, { "epoch": 1.6234058514628658, "grad_norm": 0.18916262686252594, "learning_rate": 1.0420522793157567e-05, "loss": 0.0467, "step": 2164 }, { "epoch": 1.6241560390097525, "grad_norm": 0.29144486784935, "learning_rate": 1.038054673384174e-05, "loss": 0.0472, "step": 2165 }, { "epoch": 1.6249062265566392, "grad_norm": 0.16755078732967377, "learning_rate": 1.0340638617986864e-05, "loss": 0.035, "step": 2166 }, { "epoch": 1.625656414103526, "grad_norm": 0.1979805827140808, "learning_rate": 1.030079851403144e-05, "loss": 0.0474, "step": 2167 }, { "epoch": 1.6264066016504126, "grad_norm": 0.21421858668327332, "learning_rate": 1.0261026490297315e-05, "loss": 0.0465, "step": 2168 }, { "epoch": 1.6271567891972993, "grad_norm": 0.19427688419818878, "learning_rate": 1.022132261498961e-05, "loss": 0.061, "step": 2169 }, { "epoch": 1.627906976744186, "grad_norm": 0.1771208941936493, "learning_rate": 1.0181686956196529e-05, "loss": 0.0451, "step": 2170 }, { "epoch": 1.6286571642910728, "grad_norm": 0.19580066204071045, "learning_rate": 1.0142119581889332e-05, "loss": 0.0619, "step": 2171 }, { "epoch": 1.6294073518379595, "grad_norm": 0.20414824783802032, "learning_rate": 1.0102620559922204e-05, "loss": 0.0409, "step": 2172 }, { "epoch": 1.6301575393848462, "grad_norm": 0.19486220180988312, "learning_rate": 1.0063189958032043e-05, "loss": 0.0449, "step": 2173 }, { "epoch": 1.630907726931733, "grad_norm": 0.27886173129081726, "learning_rate": 1.0023827843838457e-05, "loss": 0.0652, "step": 2174 }, { "epoch": 1.6316579144786196, "grad_norm": 0.17671063542366028, "learning_rate": 9.984534284843594e-06, "loss": 0.0487, "step": 2175 }, { "epoch": 1.6324081020255063, "grad_norm": 0.24155737459659576, "learning_rate": 9.945309348432047e-06, "loss": 0.057, "step": 2176 }, { "epoch": 1.633158289572393, "grad_norm": 0.14792385697364807, "learning_rate": 9.906153101870725e-06, "loss": 0.0376, "step": 2177 }, { "epoch": 1.6339084771192798, "grad_norm": 0.2489890605211258, "learning_rate": 9.867065612308713e-06, "loss": 0.0447, "step": 2178 }, { "epoch": 1.6346586646661665, "grad_norm": 0.16253811120986938, "learning_rate": 9.82804694677722e-06, "loss": 0.0548, "step": 2179 }, { "epoch": 1.6354088522130532, "grad_norm": 0.20979243516921997, "learning_rate": 9.78909717218941e-06, "loss": 0.0375, "step": 2180 }, { "epoch": 1.63615903975994, "grad_norm": 0.16433793306350708, "learning_rate": 9.75021635534033e-06, "loss": 0.0362, "step": 2181 }, { "epoch": 1.6369092273068269, "grad_norm": 0.13124553859233856, "learning_rate": 9.711404562906717e-06, "loss": 0.0303, "step": 2182 }, { "epoch": 1.6376594148537134, "grad_norm": 0.2880100905895233, "learning_rate": 9.672661861447002e-06, "loss": 0.0638, "step": 2183 }, { "epoch": 1.6384096024006003, "grad_norm": 0.2295902669429779, "learning_rate": 9.633988317401087e-06, "loss": 0.0447, "step": 2184 }, { "epoch": 1.6391597899474868, "grad_norm": 0.22860339283943176, "learning_rate": 9.595383997090302e-06, "loss": 0.0571, "step": 2185 }, { "epoch": 1.6399099774943737, "grad_norm": 0.2065700888633728, "learning_rate": 9.556848966717247e-06, "loss": 0.0612, "step": 2186 }, { "epoch": 1.6406601650412602, "grad_norm": 0.18000207841396332, "learning_rate": 9.518383292365713e-06, "loss": 0.0428, "step": 2187 }, { "epoch": 1.6414103525881472, "grad_norm": 0.18875566124916077, "learning_rate": 9.479987040000538e-06, "loss": 0.0407, "step": 2188 }, { "epoch": 1.6421605401350337, "grad_norm": 0.18922288715839386, "learning_rate": 9.441660275467512e-06, "loss": 0.0433, "step": 2189 }, { "epoch": 1.6429107276819206, "grad_norm": 0.19266372919082642, "learning_rate": 9.403403064493282e-06, "loss": 0.0424, "step": 2190 }, { "epoch": 1.643660915228807, "grad_norm": 0.18794898688793182, "learning_rate": 9.365215472685163e-06, "loss": 0.0674, "step": 2191 }, { "epoch": 1.644411102775694, "grad_norm": 0.25597575306892395, "learning_rate": 9.32709756553114e-06, "loss": 0.0435, "step": 2192 }, { "epoch": 1.6451612903225805, "grad_norm": 0.2104489505290985, "learning_rate": 9.289049408399659e-06, "loss": 0.0417, "step": 2193 }, { "epoch": 1.6459114778694675, "grad_norm": 0.17522476613521576, "learning_rate": 9.251071066539579e-06, "loss": 0.0411, "step": 2194 }, { "epoch": 1.646661665416354, "grad_norm": 0.21473297476768494, "learning_rate": 9.21316260507999e-06, "loss": 0.0542, "step": 2195 }, { "epoch": 1.6474118529632409, "grad_norm": 0.18432407081127167, "learning_rate": 9.175324089030185e-06, "loss": 0.0482, "step": 2196 }, { "epoch": 1.6481620405101274, "grad_norm": 0.18733084201812744, "learning_rate": 9.137555583279495e-06, "loss": 0.045, "step": 2197 }, { "epoch": 1.6489122280570143, "grad_norm": 0.23316556215286255, "learning_rate": 9.099857152597185e-06, "loss": 0.0437, "step": 2198 }, { "epoch": 1.6496624156039008, "grad_norm": 0.1609441488981247, "learning_rate": 9.062228861632354e-06, "loss": 0.0389, "step": 2199 }, { "epoch": 1.6504126031507877, "grad_norm": 0.1733076125383377, "learning_rate": 9.024670774913812e-06, "loss": 0.0471, "step": 2200 }, { "epoch": 1.6504126031507877, "eval_loss": 0.06824243068695068, "eval_runtime": 2.661, "eval_samples_per_second": 20.293, "eval_steps_per_second": 5.261, "step": 2200 }, { "epoch": 1.6511627906976745, "grad_norm": 0.2067050188779831, "learning_rate": 8.987182956849983e-06, "loss": 0.0539, "step": 2201 }, { "epoch": 1.6519129782445612, "grad_norm": 0.177916020154953, "learning_rate": 8.949765471728789e-06, "loss": 0.0442, "step": 2202 }, { "epoch": 1.652663165791448, "grad_norm": 0.25947102904319763, "learning_rate": 8.912418383717513e-06, "loss": 0.0599, "step": 2203 }, { "epoch": 1.6534133533383346, "grad_norm": 0.1599457561969757, "learning_rate": 8.875141756862749e-06, "loss": 0.0352, "step": 2204 }, { "epoch": 1.6541635408852213, "grad_norm": 0.24875929951667786, "learning_rate": 8.837935655090241e-06, "loss": 0.0419, "step": 2205 }, { "epoch": 1.654913728432108, "grad_norm": 0.14069050550460815, "learning_rate": 8.800800142204779e-06, "loss": 0.0283, "step": 2206 }, { "epoch": 1.6556639159789948, "grad_norm": 0.17972294986248016, "learning_rate": 8.763735281890133e-06, "loss": 0.0311, "step": 2207 }, { "epoch": 1.6564141035258815, "grad_norm": 0.13864688575267792, "learning_rate": 8.726741137708866e-06, "loss": 0.0264, "step": 2208 }, { "epoch": 1.6571642910727682, "grad_norm": 0.1534036248922348, "learning_rate": 8.689817773102293e-06, "loss": 0.0284, "step": 2209 }, { "epoch": 1.657914478619655, "grad_norm": 0.231153205037117, "learning_rate": 8.65296525139036e-06, "loss": 0.0596, "step": 2210 }, { "epoch": 1.6586646661665416, "grad_norm": 0.21296992897987366, "learning_rate": 8.616183635771525e-06, "loss": 0.0418, "step": 2211 }, { "epoch": 1.6594148537134283, "grad_norm": 0.1815291792154312, "learning_rate": 8.579472989322602e-06, "loss": 0.0419, "step": 2212 }, { "epoch": 1.660165041260315, "grad_norm": 0.16089360415935516, "learning_rate": 8.542833374998744e-06, "loss": 0.0344, "step": 2213 }, { "epoch": 1.6609152288072018, "grad_norm": 0.35123497247695923, "learning_rate": 8.5062648556333e-06, "loss": 0.0642, "step": 2214 }, { "epoch": 1.6616654163540885, "grad_norm": 0.15117251873016357, "learning_rate": 8.469767493937681e-06, "loss": 0.0468, "step": 2215 }, { "epoch": 1.6624156039009752, "grad_norm": 0.1996140033006668, "learning_rate": 8.43334135250125e-06, "loss": 0.0451, "step": 2216 }, { "epoch": 1.663165791447862, "grad_norm": 0.1861112117767334, "learning_rate": 8.39698649379126e-06, "loss": 0.0437, "step": 2217 }, { "epoch": 1.6639159789947486, "grad_norm": 0.21752230823040009, "learning_rate": 8.360702980152713e-06, "loss": 0.0522, "step": 2218 }, { "epoch": 1.6646661665416356, "grad_norm": 0.19835425913333893, "learning_rate": 8.32449087380826e-06, "loss": 0.0337, "step": 2219 }, { "epoch": 1.665416354088522, "grad_norm": 0.18913261592388153, "learning_rate": 8.288350236858117e-06, "loss": 0.0622, "step": 2220 }, { "epoch": 1.666166541635409, "grad_norm": 0.22755774855613708, "learning_rate": 8.252281131279887e-06, "loss": 0.0503, "step": 2221 }, { "epoch": 1.6669167291822955, "grad_norm": 0.14972837269306183, "learning_rate": 8.21628361892855e-06, "loss": 0.0353, "step": 2222 }, { "epoch": 1.6676669167291824, "grad_norm": 0.16990503668785095, "learning_rate": 8.180357761536296e-06, "loss": 0.0348, "step": 2223 }, { "epoch": 1.668417104276069, "grad_norm": 0.21821126341819763, "learning_rate": 8.14450362071244e-06, "loss": 0.0561, "step": 2224 }, { "epoch": 1.6691672918229559, "grad_norm": 0.2086024284362793, "learning_rate": 8.10872125794328e-06, "loss": 0.0665, "step": 2225 }, { "epoch": 1.6699174793698424, "grad_norm": 0.18031302094459534, "learning_rate": 8.073010734592057e-06, "loss": 0.0305, "step": 2226 }, { "epoch": 1.6706676669167293, "grad_norm": 0.14847469329833984, "learning_rate": 8.037372111898789e-06, "loss": 0.0218, "step": 2227 }, { "epoch": 1.6714178544636158, "grad_norm": 0.2060483992099762, "learning_rate": 8.001805450980249e-06, "loss": 0.0472, "step": 2228 }, { "epoch": 1.6721680420105027, "grad_norm": 0.11653721332550049, "learning_rate": 7.966310812829709e-06, "loss": 0.0248, "step": 2229 }, { "epoch": 1.6729182295573892, "grad_norm": 0.2690200209617615, "learning_rate": 7.930888258316998e-06, "loss": 0.0594, "step": 2230 }, { "epoch": 1.6736684171042762, "grad_norm": 0.2156667709350586, "learning_rate": 7.89553784818831e-06, "loss": 0.0542, "step": 2231 }, { "epoch": 1.6744186046511627, "grad_norm": 0.1282075047492981, "learning_rate": 7.860259643066126e-06, "loss": 0.0209, "step": 2232 }, { "epoch": 1.6751687921980496, "grad_norm": 0.186884343624115, "learning_rate": 7.82505370344907e-06, "loss": 0.0573, "step": 2233 }, { "epoch": 1.675918979744936, "grad_norm": 0.13679596781730652, "learning_rate": 7.789920089711871e-06, "loss": 0.0264, "step": 2234 }, { "epoch": 1.676669167291823, "grad_norm": 0.14465893805027008, "learning_rate": 7.754858862105224e-06, "loss": 0.0226, "step": 2235 }, { "epoch": 1.6774193548387095, "grad_norm": 0.16200248897075653, "learning_rate": 7.71987008075568e-06, "loss": 0.0292, "step": 2236 }, { "epoch": 1.6781695423855965, "grad_norm": 0.20817679166793823, "learning_rate": 7.684953805665562e-06, "loss": 0.0463, "step": 2237 }, { "epoch": 1.6789197299324832, "grad_norm": 0.14461885392665863, "learning_rate": 7.65011009671282e-06, "loss": 0.0439, "step": 2238 }, { "epoch": 1.67966991747937, "grad_norm": 0.17346525192260742, "learning_rate": 7.615339013651001e-06, "loss": 0.0569, "step": 2239 }, { "epoch": 1.6804201050262566, "grad_norm": 0.11375146359205246, "learning_rate": 7.580640616109081e-06, "loss": 0.0219, "step": 2240 }, { "epoch": 1.6811702925731433, "grad_norm": 0.1951054185628891, "learning_rate": 7.546014963591397e-06, "loss": 0.0417, "step": 2241 }, { "epoch": 1.68192048012003, "grad_norm": 0.18035517632961273, "learning_rate": 7.511462115477536e-06, "loss": 0.0473, "step": 2242 }, { "epoch": 1.6826706676669168, "grad_norm": 0.21403639018535614, "learning_rate": 7.476982131022231e-06, "loss": 0.0366, "step": 2243 }, { "epoch": 1.6834208552138035, "grad_norm": 0.23407121002674103, "learning_rate": 7.442575069355256e-06, "loss": 0.0387, "step": 2244 }, { "epoch": 1.6841710427606902, "grad_norm": 0.17773695290088654, "learning_rate": 7.408240989481347e-06, "loss": 0.0399, "step": 2245 }, { "epoch": 1.684921230307577, "grad_norm": 0.25798365473747253, "learning_rate": 7.373979950280046e-06, "loss": 0.0537, "step": 2246 }, { "epoch": 1.6856714178544636, "grad_norm": 0.16044406592845917, "learning_rate": 7.33979201050568e-06, "loss": 0.0296, "step": 2247 }, { "epoch": 1.6864216054013503, "grad_norm": 0.182396799325943, "learning_rate": 7.3056772287871886e-06, "loss": 0.0296, "step": 2248 }, { "epoch": 1.687171792948237, "grad_norm": 0.21489420533180237, "learning_rate": 7.2716356636280684e-06, "loss": 0.0528, "step": 2249 }, { "epoch": 1.6879219804951238, "grad_norm": 0.18872296810150146, "learning_rate": 7.237667373406259e-06, "loss": 0.0326, "step": 2250 }, { "epoch": 1.6886721680420105, "grad_norm": 0.16861742734909058, "learning_rate": 7.203772416374016e-06, "loss": 0.0558, "step": 2251 }, { "epoch": 1.6894223555888972, "grad_norm": 0.22231639921665192, "learning_rate": 7.1699508506578636e-06, "loss": 0.0459, "step": 2252 }, { "epoch": 1.690172543135784, "grad_norm": 0.1857728660106659, "learning_rate": 7.136202734258457e-06, "loss": 0.0422, "step": 2253 }, { "epoch": 1.6909227306826706, "grad_norm": 0.20071634650230408, "learning_rate": 7.1025281250505006e-06, "loss": 0.0436, "step": 2254 }, { "epoch": 1.6916729182295573, "grad_norm": 0.20031875371932983, "learning_rate": 7.0689270807826e-06, "loss": 0.0341, "step": 2255 }, { "epoch": 1.692423105776444, "grad_norm": 0.16592131555080414, "learning_rate": 7.035399659077268e-06, "loss": 0.0429, "step": 2256 }, { "epoch": 1.6931732933233308, "grad_norm": 0.2352314591407776, "learning_rate": 7.00194591743073e-06, "loss": 0.0437, "step": 2257 }, { "epoch": 1.6939234808702177, "grad_norm": 0.17028115689754486, "learning_rate": 6.96856591321286e-06, "loss": 0.0333, "step": 2258 }, { "epoch": 1.6946736684171042, "grad_norm": 0.19160404801368713, "learning_rate": 6.9352597036670575e-06, "loss": 0.0613, "step": 2259 }, { "epoch": 1.6954238559639911, "grad_norm": 0.17852510511875153, "learning_rate": 6.902027345910211e-06, "loss": 0.038, "step": 2260 }, { "epoch": 1.6961740435108776, "grad_norm": 0.1913464069366455, "learning_rate": 6.868868896932534e-06, "loss": 0.0611, "step": 2261 }, { "epoch": 1.6969242310577646, "grad_norm": 0.2039322406053543, "learning_rate": 6.835784413597512e-06, "loss": 0.0546, "step": 2262 }, { "epoch": 1.697674418604651, "grad_norm": 0.1932404786348343, "learning_rate": 6.802773952641761e-06, "loss": 0.0452, "step": 2263 }, { "epoch": 1.698424606151538, "grad_norm": 0.17494481801986694, "learning_rate": 6.769837570674975e-06, "loss": 0.0472, "step": 2264 }, { "epoch": 1.6991747936984245, "grad_norm": 0.1976795345544815, "learning_rate": 6.7369753241798114e-06, "loss": 0.0379, "step": 2265 }, { "epoch": 1.6999249812453114, "grad_norm": 0.18811482191085815, "learning_rate": 6.70418726951178e-06, "loss": 0.0518, "step": 2266 }, { "epoch": 1.700675168792198, "grad_norm": 0.30055856704711914, "learning_rate": 6.671473462899181e-06, "loss": 0.061, "step": 2267 }, { "epoch": 1.7014253563390849, "grad_norm": 0.23877814412117004, "learning_rate": 6.638833960442948e-06, "loss": 0.0423, "step": 2268 }, { "epoch": 1.7021755438859714, "grad_norm": 0.16119158267974854, "learning_rate": 6.606268818116618e-06, "loss": 0.0397, "step": 2269 }, { "epoch": 1.7029257314328583, "grad_norm": 0.2363862246274948, "learning_rate": 6.573778091766219e-06, "loss": 0.0566, "step": 2270 }, { "epoch": 1.7036759189797448, "grad_norm": 0.21147215366363525, "learning_rate": 6.541361837110149e-06, "loss": 0.0564, "step": 2271 }, { "epoch": 1.7044261065266317, "grad_norm": 0.12778374552726746, "learning_rate": 6.509020109739078e-06, "loss": 0.0354, "step": 2272 }, { "epoch": 1.7051762940735182, "grad_norm": 0.2169772982597351, "learning_rate": 6.476752965115884e-06, "loss": 0.0418, "step": 2273 }, { "epoch": 1.7059264816204052, "grad_norm": 0.14315977692604065, "learning_rate": 6.444560458575544e-06, "loss": 0.0353, "step": 2274 }, { "epoch": 1.7066766691672917, "grad_norm": 0.1578858345746994, "learning_rate": 6.412442645325057e-06, "loss": 0.0338, "step": 2275 }, { "epoch": 1.7074268567141786, "grad_norm": 0.13712339103221893, "learning_rate": 6.38039958044328e-06, "loss": 0.0278, "step": 2276 }, { "epoch": 1.7081770442610653, "grad_norm": 0.19122560322284698, "learning_rate": 6.3484313188809265e-06, "loss": 0.0372, "step": 2277 }, { "epoch": 1.708927231807952, "grad_norm": 0.22988127171993256, "learning_rate": 6.316537915460418e-06, "loss": 0.0561, "step": 2278 }, { "epoch": 1.7096774193548387, "grad_norm": 0.13822755217552185, "learning_rate": 6.284719424875796e-06, "loss": 0.0279, "step": 2279 }, { "epoch": 1.7104276069017255, "grad_norm": 0.1539996862411499, "learning_rate": 6.252975901692659e-06, "loss": 0.0439, "step": 2280 }, { "epoch": 1.7111777944486122, "grad_norm": 0.15642352402210236, "learning_rate": 6.221307400347992e-06, "loss": 0.0254, "step": 2281 }, { "epoch": 1.711927981995499, "grad_norm": 0.1779630482196808, "learning_rate": 6.1897139751501796e-06, "loss": 0.0535, "step": 2282 }, { "epoch": 1.7126781695423856, "grad_norm": 0.18841272592544556, "learning_rate": 6.158195680278816e-06, "loss": 0.049, "step": 2283 }, { "epoch": 1.7134283570892723, "grad_norm": 0.2371242642402649, "learning_rate": 6.126752569784694e-06, "loss": 0.0438, "step": 2284 }, { "epoch": 1.714178544636159, "grad_norm": 0.15389996767044067, "learning_rate": 6.095384697589635e-06, "loss": 0.033, "step": 2285 }, { "epoch": 1.7149287321830458, "grad_norm": 0.14900945127010345, "learning_rate": 6.064092117486464e-06, "loss": 0.0337, "step": 2286 }, { "epoch": 1.7156789197299325, "grad_norm": 0.13682608306407928, "learning_rate": 6.032874883138867e-06, "loss": 0.0327, "step": 2287 }, { "epoch": 1.7164291072768192, "grad_norm": 0.1934865266084671, "learning_rate": 6.001733048081337e-06, "loss": 0.0405, "step": 2288 }, { "epoch": 1.717179294823706, "grad_norm": 0.2710997462272644, "learning_rate": 5.970666665719033e-06, "loss": 0.0511, "step": 2289 }, { "epoch": 1.7179294823705926, "grad_norm": 0.2753269374370575, "learning_rate": 5.939675789327759e-06, "loss": 0.0586, "step": 2290 }, { "epoch": 1.7186796699174793, "grad_norm": 0.19517338275909424, "learning_rate": 5.908760472053809e-06, "loss": 0.0648, "step": 2291 }, { "epoch": 1.719429857464366, "grad_norm": 0.21337710320949554, "learning_rate": 5.877920766913919e-06, "loss": 0.0405, "step": 2292 }, { "epoch": 1.7201800450112528, "grad_norm": 0.20285245776176453, "learning_rate": 5.847156726795133e-06, "loss": 0.0519, "step": 2293 }, { "epoch": 1.7209302325581395, "grad_norm": 0.19896537065505981, "learning_rate": 5.816468404454755e-06, "loss": 0.064, "step": 2294 }, { "epoch": 1.7216804201050264, "grad_norm": 0.17619459331035614, "learning_rate": 5.7858558525202336e-06, "loss": 0.0305, "step": 2295 }, { "epoch": 1.722430607651913, "grad_norm": 0.1564476639032364, "learning_rate": 5.755319123489083e-06, "loss": 0.0303, "step": 2296 }, { "epoch": 1.7231807951987999, "grad_norm": 0.13188333809375763, "learning_rate": 5.724858269728789e-06, "loss": 0.0277, "step": 2297 }, { "epoch": 1.7239309827456863, "grad_norm": 0.24183008074760437, "learning_rate": 5.694473343476714e-06, "loss": 0.0393, "step": 2298 }, { "epoch": 1.7246811702925733, "grad_norm": 0.2262912541627884, "learning_rate": 5.664164396840016e-06, "loss": 0.0579, "step": 2299 }, { "epoch": 1.7254313578394598, "grad_norm": 0.19143463671207428, "learning_rate": 5.633931481795552e-06, "loss": 0.037, "step": 2300 }, { "epoch": 1.7261815453863467, "grad_norm": 0.1472475230693817, "learning_rate": 5.603774650189808e-06, "loss": 0.042, "step": 2301 }, { "epoch": 1.7269317329332332, "grad_norm": 0.20033307373523712, "learning_rate": 5.573693953738751e-06, "loss": 0.0484, "step": 2302 }, { "epoch": 1.7276819204801201, "grad_norm": 0.15765565633773804, "learning_rate": 5.543689444027839e-06, "loss": 0.0344, "step": 2303 }, { "epoch": 1.7284321080270066, "grad_norm": 0.13256795704364777, "learning_rate": 5.513761172511833e-06, "loss": 0.0288, "step": 2304 }, { "epoch": 1.7291822955738936, "grad_norm": 0.24713698029518127, "learning_rate": 5.483909190514797e-06, "loss": 0.0482, "step": 2305 }, { "epoch": 1.72993248312078, "grad_norm": 0.19771939516067505, "learning_rate": 5.4541335492299115e-06, "loss": 0.0409, "step": 2306 }, { "epoch": 1.730682670667667, "grad_norm": 0.1739976406097412, "learning_rate": 5.424434299719483e-06, "loss": 0.034, "step": 2307 }, { "epoch": 1.7314328582145535, "grad_norm": 0.13870416581630707, "learning_rate": 5.394811492914803e-06, "loss": 0.0357, "step": 2308 }, { "epoch": 1.7321830457614404, "grad_norm": 0.15554337203502655, "learning_rate": 5.365265179616063e-06, "loss": 0.0295, "step": 2309 }, { "epoch": 1.732933233308327, "grad_norm": 0.21074527502059937, "learning_rate": 5.3357954104922895e-06, "loss": 0.0365, "step": 2310 }, { "epoch": 1.7336834208552139, "grad_norm": 0.22298146784305573, "learning_rate": 5.306402236081209e-06, "loss": 0.0796, "step": 2311 }, { "epoch": 1.7344336084021004, "grad_norm": 0.12427302449941635, "learning_rate": 5.277085706789248e-06, "loss": 0.026, "step": 2312 }, { "epoch": 1.7351837959489873, "grad_norm": 0.20607198774814606, "learning_rate": 5.247845872891371e-06, "loss": 0.0478, "step": 2313 }, { "epoch": 1.735933983495874, "grad_norm": 0.22108227014541626, "learning_rate": 5.218682784530993e-06, "loss": 0.05, "step": 2314 }, { "epoch": 1.7366841710427607, "grad_norm": 0.17740093171596527, "learning_rate": 5.1895964917199445e-06, "loss": 0.0367, "step": 2315 }, { "epoch": 1.7374343585896475, "grad_norm": 0.2420615255832672, "learning_rate": 5.160587044338355e-06, "loss": 0.0568, "step": 2316 }, { "epoch": 1.7381845461365342, "grad_norm": 0.14381897449493408, "learning_rate": 5.131654492134574e-06, "loss": 0.0402, "step": 2317 }, { "epoch": 1.7389347336834209, "grad_norm": 0.1720701903104782, "learning_rate": 5.102798884725091e-06, "loss": 0.037, "step": 2318 }, { "epoch": 1.7396849212303076, "grad_norm": 0.14338922500610352, "learning_rate": 5.074020271594404e-06, "loss": 0.0385, "step": 2319 }, { "epoch": 1.7404351087771943, "grad_norm": 0.19156095385551453, "learning_rate": 5.045318702095014e-06, "loss": 0.0549, "step": 2320 }, { "epoch": 1.741185296324081, "grad_norm": 0.18174941837787628, "learning_rate": 5.016694225447288e-06, "loss": 0.04, "step": 2321 }, { "epoch": 1.7419354838709677, "grad_norm": 0.21480074524879456, "learning_rate": 4.988146890739381e-06, "loss": 0.0524, "step": 2322 }, { "epoch": 1.7426856714178545, "grad_norm": 0.14862346649169922, "learning_rate": 4.959676746927172e-06, "loss": 0.0345, "step": 2323 }, { "epoch": 1.7434358589647412, "grad_norm": 0.3776375651359558, "learning_rate": 4.931283842834139e-06, "loss": 0.033, "step": 2324 }, { "epoch": 1.744186046511628, "grad_norm": 0.21091584861278534, "learning_rate": 4.902968227151311e-06, "loss": 0.0448, "step": 2325 }, { "epoch": 1.7449362340585146, "grad_norm": 0.13311506807804108, "learning_rate": 4.874729948437218e-06, "loss": 0.0267, "step": 2326 }, { "epoch": 1.7456864216054013, "grad_norm": 0.1780560463666916, "learning_rate": 4.846569055117684e-06, "loss": 0.0307, "step": 2327 }, { "epoch": 1.746436609152288, "grad_norm": 0.19919991493225098, "learning_rate": 4.818485595485889e-06, "loss": 0.0704, "step": 2328 }, { "epoch": 1.7471867966991748, "grad_norm": 0.19128254055976868, "learning_rate": 4.790479617702198e-06, "loss": 0.051, "step": 2329 }, { "epoch": 1.7479369842460615, "grad_norm": 0.16919958591461182, "learning_rate": 4.762551169794105e-06, "loss": 0.0369, "step": 2330 }, { "epoch": 1.7486871717929482, "grad_norm": 0.19580717384815216, "learning_rate": 4.734700299656158e-06, "loss": 0.041, "step": 2331 }, { "epoch": 1.749437359339835, "grad_norm": 0.20923122763633728, "learning_rate": 4.706927055049837e-06, "loss": 0.0601, "step": 2332 }, { "epoch": 1.7501875468867216, "grad_norm": 0.157396599650383, "learning_rate": 4.6792314836035304e-06, "loss": 0.0315, "step": 2333 }, { "epoch": 1.7509377344336086, "grad_norm": 0.16064214706420898, "learning_rate": 4.651613632812413e-06, "loss": 0.0391, "step": 2334 }, { "epoch": 1.751687921980495, "grad_norm": 0.20322883129119873, "learning_rate": 4.624073550038399e-06, "loss": 0.0431, "step": 2335 }, { "epoch": 1.752438109527382, "grad_norm": 0.21442653238773346, "learning_rate": 4.596611282509989e-06, "loss": 0.0483, "step": 2336 }, { "epoch": 1.7531882970742685, "grad_norm": 0.16077186167240143, "learning_rate": 4.56922687732228e-06, "loss": 0.0501, "step": 2337 }, { "epoch": 1.7539384846211554, "grad_norm": 0.15106667578220367, "learning_rate": 4.5419203814368376e-06, "loss": 0.0357, "step": 2338 }, { "epoch": 1.754688672168042, "grad_norm": 0.1335969716310501, "learning_rate": 4.514691841681601e-06, "loss": 0.0464, "step": 2339 }, { "epoch": 1.7554388597149289, "grad_norm": 0.21206989884376526, "learning_rate": 4.487541304750848e-06, "loss": 0.0368, "step": 2340 }, { "epoch": 1.7561890472618154, "grad_norm": 0.1703171730041504, "learning_rate": 4.4604688172050605e-06, "loss": 0.0515, "step": 2341 }, { "epoch": 1.7569392348087023, "grad_norm": 0.26870593428611755, "learning_rate": 4.433474425470902e-06, "loss": 0.0706, "step": 2342 }, { "epoch": 1.7576894223555888, "grad_norm": 0.21271054446697235, "learning_rate": 4.406558175841097e-06, "loss": 0.0601, "step": 2343 }, { "epoch": 1.7584396099024757, "grad_norm": 0.19085806608200073, "learning_rate": 4.379720114474351e-06, "loss": 0.0365, "step": 2344 }, { "epoch": 1.7591897974493622, "grad_norm": 0.1780657321214676, "learning_rate": 4.352960287395303e-06, "loss": 0.0348, "step": 2345 }, { "epoch": 1.7599399849962492, "grad_norm": 0.14623548090457916, "learning_rate": 4.3262787404944165e-06, "loss": 0.0317, "step": 2346 }, { "epoch": 1.7606901725431356, "grad_norm": 0.13500511646270752, "learning_rate": 4.299675519527929e-06, "loss": 0.0256, "step": 2347 }, { "epoch": 1.7614403600900226, "grad_norm": 0.2227623611688614, "learning_rate": 4.273150670117743e-06, "loss": 0.0452, "step": 2348 }, { "epoch": 1.762190547636909, "grad_norm": 0.15160562098026276, "learning_rate": 4.246704237751342e-06, "loss": 0.0331, "step": 2349 }, { "epoch": 1.762940735183796, "grad_norm": 0.20232978463172913, "learning_rate": 4.220336267781777e-06, "loss": 0.0396, "step": 2350 }, { "epoch": 1.7636909227306825, "grad_norm": 0.1632014513015747, "learning_rate": 4.19404680542751e-06, "loss": 0.0444, "step": 2351 }, { "epoch": 1.7644411102775694, "grad_norm": 0.1549803912639618, "learning_rate": 4.167835895772382e-06, "loss": 0.0361, "step": 2352 }, { "epoch": 1.7651912978244562, "grad_norm": 0.24175381660461426, "learning_rate": 4.141703583765522e-06, "loss": 0.0434, "step": 2353 }, { "epoch": 1.7659414853713429, "grad_norm": 0.15828168392181396, "learning_rate": 4.11564991422127e-06, "loss": 0.0329, "step": 2354 }, { "epoch": 1.7666916729182296, "grad_norm": 0.26973676681518555, "learning_rate": 4.0896749318191095e-06, "loss": 0.0677, "step": 2355 }, { "epoch": 1.7674418604651163, "grad_norm": 0.2405744045972824, "learning_rate": 4.06377868110358e-06, "loss": 0.0429, "step": 2356 }, { "epoch": 1.768192048012003, "grad_norm": 0.1941184252500534, "learning_rate": 4.037961206484186e-06, "loss": 0.0471, "step": 2357 }, { "epoch": 1.7689422355588897, "grad_norm": 0.2016299068927765, "learning_rate": 4.0122225522353675e-06, "loss": 0.0427, "step": 2358 }, { "epoch": 1.7696924231057765, "grad_norm": 0.21357449889183044, "learning_rate": 3.986562762496376e-06, "loss": 0.0443, "step": 2359 }, { "epoch": 1.7704426106526632, "grad_norm": 0.279895156621933, "learning_rate": 3.9609818812712255e-06, "loss": 0.0583, "step": 2360 }, { "epoch": 1.77119279819955, "grad_norm": 0.132369726896286, "learning_rate": 3.935479952428611e-06, "loss": 0.0326, "step": 2361 }, { "epoch": 1.7719429857464366, "grad_norm": 0.20074398815631866, "learning_rate": 3.91005701970183e-06, "loss": 0.0426, "step": 2362 }, { "epoch": 1.7726931732933233, "grad_norm": 0.19614775478839874, "learning_rate": 3.8847131266886935e-06, "loss": 0.0303, "step": 2363 }, { "epoch": 1.77344336084021, "grad_norm": 0.14640925824642181, "learning_rate": 3.859448316851505e-06, "loss": 0.0429, "step": 2364 }, { "epoch": 1.7741935483870968, "grad_norm": 0.22066819667816162, "learning_rate": 3.834262633516916e-06, "loss": 0.0357, "step": 2365 }, { "epoch": 1.7749437359339835, "grad_norm": 0.17144808173179626, "learning_rate": 3.8091561198758897e-06, "loss": 0.0365, "step": 2366 }, { "epoch": 1.7756939234808702, "grad_norm": 0.16311363875865936, "learning_rate": 3.784128818983618e-06, "loss": 0.0312, "step": 2367 }, { "epoch": 1.776444111027757, "grad_norm": 0.14726579189300537, "learning_rate": 3.7591807737594743e-06, "loss": 0.033, "step": 2368 }, { "epoch": 1.7771942985746436, "grad_norm": 0.18850120902061462, "learning_rate": 3.734312026986897e-06, "loss": 0.0429, "step": 2369 }, { "epoch": 1.7779444861215303, "grad_norm": 0.19177739322185516, "learning_rate": 3.7095226213133272e-06, "loss": 0.059, "step": 2370 }, { "epoch": 1.7786946736684173, "grad_norm": 0.14402103424072266, "learning_rate": 3.6848125992501592e-06, "loss": 0.044, "step": 2371 }, { "epoch": 1.7794448612153038, "grad_norm": 0.18489781022071838, "learning_rate": 3.6601820031726517e-06, "loss": 0.0368, "step": 2372 }, { "epoch": 1.7801950487621907, "grad_norm": 0.1771378219127655, "learning_rate": 3.6356308753198454e-06, "loss": 0.0348, "step": 2373 }, { "epoch": 1.7809452363090772, "grad_norm": 0.1410515457391739, "learning_rate": 3.6111592577945217e-06, "loss": 0.032, "step": 2374 }, { "epoch": 1.7816954238559641, "grad_norm": 0.18638338148593903, "learning_rate": 3.586767192563073e-06, "loss": 0.0566, "step": 2375 }, { "epoch": 1.7824456114028506, "grad_norm": 0.21808239817619324, "learning_rate": 3.562454721455505e-06, "loss": 0.0447, "step": 2376 }, { "epoch": 1.7831957989497376, "grad_norm": 0.17375057935714722, "learning_rate": 3.538221886165299e-06, "loss": 0.0377, "step": 2377 }, { "epoch": 1.783945986496624, "grad_norm": 0.1853272020816803, "learning_rate": 3.514068728249398e-06, "loss": 0.0344, "step": 2378 }, { "epoch": 1.784696174043511, "grad_norm": 0.2159195989370346, "learning_rate": 3.489995289128073e-06, "loss": 0.0672, "step": 2379 }, { "epoch": 1.7854463615903975, "grad_norm": 0.1763429343700409, "learning_rate": 3.4660016100849126e-06, "loss": 0.0539, "step": 2380 }, { "epoch": 1.7861965491372844, "grad_norm": 0.13722042739391327, "learning_rate": 3.442087732266697e-06, "loss": 0.039, "step": 2381 }, { "epoch": 1.786946736684171, "grad_norm": 0.180751770734787, "learning_rate": 3.418253696683399e-06, "loss": 0.0433, "step": 2382 }, { "epoch": 1.7876969242310579, "grad_norm": 0.23207427561283112, "learning_rate": 3.3944995442080185e-06, "loss": 0.043, "step": 2383 }, { "epoch": 1.7884471117779444, "grad_norm": 0.1683093160390854, "learning_rate": 3.3708253155766033e-06, "loss": 0.0429, "step": 2384 }, { "epoch": 1.7891972993248313, "grad_norm": 0.19512560963630676, "learning_rate": 3.347231051388117e-06, "loss": 0.0385, "step": 2385 }, { "epoch": 1.7899474868717178, "grad_norm": 0.152205228805542, "learning_rate": 3.323716792104403e-06, "loss": 0.0306, "step": 2386 }, { "epoch": 1.7906976744186047, "grad_norm": 0.18546339869499207, "learning_rate": 3.3002825780500957e-06, "loss": 0.0524, "step": 2387 }, { "epoch": 1.7914478619654912, "grad_norm": 0.14806154370307922, "learning_rate": 3.276928449412564e-06, "loss": 0.0349, "step": 2388 }, { "epoch": 1.7921980495123782, "grad_norm": 0.2536125183105469, "learning_rate": 3.253654446241844e-06, "loss": 0.0513, "step": 2389 }, { "epoch": 1.7929482370592649, "grad_norm": 0.16887833178043365, "learning_rate": 3.2304606084505585e-06, "loss": 0.032, "step": 2390 }, { "epoch": 1.7936984246061516, "grad_norm": 0.2701931297779083, "learning_rate": 3.2073469758138577e-06, "loss": 0.0623, "step": 2391 }, { "epoch": 1.7944486121530383, "grad_norm": 0.14021989703178406, "learning_rate": 3.18431358796934e-06, "loss": 0.0415, "step": 2392 }, { "epoch": 1.795198799699925, "grad_norm": 0.14851094782352448, "learning_rate": 3.161360484416992e-06, "loss": 0.0322, "step": 2393 }, { "epoch": 1.7959489872468117, "grad_norm": 0.13025152683258057, "learning_rate": 3.1384877045191384e-06, "loss": 0.0304, "step": 2394 }, { "epoch": 1.7966991747936985, "grad_norm": 0.156709685921669, "learning_rate": 3.1156952875003365e-06, "loss": 0.0383, "step": 2395 }, { "epoch": 1.7974493623405852, "grad_norm": 0.12184840440750122, "learning_rate": 3.0929832724473416e-06, "loss": 0.0297, "step": 2396 }, { "epoch": 1.7981995498874719, "grad_norm": 0.20353339612483978, "learning_rate": 3.0703516983090207e-06, "loss": 0.0351, "step": 2397 }, { "epoch": 1.7989497374343586, "grad_norm": 0.26167547702789307, "learning_rate": 3.0478006038962947e-06, "loss": 0.0587, "step": 2398 }, { "epoch": 1.7996999249812453, "grad_norm": 0.1540125608444214, "learning_rate": 3.0253300278820783e-06, "loss": 0.0255, "step": 2399 }, { "epoch": 1.800450112528132, "grad_norm": 0.1433778554201126, "learning_rate": 3.002940008801186e-06, "loss": 0.0317, "step": 2400 }, { "epoch": 1.800450112528132, "eval_loss": 0.06825650483369827, "eval_runtime": 2.6517, "eval_samples_per_second": 20.364, "eval_steps_per_second": 5.28, "step": 2400 }, { "epoch": 1.8012003000750187, "grad_norm": 0.22410492599010468, "learning_rate": 2.9806305850502923e-06, "loss": 0.0542, "step": 2401 }, { "epoch": 1.8019504876219055, "grad_norm": 0.18377956748008728, "learning_rate": 2.9584017948878717e-06, "loss": 0.0591, "step": 2402 }, { "epoch": 1.8027006751687922, "grad_norm": 0.24742087721824646, "learning_rate": 2.9362536764341085e-06, "loss": 0.0444, "step": 2403 }, { "epoch": 1.803450862715679, "grad_norm": 0.25326278805732727, "learning_rate": 2.9141862676708486e-06, "loss": 0.1, "step": 2404 }, { "epoch": 1.8042010502625656, "grad_norm": 0.2263568788766861, "learning_rate": 2.8921996064415147e-06, "loss": 0.046, "step": 2405 }, { "epoch": 1.8049512378094523, "grad_norm": 0.2560195326805115, "learning_rate": 2.870293730451068e-06, "loss": 0.0505, "step": 2406 }, { "epoch": 1.805701425356339, "grad_norm": 0.1759989708662033, "learning_rate": 2.8484686772659308e-06, "loss": 0.0553, "step": 2407 }, { "epoch": 1.8064516129032258, "grad_norm": 0.18385818600654602, "learning_rate": 2.826724484313925e-06, "loss": 0.0432, "step": 2408 }, { "epoch": 1.8072018004501125, "grad_norm": 0.16650594770908356, "learning_rate": 2.8050611888841947e-06, "loss": 0.0517, "step": 2409 }, { "epoch": 1.8079519879969994, "grad_norm": 0.13436725735664368, "learning_rate": 2.7834788281271616e-06, "loss": 0.0384, "step": 2410 }, { "epoch": 1.808702175543886, "grad_norm": 0.18747705221176147, "learning_rate": 2.7619774390544473e-06, "loss": 0.0517, "step": 2411 }, { "epoch": 1.8094523630907728, "grad_norm": 0.13398858904838562, "learning_rate": 2.740557058538823e-06, "loss": 0.0302, "step": 2412 }, { "epoch": 1.8102025506376593, "grad_norm": 0.16094616055488586, "learning_rate": 2.7192177233141215e-06, "loss": 0.04, "step": 2413 }, { "epoch": 1.8109527381845463, "grad_norm": 0.2032438963651657, "learning_rate": 2.697959469975203e-06, "loss": 0.0617, "step": 2414 }, { "epoch": 1.8117029257314328, "grad_norm": 0.19508475065231323, "learning_rate": 2.6767823349778843e-06, "loss": 0.0347, "step": 2415 }, { "epoch": 1.8124531132783197, "grad_norm": 0.16064900159835815, "learning_rate": 2.65568635463887e-06, "loss": 0.0547, "step": 2416 }, { "epoch": 1.8132033008252062, "grad_norm": 0.15190844237804413, "learning_rate": 2.634671565135677e-06, "loss": 0.0505, "step": 2417 }, { "epoch": 1.8139534883720931, "grad_norm": 0.19380758702754974, "learning_rate": 2.613738002506605e-06, "loss": 0.0381, "step": 2418 }, { "epoch": 1.8147036759189796, "grad_norm": 0.19494427740573883, "learning_rate": 2.592885702650655e-06, "loss": 0.0454, "step": 2419 }, { "epoch": 1.8154538634658666, "grad_norm": 0.21446380019187927, "learning_rate": 2.572114701327466e-06, "loss": 0.0406, "step": 2420 }, { "epoch": 1.816204051012753, "grad_norm": 0.18711164593696594, "learning_rate": 2.551425034157262e-06, "loss": 0.036, "step": 2421 }, { "epoch": 1.81695423855964, "grad_norm": 0.18074631690979004, "learning_rate": 2.5308167366207724e-06, "loss": 0.0567, "step": 2422 }, { "epoch": 1.8177044261065265, "grad_norm": 0.13808584213256836, "learning_rate": 2.510289844059216e-06, "loss": 0.0336, "step": 2423 }, { "epoch": 1.8184546136534134, "grad_norm": 0.13104617595672607, "learning_rate": 2.48984439167419e-06, "loss": 0.0345, "step": 2424 }, { "epoch": 1.8192048012003, "grad_norm": 0.1713109016418457, "learning_rate": 2.4694804145276305e-06, "loss": 0.0499, "step": 2425 }, { "epoch": 1.8199549887471869, "grad_norm": 0.21308591961860657, "learning_rate": 2.449197947541737e-06, "loss": 0.0423, "step": 2426 }, { "epoch": 1.8207051762940734, "grad_norm": 0.27069994807243347, "learning_rate": 2.4289970254989635e-06, "loss": 0.0557, "step": 2427 }, { "epoch": 1.8214553638409603, "grad_norm": 0.2805314362049103, "learning_rate": 2.408877683041888e-06, "loss": 0.051, "step": 2428 }, { "epoch": 1.822205551387847, "grad_norm": 0.15674597024917603, "learning_rate": 2.388839954673222e-06, "loss": 0.0334, "step": 2429 }, { "epoch": 1.8229557389347337, "grad_norm": 0.17512424290180206, "learning_rate": 2.3688838747556674e-06, "loss": 0.0473, "step": 2430 }, { "epoch": 1.8237059264816204, "grad_norm": 0.15832626819610596, "learning_rate": 2.3490094775119597e-06, "loss": 0.0324, "step": 2431 }, { "epoch": 1.8244561140285072, "grad_norm": 0.1268201768398285, "learning_rate": 2.3292167970247193e-06, "loss": 0.0212, "step": 2432 }, { "epoch": 1.8252063015753939, "grad_norm": 0.2088364213705063, "learning_rate": 2.30950586723645e-06, "loss": 0.0473, "step": 2433 }, { "epoch": 1.8259564891222806, "grad_norm": 0.14922069013118744, "learning_rate": 2.2898767219494634e-06, "loss": 0.0403, "step": 2434 }, { "epoch": 1.8267066766691673, "grad_norm": 0.2907104194164276, "learning_rate": 2.270329394825793e-06, "loss": 0.0745, "step": 2435 }, { "epoch": 1.827456864216054, "grad_norm": 0.17265377938747406, "learning_rate": 2.2508639193871805e-06, "loss": 0.0498, "step": 2436 }, { "epoch": 1.8282070517629407, "grad_norm": 0.18557190895080566, "learning_rate": 2.2314803290150287e-06, "loss": 0.0367, "step": 2437 }, { "epoch": 1.8289572393098275, "grad_norm": 0.18080484867095947, "learning_rate": 2.2121786569502535e-06, "loss": 0.0378, "step": 2438 }, { "epoch": 1.8297074268567142, "grad_norm": 0.12578532099723816, "learning_rate": 2.192958936293338e-06, "loss": 0.0325, "step": 2439 }, { "epoch": 1.8304576144036009, "grad_norm": 0.19788827002048492, "learning_rate": 2.1738212000042e-06, "loss": 0.0461, "step": 2440 }, { "epoch": 1.8312078019504876, "grad_norm": 0.22011855244636536, "learning_rate": 2.1547654809021877e-06, "loss": 0.036, "step": 2441 }, { "epoch": 1.8319579894973743, "grad_norm": 0.14238829910755157, "learning_rate": 2.135791811665977e-06, "loss": 0.0434, "step": 2442 }, { "epoch": 1.832708177044261, "grad_norm": 0.14220482110977173, "learning_rate": 2.1169002248335346e-06, "loss": 0.031, "step": 2443 }, { "epoch": 1.8334583645911477, "grad_norm": 0.1597825586795807, "learning_rate": 2.098090752802073e-06, "loss": 0.0429, "step": 2444 }, { "epoch": 1.8342085521380345, "grad_norm": 0.19599536061286926, "learning_rate": 2.0793634278279907e-06, "loss": 0.0487, "step": 2445 }, { "epoch": 1.8349587396849212, "grad_norm": 0.27415987849235535, "learning_rate": 2.0607182820268133e-06, "loss": 0.0499, "step": 2446 }, { "epoch": 1.835708927231808, "grad_norm": 0.1647443026304245, "learning_rate": 2.042155347373109e-06, "loss": 0.0412, "step": 2447 }, { "epoch": 1.8364591147786946, "grad_norm": 0.22096386551856995, "learning_rate": 2.023674655700497e-06, "loss": 0.0383, "step": 2448 }, { "epoch": 1.8372093023255816, "grad_norm": 0.23002421855926514, "learning_rate": 2.0052762387015424e-06, "loss": 0.0431, "step": 2449 }, { "epoch": 1.837959489872468, "grad_norm": 0.18029820919036865, "learning_rate": 1.986960127927717e-06, "loss": 0.0334, "step": 2450 }, { "epoch": 1.838709677419355, "grad_norm": 0.20091702044010162, "learning_rate": 1.9687263547893407e-06, "loss": 0.0469, "step": 2451 }, { "epoch": 1.8394598649662415, "grad_norm": 0.2061494141817093, "learning_rate": 1.9505749505555503e-06, "loss": 0.0374, "step": 2452 }, { "epoch": 1.8402100525131284, "grad_norm": 0.20793215930461884, "learning_rate": 1.932505946354213e-06, "loss": 0.0526, "step": 2453 }, { "epoch": 1.840960240060015, "grad_norm": 0.19572290778160095, "learning_rate": 1.9145193731718858e-06, "loss": 0.04, "step": 2454 }, { "epoch": 1.8417104276069018, "grad_norm": 0.3863771855831146, "learning_rate": 1.8966152618537846e-06, "loss": 0.0791, "step": 2455 }, { "epoch": 1.8424606151537883, "grad_norm": 0.22557200491428375, "learning_rate": 1.8787936431036824e-06, "loss": 0.0407, "step": 2456 }, { "epoch": 1.8432108027006753, "grad_norm": 0.2332926243543625, "learning_rate": 1.8610545474839036e-06, "loss": 0.0614, "step": 2457 }, { "epoch": 1.8439609902475618, "grad_norm": 0.1692853420972824, "learning_rate": 1.8433980054152533e-06, "loss": 0.054, "step": 2458 }, { "epoch": 1.8447111777944487, "grad_norm": 0.1144590973854065, "learning_rate": 1.8258240471769662e-06, "loss": 0.0306, "step": 2459 }, { "epoch": 1.8454613653413352, "grad_norm": 0.2206297367811203, "learning_rate": 1.8083327029066399e-06, "loss": 0.0582, "step": 2460 }, { "epoch": 1.8462115528882221, "grad_norm": 0.20330913364887238, "learning_rate": 1.7909240026002138e-06, "loss": 0.0565, "step": 2461 }, { "epoch": 1.8469617404351086, "grad_norm": 0.15369729697704315, "learning_rate": 1.773597976111896e-06, "loss": 0.0292, "step": 2462 }, { "epoch": 1.8477119279819956, "grad_norm": 0.1898067593574524, "learning_rate": 1.7563546531541132e-06, "loss": 0.0451, "step": 2463 }, { "epoch": 1.848462115528882, "grad_norm": 0.14799977838993073, "learning_rate": 1.7391940632974667e-06, "loss": 0.0399, "step": 2464 }, { "epoch": 1.849212303075769, "grad_norm": 0.20645496249198914, "learning_rate": 1.7221162359706776e-06, "loss": 0.0455, "step": 2465 }, { "epoch": 1.8499624906226555, "grad_norm": 0.22738315165042877, "learning_rate": 1.705121200460541e-06, "loss": 0.0611, "step": 2466 }, { "epoch": 1.8507126781695424, "grad_norm": 0.24919851124286652, "learning_rate": 1.6882089859118766e-06, "loss": 0.0677, "step": 2467 }, { "epoch": 1.8514628657164292, "grad_norm": 0.18418778479099274, "learning_rate": 1.6713796213274457e-06, "loss": 0.0346, "step": 2468 }, { "epoch": 1.8522130532633159, "grad_norm": 0.20278531312942505, "learning_rate": 1.6546331355679623e-06, "loss": 0.0409, "step": 2469 }, { "epoch": 1.8529632408102026, "grad_norm": 0.14719854295253754, "learning_rate": 1.6379695573520093e-06, "loss": 0.0343, "step": 2470 }, { "epoch": 1.8537134283570893, "grad_norm": 0.210566908121109, "learning_rate": 1.621388915255967e-06, "loss": 0.0559, "step": 2471 }, { "epoch": 1.854463615903976, "grad_norm": 0.14764787256717682, "learning_rate": 1.604891237714018e-06, "loss": 0.0371, "step": 2472 }, { "epoch": 1.8552138034508627, "grad_norm": 0.14555113017559052, "learning_rate": 1.5884765530180478e-06, "loss": 0.0334, "step": 2473 }, { "epoch": 1.8559639909977494, "grad_norm": 0.17617011070251465, "learning_rate": 1.5721448893176228e-06, "loss": 0.0324, "step": 2474 }, { "epoch": 1.8567141785446362, "grad_norm": 0.18802230060100555, "learning_rate": 1.5558962746199335e-06, "loss": 0.0385, "step": 2475 }, { "epoch": 1.8574643660915229, "grad_norm": 0.1850004941225052, "learning_rate": 1.5397307367897684e-06, "loss": 0.0481, "step": 2476 }, { "epoch": 1.8582145536384096, "grad_norm": 0.19387805461883545, "learning_rate": 1.5236483035494297e-06, "loss": 0.0421, "step": 2477 }, { "epoch": 1.8589647411852963, "grad_norm": 0.1989627182483673, "learning_rate": 1.5076490024786893e-06, "loss": 0.0485, "step": 2478 }, { "epoch": 1.859714928732183, "grad_norm": 0.17542366683483124, "learning_rate": 1.4917328610147885e-06, "loss": 0.0465, "step": 2479 }, { "epoch": 1.8604651162790697, "grad_norm": 0.24785301089286804, "learning_rate": 1.4758999064523493e-06, "loss": 0.0493, "step": 2480 }, { "epoch": 1.8612153038259565, "grad_norm": 0.18737287819385529, "learning_rate": 1.4601501659433137e-06, "loss": 0.0435, "step": 2481 }, { "epoch": 1.8619654913728432, "grad_norm": 0.19134843349456787, "learning_rate": 1.444483666496943e-06, "loss": 0.0556, "step": 2482 }, { "epoch": 1.86271567891973, "grad_norm": 0.1329682469367981, "learning_rate": 1.4289004349797409e-06, "loss": 0.0333, "step": 2483 }, { "epoch": 1.8634658664666166, "grad_norm": 0.18284755945205688, "learning_rate": 1.4134004981154137e-06, "loss": 0.0554, "step": 2484 }, { "epoch": 1.8642160540135033, "grad_norm": 0.21650445461273193, "learning_rate": 1.3979838824848378e-06, "loss": 0.061, "step": 2485 }, { "epoch": 1.8649662415603903, "grad_norm": 0.22950206696987152, "learning_rate": 1.382650614525971e-06, "loss": 0.0483, "step": 2486 }, { "epoch": 1.8657164291072768, "grad_norm": 0.20865564048290253, "learning_rate": 1.3674007205338678e-06, "loss": 0.0511, "step": 2487 }, { "epoch": 1.8664666166541637, "grad_norm": 0.16846178472042084, "learning_rate": 1.3522342266605925e-06, "loss": 0.037, "step": 2488 }, { "epoch": 1.8672168042010502, "grad_norm": 0.19286170601844788, "learning_rate": 1.3371511589152008e-06, "loss": 0.0417, "step": 2489 }, { "epoch": 1.8679669917479371, "grad_norm": 0.12854991853237152, "learning_rate": 1.3221515431636522e-06, "loss": 0.0246, "step": 2490 }, { "epoch": 1.8687171792948236, "grad_norm": 0.2654399275779724, "learning_rate": 1.307235405128815e-06, "loss": 0.0648, "step": 2491 }, { "epoch": 1.8694673668417106, "grad_norm": 0.24505434930324554, "learning_rate": 1.292402770390394e-06, "loss": 0.0498, "step": 2492 }, { "epoch": 1.870217554388597, "grad_norm": 0.14539310336112976, "learning_rate": 1.2776536643849145e-06, "loss": 0.0391, "step": 2493 }, { "epoch": 1.870967741935484, "grad_norm": 0.15054713189601898, "learning_rate": 1.2629881124056274e-06, "loss": 0.0292, "step": 2494 }, { "epoch": 1.8717179294823705, "grad_norm": 0.228153795003891, "learning_rate": 1.2484061396025038e-06, "loss": 0.047, "step": 2495 }, { "epoch": 1.8724681170292574, "grad_norm": 0.16526199877262115, "learning_rate": 1.2339077709822067e-06, "loss": 0.0393, "step": 2496 }, { "epoch": 1.873218304576144, "grad_norm": 0.19541572034358978, "learning_rate": 1.2194930314080032e-06, "loss": 0.0607, "step": 2497 }, { "epoch": 1.8739684921230308, "grad_norm": 0.39976009726524353, "learning_rate": 1.2051619455997476e-06, "loss": 0.0486, "step": 2498 }, { "epoch": 1.8747186796699173, "grad_norm": 0.5087877511978149, "learning_rate": 1.1909145381338472e-06, "loss": 0.0762, "step": 2499 }, { "epoch": 1.8754688672168043, "grad_norm": 0.2975837290287018, "learning_rate": 1.1767508334431964e-06, "loss": 0.0469, "step": 2500 }, { "epoch": 1.8762190547636908, "grad_norm": 0.1391455978155136, "learning_rate": 1.1626708558171606e-06, "loss": 0.0439, "step": 2501 }, { "epoch": 1.8769692423105777, "grad_norm": 0.19531936943531036, "learning_rate": 1.1486746294015193e-06, "loss": 0.0499, "step": 2502 }, { "epoch": 1.8777194298574642, "grad_norm": 0.1554703414440155, "learning_rate": 1.134762178198412e-06, "loss": 0.0311, "step": 2503 }, { "epoch": 1.8784696174043511, "grad_norm": 0.22662873566150665, "learning_rate": 1.1209335260663256e-06, "loss": 0.0577, "step": 2504 }, { "epoch": 1.8792198049512379, "grad_norm": 0.17107880115509033, "learning_rate": 1.1071886967200352e-06, "loss": 0.0569, "step": 2505 }, { "epoch": 1.8799699924981246, "grad_norm": 0.1529870182275772, "learning_rate": 1.0935277137305744e-06, "loss": 0.0281, "step": 2506 }, { "epoch": 1.8807201800450113, "grad_norm": 0.17268817126750946, "learning_rate": 1.0799506005251814e-06, "loss": 0.0457, "step": 2507 }, { "epoch": 1.881470367591898, "grad_norm": 0.198935866355896, "learning_rate": 1.06645738038727e-06, "loss": 0.0576, "step": 2508 }, { "epoch": 1.8822205551387847, "grad_norm": 0.17006203532218933, "learning_rate": 1.053048076456381e-06, "loss": 0.0376, "step": 2509 }, { "epoch": 1.8829707426856714, "grad_norm": 0.17668931186199188, "learning_rate": 1.0397227117281528e-06, "loss": 0.0359, "step": 2510 }, { "epoch": 1.8837209302325582, "grad_norm": 0.17520001530647278, "learning_rate": 1.0264813090542725e-06, "loss": 0.0323, "step": 2511 }, { "epoch": 1.8844711177794449, "grad_norm": 0.1917439103126526, "learning_rate": 1.0133238911424426e-06, "loss": 0.0444, "step": 2512 }, { "epoch": 1.8852213053263316, "grad_norm": 0.21084263920783997, "learning_rate": 1.0002504805563362e-06, "loss": 0.0282, "step": 2513 }, { "epoch": 1.8859714928732183, "grad_norm": 0.12074773758649826, "learning_rate": 9.872610997155695e-07, "loss": 0.025, "step": 2514 }, { "epoch": 1.886721680420105, "grad_norm": 0.143777534365654, "learning_rate": 9.743557708956575e-07, "loss": 0.03, "step": 2515 }, { "epoch": 1.8874718679669917, "grad_norm": 0.16410160064697266, "learning_rate": 9.615345162279521e-07, "loss": 0.0295, "step": 2516 }, { "epoch": 1.8882220555138785, "grad_norm": 0.3826543688774109, "learning_rate": 9.48797357699649e-07, "loss": 0.0719, "step": 2517 }, { "epoch": 1.8889722430607652, "grad_norm": 0.17290951311588287, "learning_rate": 9.361443171537254e-07, "loss": 0.0338, "step": 2518 }, { "epoch": 1.8897224306076519, "grad_norm": 0.18427728116512299, "learning_rate": 9.235754162889021e-07, "loss": 0.0512, "step": 2519 }, { "epoch": 1.8904726181545386, "grad_norm": 0.2464424967765808, "learning_rate": 9.110906766595872e-07, "loss": 0.0512, "step": 2520 }, { "epoch": 1.8912228057014253, "grad_norm": 0.27707773447036743, "learning_rate": 8.986901196759046e-07, "loss": 0.0741, "step": 2521 }, { "epoch": 1.891972993248312, "grad_norm": 0.09056626260280609, "learning_rate": 8.863737666035765e-07, "loss": 0.0228, "step": 2522 }, { "epoch": 1.8927231807951987, "grad_norm": 0.25095298886299133, "learning_rate": 8.741416385639412e-07, "loss": 0.0811, "step": 2523 }, { "epoch": 1.8934733683420855, "grad_norm": 0.1794692426919937, "learning_rate": 8.619937565338854e-07, "loss": 0.0416, "step": 2524 }, { "epoch": 1.8942235558889724, "grad_norm": 0.16589999198913574, "learning_rate": 8.499301413458338e-07, "loss": 0.0418, "step": 2525 }, { "epoch": 1.894973743435859, "grad_norm": 0.2033083289861679, "learning_rate": 8.37950813687699e-07, "loss": 0.0453, "step": 2526 }, { "epoch": 1.8957239309827458, "grad_norm": 0.24125005304813385, "learning_rate": 8.26055794102848e-07, "loss": 0.0712, "step": 2527 }, { "epoch": 1.8964741185296323, "grad_norm": 0.1965634971857071, "learning_rate": 8.142451029900744e-07, "loss": 0.0377, "step": 2528 }, { "epoch": 1.8972243060765193, "grad_norm": 0.1771164983510971, "learning_rate": 8.025187606035434e-07, "loss": 0.0465, "step": 2529 }, { "epoch": 1.8979744936234058, "grad_norm": 0.2019067406654358, "learning_rate": 7.908767870527745e-07, "loss": 0.049, "step": 2530 }, { "epoch": 1.8987246811702927, "grad_norm": 0.1835569441318512, "learning_rate": 7.793192023026142e-07, "loss": 0.038, "step": 2531 }, { "epoch": 1.8994748687171792, "grad_norm": 0.2343943566083908, "learning_rate": 7.678460261731801e-07, "loss": 0.0752, "step": 2532 }, { "epoch": 1.9002250562640661, "grad_norm": 0.2090568095445633, "learning_rate": 7.564572783398339e-07, "loss": 0.0338, "step": 2533 }, { "epoch": 1.9009752438109526, "grad_norm": 0.18029850721359253, "learning_rate": 7.451529783331523e-07, "loss": 0.0318, "step": 2534 }, { "epoch": 1.9017254313578396, "grad_norm": 0.2427482008934021, "learning_rate": 7.339331455389175e-07, "loss": 0.0441, "step": 2535 }, { "epoch": 1.902475618904726, "grad_norm": 0.1632808893918991, "learning_rate": 7.227977991980217e-07, "loss": 0.0463, "step": 2536 }, { "epoch": 1.903225806451613, "grad_norm": 0.1503705233335495, "learning_rate": 7.117469584064895e-07, "loss": 0.0347, "step": 2537 }, { "epoch": 1.9039759939984995, "grad_norm": 0.19075416028499603, "learning_rate": 7.007806421154284e-07, "loss": 0.0268, "step": 2538 }, { "epoch": 1.9047261815453864, "grad_norm": 0.18026676774024963, "learning_rate": 6.898988691309893e-07, "loss": 0.0393, "step": 2539 }, { "epoch": 1.905476369092273, "grad_norm": 0.14918452501296997, "learning_rate": 6.791016581143395e-07, "loss": 0.0342, "step": 2540 }, { "epoch": 1.9062265566391599, "grad_norm": 0.2030014842748642, "learning_rate": 6.683890275816341e-07, "loss": 0.0436, "step": 2541 }, { "epoch": 1.9069767441860463, "grad_norm": 0.23770801723003387, "learning_rate": 6.577609959039776e-07, "loss": 0.0575, "step": 2542 }, { "epoch": 1.9077269317329333, "grad_norm": 0.2590319812297821, "learning_rate": 6.472175813074022e-07, "loss": 0.0656, "step": 2543 }, { "epoch": 1.90847711927982, "grad_norm": 0.14560408890247345, "learning_rate": 6.367588018728166e-07, "loss": 0.0402, "step": 2544 }, { "epoch": 1.9092273068267067, "grad_norm": 0.14348311722278595, "learning_rate": 6.263846755360126e-07, "loss": 0.0282, "step": 2545 }, { "epoch": 1.9099774943735934, "grad_norm": 0.1561354547739029, "learning_rate": 6.16095220087587e-07, "loss": 0.0336, "step": 2546 }, { "epoch": 1.9107276819204801, "grad_norm": 0.14579859375953674, "learning_rate": 6.05890453172936e-07, "loss": 0.0324, "step": 2547 }, { "epoch": 1.9114778694673669, "grad_norm": 0.20945526659488678, "learning_rate": 5.957703922922386e-07, "loss": 0.0547, "step": 2548 }, { "epoch": 1.9122280570142536, "grad_norm": 0.14774468541145325, "learning_rate": 5.857350548004015e-07, "loss": 0.0306, "step": 2549 }, { "epoch": 1.9129782445611403, "grad_norm": 0.1546287089586258, "learning_rate": 5.757844579070359e-07, "loss": 0.0278, "step": 2550 }, { "epoch": 1.913728432108027, "grad_norm": 0.13874396681785583, "learning_rate": 5.65918618676442e-07, "loss": 0.0287, "step": 2551 }, { "epoch": 1.9144786196549137, "grad_norm": 0.26407912373542786, "learning_rate": 5.561375540275581e-07, "loss": 0.0651, "step": 2552 }, { "epoch": 1.9152288072018004, "grad_norm": 0.19224029779434204, "learning_rate": 5.464412807339558e-07, "loss": 0.0499, "step": 2553 }, { "epoch": 1.9159789947486872, "grad_norm": 0.20610445737838745, "learning_rate": 5.368298154237727e-07, "loss": 0.0553, "step": 2554 }, { "epoch": 1.9167291822955739, "grad_norm": 0.19327408075332642, "learning_rate": 5.273031745797352e-07, "loss": 0.0324, "step": 2555 }, { "epoch": 1.9174793698424606, "grad_norm": 0.17420758306980133, "learning_rate": 5.17861374539097e-07, "loss": 0.0289, "step": 2556 }, { "epoch": 1.9182295573893473, "grad_norm": 0.2689003646373749, "learning_rate": 5.085044314936116e-07, "loss": 0.0423, "step": 2557 }, { "epoch": 1.918979744936234, "grad_norm": 0.1577097475528717, "learning_rate": 4.992323614895156e-07, "loss": 0.049, "step": 2558 }, { "epoch": 1.9197299324831207, "grad_norm": 0.222988560795784, "learning_rate": 4.900451804274898e-07, "loss": 0.0452, "step": 2559 }, { "epoch": 1.9204801200300075, "grad_norm": 0.20198635756969452, "learning_rate": 4.809429040626535e-07, "loss": 0.0437, "step": 2560 }, { "epoch": 1.9212303075768942, "grad_norm": 0.12086933106184006, "learning_rate": 4.719255480045148e-07, "loss": 0.0269, "step": 2561 }, { "epoch": 1.921980495123781, "grad_norm": 0.12228506803512573, "learning_rate": 4.6299312771694304e-07, "loss": 0.031, "step": 2562 }, { "epoch": 1.9227306826706676, "grad_norm": 0.2840142250061035, "learning_rate": 4.5414565851816806e-07, "loss": 0.0577, "step": 2563 }, { "epoch": 1.9234808702175545, "grad_norm": 0.19320012629032135, "learning_rate": 4.453831555807253e-07, "loss": 0.0426, "step": 2564 }, { "epoch": 1.924231057764441, "grad_norm": 0.14760589599609375, "learning_rate": 4.36705633931439e-07, "loss": 0.0313, "step": 2565 }, { "epoch": 1.924981245311328, "grad_norm": 0.18715044856071472, "learning_rate": 4.281131084514167e-07, "loss": 0.0486, "step": 2566 }, { "epoch": 1.9257314328582145, "grad_norm": 0.2232694923877716, "learning_rate": 4.196055938759824e-07, "loss": 0.0533, "step": 2567 }, { "epoch": 1.9264816204051014, "grad_norm": 0.21138830482959747, "learning_rate": 4.111831047946879e-07, "loss": 0.0425, "step": 2568 }, { "epoch": 1.927231807951988, "grad_norm": 0.29661694169044495, "learning_rate": 4.0284565565127384e-07, "loss": 0.0364, "step": 2569 }, { "epoch": 1.9279819954988748, "grad_norm": 0.23025383055210114, "learning_rate": 3.9459326074364756e-07, "loss": 0.0547, "step": 2570 }, { "epoch": 1.9287321830457613, "grad_norm": 0.13459983468055725, "learning_rate": 3.8642593422384965e-07, "loss": 0.032, "step": 2571 }, { "epoch": 1.9294823705926483, "grad_norm": 0.1266406923532486, "learning_rate": 3.7834369009804303e-07, "loss": 0.0324, "step": 2572 }, { "epoch": 1.9302325581395348, "grad_norm": 0.1843937188386917, "learning_rate": 3.703465422264796e-07, "loss": 0.0391, "step": 2573 }, { "epoch": 1.9309827456864217, "grad_norm": 0.15218639373779297, "learning_rate": 3.624345043234778e-07, "loss": 0.0339, "step": 2574 }, { "epoch": 1.9317329332333082, "grad_norm": 0.1591964066028595, "learning_rate": 3.5460758995741194e-07, "loss": 0.0285, "step": 2575 }, { "epoch": 1.9324831207801951, "grad_norm": 0.23621922731399536, "learning_rate": 3.468658125506563e-07, "loss": 0.0491, "step": 2576 }, { "epoch": 1.9332333083270816, "grad_norm": 0.14313896000385284, "learning_rate": 3.3920918537960754e-07, "loss": 0.0282, "step": 2577 }, { "epoch": 1.9339834958739686, "grad_norm": 0.18378767371177673, "learning_rate": 3.3163772157462357e-07, "loss": 0.0484, "step": 2578 }, { "epoch": 1.934733683420855, "grad_norm": 0.2192428708076477, "learning_rate": 3.241514341200236e-07, "loss": 0.0731, "step": 2579 }, { "epoch": 1.935483870967742, "grad_norm": 0.14537541568279266, "learning_rate": 3.1675033585404355e-07, "loss": 0.0304, "step": 2580 }, { "epoch": 1.9362340585146287, "grad_norm": 0.23168228566646576, "learning_rate": 3.0943443946884755e-07, "loss": 0.055, "step": 2581 }, { "epoch": 1.9369842460615154, "grad_norm": 0.15877023339271545, "learning_rate": 3.0220375751047194e-07, "loss": 0.0348, "step": 2582 }, { "epoch": 1.9377344336084021, "grad_norm": 0.21073149144649506, "learning_rate": 2.950583023788256e-07, "loss": 0.0505, "step": 2583 }, { "epoch": 1.9384846211552889, "grad_norm": 0.18246489763259888, "learning_rate": 2.879980863276621e-07, "loss": 0.0394, "step": 2584 }, { "epoch": 1.9392348087021756, "grad_norm": 0.15483985841274261, "learning_rate": 2.8102312146455755e-07, "loss": 0.0373, "step": 2585 }, { "epoch": 1.9399849962490623, "grad_norm": 0.21077927947044373, "learning_rate": 2.7413341975088824e-07, "loss": 0.0482, "step": 2586 }, { "epoch": 1.940735183795949, "grad_norm": 0.16964302957057953, "learning_rate": 2.6732899300180857e-07, "loss": 0.0316, "step": 2587 }, { "epoch": 1.9414853713428357, "grad_norm": 0.18473991751670837, "learning_rate": 2.606098528862566e-07, "loss": 0.0476, "step": 2588 }, { "epoch": 1.9422355588897224, "grad_norm": 0.1389751136302948, "learning_rate": 2.5397601092687627e-07, "loss": 0.0484, "step": 2589 }, { "epoch": 1.9429857464366092, "grad_norm": 0.1501632183790207, "learning_rate": 2.474274785000619e-07, "loss": 0.0339, "step": 2590 }, { "epoch": 1.9437359339834959, "grad_norm": 0.17686618864536285, "learning_rate": 2.40964266835908e-07, "loss": 0.0554, "step": 2591 }, { "epoch": 1.9444861215303826, "grad_norm": 0.13381214439868927, "learning_rate": 2.3458638701817636e-07, "loss": 0.0356, "step": 2592 }, { "epoch": 1.9452363090772693, "grad_norm": 0.15234093368053436, "learning_rate": 2.2829384998430681e-07, "loss": 0.034, "step": 2593 }, { "epoch": 1.945986496624156, "grad_norm": 0.1863105297088623, "learning_rate": 2.2208666652537846e-07, "loss": 0.0522, "step": 2594 }, { "epoch": 1.9467366841710427, "grad_norm": 0.1748838722705841, "learning_rate": 2.1596484728610421e-07, "loss": 0.0573, "step": 2595 }, { "epoch": 1.9474868717179294, "grad_norm": 0.23012542724609375, "learning_rate": 2.099284027647974e-07, "loss": 0.0449, "step": 2596 }, { "epoch": 1.9482370592648162, "grad_norm": 0.23254768550395966, "learning_rate": 2.039773433133718e-07, "loss": 0.0359, "step": 2597 }, { "epoch": 1.9489872468117029, "grad_norm": 0.24744318425655365, "learning_rate": 1.9811167913729723e-07, "loss": 0.0441, "step": 2598 }, { "epoch": 1.9497374343585896, "grad_norm": 0.12150046974420547, "learning_rate": 1.923314202956217e-07, "loss": 0.0217, "step": 2599 }, { "epoch": 1.9504876219054763, "grad_norm": 0.18914827704429626, "learning_rate": 1.8663657670091595e-07, "loss": 0.0489, "step": 2600 }, { "epoch": 1.9504876219054763, "eval_loss": 0.06815290451049805, "eval_runtime": 2.6569, "eval_samples_per_second": 20.324, "eval_steps_per_second": 5.269, "step": 2600 }, { "epoch": 1.9512378094523632, "grad_norm": 0.13075482845306396, "learning_rate": 1.810271581192735e-07, "loss": 0.026, "step": 2601 }, { "epoch": 1.9519879969992497, "grad_norm": 0.15196098387241364, "learning_rate": 1.755031741702995e-07, "loss": 0.0361, "step": 2602 }, { "epoch": 1.9527381845461367, "grad_norm": 0.19363312423229218, "learning_rate": 1.7006463432707177e-07, "loss": 0.0398, "step": 2603 }, { "epoch": 1.9534883720930232, "grad_norm": 0.13908173143863678, "learning_rate": 1.6471154791616317e-07, "loss": 0.0375, "step": 2604 }, { "epoch": 1.9542385596399101, "grad_norm": 0.15720535814762115, "learning_rate": 1.59443924117586e-07, "loss": 0.031, "step": 2605 }, { "epoch": 1.9549887471867966, "grad_norm": 0.1469358503818512, "learning_rate": 1.5426177196479207e-07, "loss": 0.0327, "step": 2606 }, { "epoch": 1.9557389347336835, "grad_norm": 0.20019660890102386, "learning_rate": 1.4916510034466702e-07, "loss": 0.073, "step": 2607 }, { "epoch": 1.95648912228057, "grad_norm": 0.15746329724788666, "learning_rate": 1.441539179974971e-07, "loss": 0.0534, "step": 2608 }, { "epoch": 1.957239309827457, "grad_norm": 0.20289811491966248, "learning_rate": 1.3922823351697479e-07, "loss": 0.0331, "step": 2609 }, { "epoch": 1.9579894973743435, "grad_norm": 0.2068672776222229, "learning_rate": 1.343880553501542e-07, "loss": 0.0318, "step": 2610 }, { "epoch": 1.9587396849212304, "grad_norm": 0.18808381259441376, "learning_rate": 1.2963339179746238e-07, "loss": 0.0378, "step": 2611 }, { "epoch": 1.959489872468117, "grad_norm": 0.20876070857048035, "learning_rate": 1.2496425101268804e-07, "loss": 0.064, "step": 2612 }, { "epoch": 1.9602400600150038, "grad_norm": 0.1189245879650116, "learning_rate": 1.2038064100294843e-07, "loss": 0.0237, "step": 2613 }, { "epoch": 1.9609902475618903, "grad_norm": 0.19520774483680725, "learning_rate": 1.158825696286725e-07, "loss": 0.0393, "step": 2614 }, { "epoch": 1.9617404351087773, "grad_norm": 0.1765730082988739, "learning_rate": 1.114700446036232e-07, "loss": 0.0356, "step": 2615 }, { "epoch": 1.9624906226556638, "grad_norm": 0.119378462433815, "learning_rate": 1.0714307349483089e-07, "loss": 0.031, "step": 2616 }, { "epoch": 1.9632408102025507, "grad_norm": 0.15803013741970062, "learning_rate": 1.029016637226432e-07, "loss": 0.0343, "step": 2617 }, { "epoch": 1.9639909977494372, "grad_norm": 0.1664055436849594, "learning_rate": 9.874582256064192e-08, "loss": 0.0353, "step": 2618 }, { "epoch": 1.9647411852963241, "grad_norm": 0.20048417150974274, "learning_rate": 9.46755571356983e-08, "loss": 0.0374, "step": 2619 }, { "epoch": 1.9654913728432108, "grad_norm": 0.21543757617473602, "learning_rate": 9.069087442791224e-08, "loss": 0.0706, "step": 2620 }, { "epoch": 1.9662415603900976, "grad_norm": 0.2072872519493103, "learning_rate": 8.679178127062871e-08, "loss": 0.0635, "step": 2621 }, { "epoch": 1.9669917479369843, "grad_norm": 0.19162490963935852, "learning_rate": 8.297828435039346e-08, "loss": 0.0351, "step": 2622 }, { "epoch": 1.967741935483871, "grad_norm": 0.17158183455467224, "learning_rate": 7.925039020699187e-08, "loss": 0.0377, "step": 2623 }, { "epoch": 1.9684921230307577, "grad_norm": 0.2609858810901642, "learning_rate": 7.56081052333879e-08, "loss": 0.0788, "step": 2624 }, { "epoch": 1.9692423105776444, "grad_norm": 0.19607380032539368, "learning_rate": 7.205143567574624e-08, "loss": 0.0528, "step": 2625 }, { "epoch": 1.9699924981245311, "grad_norm": 0.24832478165626526, "learning_rate": 6.858038763340458e-08, "loss": 0.0519, "step": 2626 }, { "epoch": 1.9707426856714179, "grad_norm": 0.24293671548366547, "learning_rate": 6.519496705886252e-08, "loss": 0.0559, "step": 2627 }, { "epoch": 1.9714928732183046, "grad_norm": 0.16258803009986877, "learning_rate": 6.189517975778713e-08, "loss": 0.0385, "step": 2628 }, { "epoch": 1.9722430607651913, "grad_norm": 0.2014719396829605, "learning_rate": 5.8681031388990724e-08, "loss": 0.0331, "step": 2629 }, { "epoch": 1.972993248312078, "grad_norm": 0.170815110206604, "learning_rate": 5.555252746441975e-08, "loss": 0.0476, "step": 2630 }, { "epoch": 1.9737434358589647, "grad_norm": 0.16516125202178955, "learning_rate": 5.25096733491548e-08, "loss": 0.0373, "step": 2631 }, { "epoch": 1.9744936234058514, "grad_norm": 0.1762745976448059, "learning_rate": 4.9552474261377326e-08, "loss": 0.0372, "step": 2632 }, { "epoch": 1.9752438109527382, "grad_norm": 0.1631755232810974, "learning_rate": 4.6680935272408465e-08, "loss": 0.0271, "step": 2633 }, { "epoch": 1.9759939984996249, "grad_norm": 0.16810788214206696, "learning_rate": 4.3895061306648e-08, "loss": 0.0458, "step": 2634 }, { "epoch": 1.9767441860465116, "grad_norm": 0.2130126953125, "learning_rate": 4.119485714159099e-08, "loss": 0.0483, "step": 2635 }, { "epoch": 1.9774943735933983, "grad_norm": 0.18357796967029572, "learning_rate": 3.8580327407827796e-08, "loss": 0.0476, "step": 2636 }, { "epoch": 1.978244561140285, "grad_norm": 0.23056019842624664, "learning_rate": 3.605147658901631e-08, "loss": 0.0403, "step": 2637 }, { "epoch": 1.978994748687172, "grad_norm": 0.24155446887016296, "learning_rate": 3.360830902189305e-08, "loss": 0.0845, "step": 2638 }, { "epoch": 1.9797449362340584, "grad_norm": 0.2481115311384201, "learning_rate": 3.125082889623987e-08, "loss": 0.0467, "step": 2639 }, { "epoch": 1.9804951237809454, "grad_norm": 0.3033842444419861, "learning_rate": 2.8979040254911717e-08, "loss": 0.0677, "step": 2640 }, { "epoch": 1.9812453113278319, "grad_norm": 0.1688585877418518, "learning_rate": 2.67929469937922e-08, "loss": 0.0429, "step": 2641 }, { "epoch": 1.9819954988747188, "grad_norm": 0.16313163936138153, "learning_rate": 2.4692552861826925e-08, "loss": 0.0346, "step": 2642 }, { "epoch": 1.9827456864216053, "grad_norm": 0.19703680276870728, "learning_rate": 2.2677861460984607e-08, "loss": 0.0381, "step": 2643 }, { "epoch": 1.9834958739684923, "grad_norm": 0.17000724375247955, "learning_rate": 2.074887624625155e-08, "loss": 0.0499, "step": 2644 }, { "epoch": 1.9842460615153787, "grad_norm": 0.15490411221981049, "learning_rate": 1.890560052565937e-08, "loss": 0.0384, "step": 2645 }, { "epoch": 1.9849962490622657, "grad_norm": 0.14818815886974335, "learning_rate": 1.7148037460235078e-08, "loss": 0.032, "step": 2646 }, { "epoch": 1.9857464366091522, "grad_norm": 0.29421237111091614, "learning_rate": 1.5476190064034334e-08, "loss": 0.0621, "step": 2647 }, { "epoch": 1.9864966241560391, "grad_norm": 0.15614813566207886, "learning_rate": 1.3890061204108185e-08, "loss": 0.0451, "step": 2648 }, { "epoch": 1.9872468117029256, "grad_norm": 0.24815501272678375, "learning_rate": 1.2389653600508588e-08, "loss": 0.0516, "step": 2649 }, { "epoch": 1.9879969992498125, "grad_norm": 0.2233879268169403, "learning_rate": 1.0974969826288428e-08, "loss": 0.0523, "step": 2650 }, { "epoch": 1.988747186796699, "grad_norm": 0.2333759218454361, "learning_rate": 9.646012307490405e-09, "loss": 0.0365, "step": 2651 }, { "epoch": 1.989497374343586, "grad_norm": 0.32244834303855896, "learning_rate": 8.402783323147034e-09, "loss": 0.0595, "step": 2652 }, { "epoch": 1.9902475618904725, "grad_norm": 0.21089965105056763, "learning_rate": 7.245285005275104e-09, "loss": 0.0433, "step": 2653 }, { "epoch": 1.9909977494373594, "grad_norm": 0.14350849390029907, "learning_rate": 6.1735193388701155e-09, "loss": 0.0364, "step": 2654 }, { "epoch": 1.991747936984246, "grad_norm": 0.1588405966758728, "learning_rate": 5.187488161895182e-09, "loss": 0.0222, "step": 2655 }, { "epoch": 1.9924981245311328, "grad_norm": 0.24085837602615356, "learning_rate": 4.28719316531434e-09, "loss": 0.0548, "step": 2656 }, { "epoch": 1.9932483120780196, "grad_norm": 0.18003256618976593, "learning_rate": 3.4726358930259328e-09, "loss": 0.051, "step": 2657 }, { "epoch": 1.9939984996249063, "grad_norm": 0.1899539977312088, "learning_rate": 2.743817741929222e-09, "loss": 0.0621, "step": 2658 }, { "epoch": 1.994748687171793, "grad_norm": 0.24146965146064758, "learning_rate": 2.1007399618688807e-09, "loss": 0.0355, "step": 2659 }, { "epoch": 1.9954988747186797, "grad_norm": 0.17824961245059967, "learning_rate": 1.543403655662745e-09, "loss": 0.0399, "step": 2660 }, { "epoch": 1.9962490622655664, "grad_norm": 0.18570539355278015, "learning_rate": 1.0718097790907156e-09, "loss": 0.0383, "step": 2661 }, { "epoch": 1.9969992498124531, "grad_norm": 0.12949424982070923, "learning_rate": 6.859591408836519e-10, "loss": 0.0241, "step": 2662 }, { "epoch": 1.9977494373593399, "grad_norm": 0.22268816828727722, "learning_rate": 3.8585240273447677e-10, "loss": 0.0601, "step": 2663 }, { "epoch": 1.9984996249062266, "grad_norm": 0.1774711310863495, "learning_rate": 1.7149007930927773e-10, "loss": 0.0538, "step": 2664 }, { "epoch": 1.9992498124531133, "grad_norm": 0.1890869289636612, "learning_rate": 4.2872538208449386e-11, "loss": 0.0301, "step": 2665 }, { "epoch": 2.0, "grad_norm": 0.1974925547838211, "learning_rate": 0.0, "loss": 0.051, "step": 2666 }, { "epoch": 2.0, "step": 2666, "total_flos": 4.5995952202737254e+17, "train_loss": 0.05864244963993323, "train_runtime": 1863.1592, "train_samples_per_second": 5.72, "train_steps_per_second": 1.431 } ], "logging_steps": 1, "max_steps": 2666, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.5995952202737254e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }