diff --git "a/checkpoint-1096/trainer_state.json" "b/checkpoint-1096/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1096/trainer_state.json" @@ -0,0 +1,7765 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9858705560619874, + "eval_steps": 137, + "global_step": 1096, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.07610916346311569, + "learning_rate": 2e-05, + "loss": 1.795, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 1.8087825775146484, + "eval_runtime": 75.9539, + "eval_samples_per_second": 65.829, + "eval_steps_per_second": 16.457, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.0771929994225502, + "learning_rate": 4e-05, + "loss": 1.7825, + "step": 2 + }, + { + "epoch": 0.01, + "grad_norm": 0.08941341191530228, + "learning_rate": 6e-05, + "loss": 1.7737, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 0.08335491269826889, + "learning_rate": 8e-05, + "loss": 1.8004, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 0.08835520595312119, + "learning_rate": 0.0001, + "loss": 1.8495, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 0.08816578984260559, + "learning_rate": 0.00012, + "loss": 1.7758, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 0.09536299854516983, + "learning_rate": 0.00014, + "loss": 1.8001, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 0.07634323835372925, + "learning_rate": 0.00016, + "loss": 1.7022, + "step": 8 + }, + { + "epoch": 0.02, + "grad_norm": 0.06886536628007889, + "learning_rate": 0.00018, + "loss": 1.8428, + "step": 9 + }, + { + "epoch": 0.02, + "grad_norm": 0.07389801740646362, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 10 + }, + { + "epoch": 0.02, + "grad_norm": 0.06829163432121277, + "learning_rate": 0.00019999981517295864, + "loss": 1.7479, + "step": 11 + }, + { + "epoch": 0.02, + "grad_norm": 0.060045819729566574, + "learning_rate": 0.0001999992606925178, + "loss": 1.7454, + "step": 12 + }, + { + "epoch": 0.02, + "grad_norm": 0.08187604695558548, + "learning_rate": 0.0001999983365607271, + "loss": 1.7679, + "step": 13 + }, + { + "epoch": 0.03, + "grad_norm": 0.05995490401983261, + "learning_rate": 0.00019999704278100263, + "loss": 1.7599, + "step": 14 + }, + { + "epoch": 0.03, + "grad_norm": 0.055336710065603256, + "learning_rate": 0.00019999537935812698, + "loss": 1.8244, + "step": 15 + }, + { + "epoch": 0.03, + "grad_norm": 0.0541992112994194, + "learning_rate": 0.00019999334629824895, + "loss": 1.7756, + "step": 16 + }, + { + "epoch": 0.03, + "grad_norm": 0.05088195204734802, + "learning_rate": 0.00019999094360888392, + "loss": 1.7352, + "step": 17 + }, + { + "epoch": 0.03, + "grad_norm": 0.05157861113548279, + "learning_rate": 0.00019998817129891346, + "loss": 1.7634, + "step": 18 + }, + { + "epoch": 0.03, + "grad_norm": 0.055710840970277786, + "learning_rate": 0.00019998502937858557, + "loss": 1.7802, + "step": 19 + }, + { + "epoch": 0.04, + "grad_norm": 0.055150121450424194, + "learning_rate": 0.00019998151785951448, + "loss": 1.7445, + "step": 20 + }, + { + "epoch": 0.04, + "grad_norm": 0.0526655912399292, + "learning_rate": 0.0001999776367546806, + "loss": 1.6634, + "step": 21 + }, + { + "epoch": 0.04, + "grad_norm": 0.04809674620628357, + "learning_rate": 0.00019997338607843075, + "loss": 1.7277, + "step": 22 + }, + { + "epoch": 0.04, + "grad_norm": 0.049412671476602554, + "learning_rate": 0.00019996876584647754, + "loss": 1.7357, + "step": 23 + }, + { + "epoch": 0.04, + "grad_norm": 0.04948608949780464, + "learning_rate": 0.00019996377607589997, + "loss": 1.7323, + "step": 24 + }, + { + "epoch": 0.05, + "grad_norm": 0.050225820392370224, + "learning_rate": 0.00019995841678514294, + "loss": 1.7273, + "step": 25 + }, + { + "epoch": 0.05, + "grad_norm": 0.05085042864084244, + "learning_rate": 0.00019995268799401718, + "loss": 1.7564, + "step": 26 + }, + { + "epoch": 0.05, + "grad_norm": 0.04916631057858467, + "learning_rate": 0.00019994658972369948, + "loss": 1.7439, + "step": 27 + }, + { + "epoch": 0.05, + "grad_norm": 0.04791415110230446, + "learning_rate": 0.00019994012199673234, + "loss": 1.6813, + "step": 28 + }, + { + "epoch": 0.05, + "grad_norm": 0.04975065216422081, + "learning_rate": 0.00019993328483702393, + "loss": 1.691, + "step": 29 + }, + { + "epoch": 0.05, + "grad_norm": 0.055913638323545456, + "learning_rate": 0.00019992607826984816, + "loss": 1.7242, + "step": 30 + }, + { + "epoch": 0.06, + "grad_norm": 0.045829374343156815, + "learning_rate": 0.00019991850232184435, + "loss": 1.7334, + "step": 31 + }, + { + "epoch": 0.06, + "grad_norm": 0.053105831146240234, + "learning_rate": 0.00019991055702101734, + "loss": 1.7214, + "step": 32 + }, + { + "epoch": 0.06, + "grad_norm": 0.04539350047707558, + "learning_rate": 0.00019990224239673722, + "loss": 1.7698, + "step": 33 + }, + { + "epoch": 0.06, + "grad_norm": 0.046983517706394196, + "learning_rate": 0.00019989355847973932, + "loss": 1.6887, + "step": 34 + }, + { + "epoch": 0.06, + "grad_norm": 0.0471692830324173, + "learning_rate": 0.00019988450530212414, + "loss": 1.7571, + "step": 35 + }, + { + "epoch": 0.07, + "grad_norm": 0.046874694526195526, + "learning_rate": 0.00019987508289735716, + "loss": 1.7558, + "step": 36 + }, + { + "epoch": 0.07, + "grad_norm": 0.04474163055419922, + "learning_rate": 0.00019986529130026857, + "loss": 1.7465, + "step": 37 + }, + { + "epoch": 0.07, + "grad_norm": 0.044651810079813004, + "learning_rate": 0.00019985513054705348, + "loss": 1.6983, + "step": 38 + }, + { + "epoch": 0.07, + "grad_norm": 0.04951983690261841, + "learning_rate": 0.00019984460067527153, + "loss": 1.761, + "step": 39 + }, + { + "epoch": 0.07, + "grad_norm": 0.04424133151769638, + "learning_rate": 0.00019983370172384682, + "loss": 1.6383, + "step": 40 + }, + { + "epoch": 0.07, + "grad_norm": 0.052418872714042664, + "learning_rate": 0.00019982243373306772, + "loss": 1.779, + "step": 41 + }, + { + "epoch": 0.08, + "grad_norm": 0.04530750587582588, + "learning_rate": 0.0001998107967445869, + "loss": 1.6942, + "step": 42 + }, + { + "epoch": 0.08, + "grad_norm": 0.04790988191962242, + "learning_rate": 0.0001997987908014209, + "loss": 1.7053, + "step": 43 + }, + { + "epoch": 0.08, + "grad_norm": 0.04889607056975365, + "learning_rate": 0.0001997864159479502, + "loss": 1.7275, + "step": 44 + }, + { + "epoch": 0.08, + "grad_norm": 0.04314807429909706, + "learning_rate": 0.00019977367222991893, + "loss": 1.7393, + "step": 45 + }, + { + "epoch": 0.08, + "grad_norm": 0.04405505582690239, + "learning_rate": 0.00019976055969443479, + "loss": 1.7306, + "step": 46 + }, + { + "epoch": 0.09, + "grad_norm": 0.04656574875116348, + "learning_rate": 0.00019974707838996882, + "loss": 1.7686, + "step": 47 + }, + { + "epoch": 0.09, + "grad_norm": 0.04246290400624275, + "learning_rate": 0.00019973322836635518, + "loss": 1.7209, + "step": 48 + }, + { + "epoch": 0.09, + "grad_norm": 0.05493748560547829, + "learning_rate": 0.00019971900967479106, + "loss": 1.7155, + "step": 49 + }, + { + "epoch": 0.09, + "grad_norm": 0.0450466088950634, + "learning_rate": 0.0001997044223678364, + "loss": 1.6604, + "step": 50 + }, + { + "epoch": 0.09, + "grad_norm": 0.08634985238313675, + "learning_rate": 0.00019968946649941382, + "loss": 1.7321, + "step": 51 + }, + { + "epoch": 0.09, + "grad_norm": 0.04310084879398346, + "learning_rate": 0.00019967414212480831, + "loss": 1.7281, + "step": 52 + }, + { + "epoch": 0.1, + "grad_norm": 0.04666193947196007, + "learning_rate": 0.000199658449300667, + "loss": 1.6787, + "step": 53 + }, + { + "epoch": 0.1, + "grad_norm": 0.04957772046327591, + "learning_rate": 0.00019964238808499907, + "loss": 1.6919, + "step": 54 + }, + { + "epoch": 0.1, + "grad_norm": 0.0421697273850441, + "learning_rate": 0.00019962595853717548, + "loss": 1.7245, + "step": 55 + }, + { + "epoch": 0.1, + "grad_norm": 0.04654068127274513, + "learning_rate": 0.0001996091607179287, + "loss": 1.7123, + "step": 56 + }, + { + "epoch": 0.1, + "grad_norm": 0.04076274484395981, + "learning_rate": 0.00019959199468935258, + "loss": 1.7066, + "step": 57 + }, + { + "epoch": 0.11, + "grad_norm": 0.04215634986758232, + "learning_rate": 0.00019957446051490198, + "loss": 1.7748, + "step": 58 + }, + { + "epoch": 0.11, + "grad_norm": 0.04252045601606369, + "learning_rate": 0.0001995565582593928, + "loss": 1.7396, + "step": 59 + }, + { + "epoch": 0.11, + "grad_norm": 0.04455842077732086, + "learning_rate": 0.00019953828798900135, + "loss": 1.7236, + "step": 60 + }, + { + "epoch": 0.11, + "grad_norm": 0.044083647429943085, + "learning_rate": 0.0001995196497712645, + "loss": 1.7416, + "step": 61 + }, + { + "epoch": 0.11, + "grad_norm": 0.04511955380439758, + "learning_rate": 0.00019950064367507916, + "loss": 1.7481, + "step": 62 + }, + { + "epoch": 0.11, + "grad_norm": 0.0424315445125103, + "learning_rate": 0.00019948126977070217, + "loss": 1.7712, + "step": 63 + }, + { + "epoch": 0.12, + "grad_norm": 0.04309271275997162, + "learning_rate": 0.00019946152812974993, + "loss": 1.6927, + "step": 64 + }, + { + "epoch": 0.12, + "grad_norm": 0.042915165424346924, + "learning_rate": 0.00019944141882519817, + "loss": 1.7465, + "step": 65 + }, + { + "epoch": 0.12, + "grad_norm": 0.05950941890478134, + "learning_rate": 0.00019942094193138186, + "loss": 1.7035, + "step": 66 + }, + { + "epoch": 0.12, + "grad_norm": 0.042048510164022446, + "learning_rate": 0.0001994000975239946, + "loss": 1.7521, + "step": 67 + }, + { + "epoch": 0.12, + "grad_norm": 0.041577938944101334, + "learning_rate": 0.00019937888568008862, + "loss": 1.7439, + "step": 68 + }, + { + "epoch": 0.13, + "grad_norm": 0.04538682475686073, + "learning_rate": 0.00019935730647807436, + "loss": 1.7528, + "step": 69 + }, + { + "epoch": 0.13, + "grad_norm": 0.04102981090545654, + "learning_rate": 0.00019933535999772025, + "loss": 1.6828, + "step": 70 + }, + { + "epoch": 0.13, + "grad_norm": 0.04318905994296074, + "learning_rate": 0.00019931304632015228, + "loss": 1.7532, + "step": 71 + }, + { + "epoch": 0.13, + "grad_norm": 0.043007493019104004, + "learning_rate": 0.00019929036552785397, + "loss": 1.7353, + "step": 72 + }, + { + "epoch": 0.13, + "grad_norm": 0.04308176040649414, + "learning_rate": 0.00019926731770466568, + "loss": 1.6882, + "step": 73 + }, + { + "epoch": 0.13, + "grad_norm": 0.04227353632450104, + "learning_rate": 0.00019924390293578472, + "loss": 1.7302, + "step": 74 + }, + { + "epoch": 0.14, + "grad_norm": 0.0429629310965538, + "learning_rate": 0.0001992201213077647, + "loss": 1.6822, + "step": 75 + }, + { + "epoch": 0.14, + "grad_norm": 0.042203355580568314, + "learning_rate": 0.00019919597290851538, + "loss": 1.7601, + "step": 76 + }, + { + "epoch": 0.14, + "grad_norm": 0.04265713319182396, + "learning_rate": 0.00019917145782730232, + "loss": 1.7725, + "step": 77 + }, + { + "epoch": 0.14, + "grad_norm": 0.04848012328147888, + "learning_rate": 0.00019914657615474653, + "loss": 1.7587, + "step": 78 + }, + { + "epoch": 0.14, + "grad_norm": 0.042650256305933, + "learning_rate": 0.00019912132798282408, + "loss": 1.7422, + "step": 79 + }, + { + "epoch": 0.15, + "grad_norm": 0.04107372462749481, + "learning_rate": 0.00019909571340486593, + "loss": 1.7059, + "step": 80 + }, + { + "epoch": 0.15, + "grad_norm": 0.04788720980286598, + "learning_rate": 0.00019906973251555734, + "loss": 1.7205, + "step": 81 + }, + { + "epoch": 0.15, + "grad_norm": 0.041231803596019745, + "learning_rate": 0.0001990433854109378, + "loss": 1.7277, + "step": 82 + }, + { + "epoch": 0.15, + "grad_norm": 0.04246293380856514, + "learning_rate": 0.0001990166721884004, + "loss": 1.7739, + "step": 83 + }, + { + "epoch": 0.15, + "grad_norm": 0.04331424832344055, + "learning_rate": 0.00019898959294669167, + "loss": 1.6913, + "step": 84 + }, + { + "epoch": 0.15, + "grad_norm": 0.04720227047801018, + "learning_rate": 0.00019896214778591115, + "loss": 1.7079, + "step": 85 + }, + { + "epoch": 0.16, + "grad_norm": 0.05255519971251488, + "learning_rate": 0.00019893433680751103, + "loss": 1.7182, + "step": 86 + }, + { + "epoch": 0.16, + "grad_norm": 0.042392294853925705, + "learning_rate": 0.00019890616011429568, + "loss": 1.778, + "step": 87 + }, + { + "epoch": 0.16, + "grad_norm": 0.043008286505937576, + "learning_rate": 0.0001988776178104214, + "loss": 1.7518, + "step": 88 + }, + { + "epoch": 0.16, + "grad_norm": 0.044135116040706635, + "learning_rate": 0.00019884871000139595, + "loss": 1.7534, + "step": 89 + }, + { + "epoch": 0.16, + "grad_norm": 0.041827455163002014, + "learning_rate": 0.00019881943679407832, + "loss": 1.7291, + "step": 90 + }, + { + "epoch": 0.17, + "grad_norm": 0.05515114963054657, + "learning_rate": 0.00019878979829667803, + "loss": 1.7471, + "step": 91 + }, + { + "epoch": 0.17, + "grad_norm": 0.040826503187417984, + "learning_rate": 0.00019875979461875503, + "loss": 1.6408, + "step": 92 + }, + { + "epoch": 0.17, + "grad_norm": 0.04585504159331322, + "learning_rate": 0.00019872942587121915, + "loss": 1.6874, + "step": 93 + }, + { + "epoch": 0.17, + "grad_norm": 0.04665527120232582, + "learning_rate": 0.00019869869216632968, + "loss": 1.6968, + "step": 94 + }, + { + "epoch": 0.17, + "grad_norm": 0.046703219413757324, + "learning_rate": 0.000198667593617695, + "loss": 1.7401, + "step": 95 + }, + { + "epoch": 0.18, + "grad_norm": 0.04115475341677666, + "learning_rate": 0.00019863613034027224, + "loss": 1.7227, + "step": 96 + }, + { + "epoch": 0.18, + "grad_norm": 0.04217168688774109, + "learning_rate": 0.00019860430245036663, + "loss": 1.7268, + "step": 97 + }, + { + "epoch": 0.18, + "grad_norm": 0.044889383018016815, + "learning_rate": 0.00019857211006563125, + "loss": 1.7006, + "step": 98 + }, + { + "epoch": 0.18, + "grad_norm": 0.04161443933844566, + "learning_rate": 0.00019853955330506663, + "loss": 1.7266, + "step": 99 + }, + { + "epoch": 0.18, + "grad_norm": 0.042708829045295715, + "learning_rate": 0.00019850663228902012, + "loss": 1.7314, + "step": 100 + }, + { + "epoch": 0.18, + "grad_norm": 0.046648308634757996, + "learning_rate": 0.00019847334713918557, + "loss": 1.7362, + "step": 101 + }, + { + "epoch": 0.19, + "grad_norm": 0.04414999857544899, + "learning_rate": 0.00019843969797860294, + "loss": 1.7065, + "step": 102 + }, + { + "epoch": 0.19, + "grad_norm": 0.04574083164334297, + "learning_rate": 0.00019840568493165772, + "loss": 1.7333, + "step": 103 + }, + { + "epoch": 0.19, + "grad_norm": 0.041924796998500824, + "learning_rate": 0.0001983713081240805, + "loss": 1.6517, + "step": 104 + }, + { + "epoch": 0.19, + "grad_norm": 0.04238827899098396, + "learning_rate": 0.00019833656768294662, + "loss": 1.776, + "step": 105 + }, + { + "epoch": 0.19, + "grad_norm": 0.04292167350649834, + "learning_rate": 0.00019830146373667548, + "loss": 1.6601, + "step": 106 + }, + { + "epoch": 0.2, + "grad_norm": 0.0433412566781044, + "learning_rate": 0.00019826599641503025, + "loss": 1.6841, + "step": 107 + }, + { + "epoch": 0.2, + "grad_norm": 0.04201202839612961, + "learning_rate": 0.00019823016584911735, + "loss": 1.764, + "step": 108 + }, + { + "epoch": 0.2, + "grad_norm": 0.04234587028622627, + "learning_rate": 0.00019819397217138595, + "loss": 1.7243, + "step": 109 + }, + { + "epoch": 0.2, + "grad_norm": 0.04268571734428406, + "learning_rate": 0.0001981574155156274, + "loss": 1.7656, + "step": 110 + }, + { + "epoch": 0.2, + "grad_norm": 0.041506245732307434, + "learning_rate": 0.00019812049601697492, + "loss": 1.6636, + "step": 111 + }, + { + "epoch": 0.2, + "grad_norm": 0.04152766987681389, + "learning_rate": 0.00019808321381190294, + "loss": 1.7478, + "step": 112 + }, + { + "epoch": 0.21, + "grad_norm": 0.041750356554985046, + "learning_rate": 0.00019804556903822663, + "loss": 1.7518, + "step": 113 + }, + { + "epoch": 0.21, + "grad_norm": 0.04935223609209061, + "learning_rate": 0.00019800756183510144, + "loss": 1.7673, + "step": 114 + }, + { + "epoch": 0.21, + "grad_norm": 0.042300984263420105, + "learning_rate": 0.00019796919234302255, + "loss": 1.7753, + "step": 115 + }, + { + "epoch": 0.21, + "grad_norm": 0.04224342852830887, + "learning_rate": 0.00019793046070382437, + "loss": 1.7226, + "step": 116 + }, + { + "epoch": 0.21, + "grad_norm": 0.044274065643548965, + "learning_rate": 0.00019789136706067998, + "loss": 1.7065, + "step": 117 + }, + { + "epoch": 0.22, + "grad_norm": 0.04910755529999733, + "learning_rate": 0.00019785191155810062, + "loss": 1.6387, + "step": 118 + }, + { + "epoch": 0.22, + "grad_norm": 0.04774147644639015, + "learning_rate": 0.00019781209434193515, + "loss": 1.7297, + "step": 119 + }, + { + "epoch": 0.22, + "grad_norm": 0.04416586086153984, + "learning_rate": 0.00019777191555936957, + "loss": 1.8096, + "step": 120 + }, + { + "epoch": 0.22, + "grad_norm": 0.04406105354428291, + "learning_rate": 0.00019773137535892635, + "loss": 1.7629, + "step": 121 + }, + { + "epoch": 0.22, + "grad_norm": 0.043473679572343826, + "learning_rate": 0.00019769047389046402, + "loss": 1.6979, + "step": 122 + }, + { + "epoch": 0.22, + "grad_norm": 0.04570621997117996, + "learning_rate": 0.00019764921130517653, + "loss": 1.7123, + "step": 123 + }, + { + "epoch": 0.23, + "grad_norm": 0.04326749965548515, + "learning_rate": 0.00019760758775559274, + "loss": 1.716, + "step": 124 + }, + { + "epoch": 0.23, + "grad_norm": 0.04397182539105415, + "learning_rate": 0.00019756560339557572, + "loss": 1.73, + "step": 125 + }, + { + "epoch": 0.23, + "grad_norm": 0.04468885809183121, + "learning_rate": 0.00019752325838032244, + "loss": 1.7136, + "step": 126 + }, + { + "epoch": 0.23, + "grad_norm": 0.04554520919919014, + "learning_rate": 0.00019748055286636295, + "loss": 1.7448, + "step": 127 + }, + { + "epoch": 0.23, + "grad_norm": 0.04646708443760872, + "learning_rate": 0.00019743748701155995, + "loss": 1.6956, + "step": 128 + }, + { + "epoch": 0.24, + "grad_norm": 0.042717937380075455, + "learning_rate": 0.00019739406097510812, + "loss": 1.7245, + "step": 129 + }, + { + "epoch": 0.24, + "grad_norm": 0.04367038235068321, + "learning_rate": 0.00019735027491753353, + "loss": 1.7102, + "step": 130 + }, + { + "epoch": 0.24, + "grad_norm": 0.04296841099858284, + "learning_rate": 0.0001973061290006932, + "loss": 1.7163, + "step": 131 + }, + { + "epoch": 0.24, + "grad_norm": 0.043665811419487, + "learning_rate": 0.00019726162338777424, + "loss": 1.7172, + "step": 132 + }, + { + "epoch": 0.24, + "grad_norm": 0.046134624630212784, + "learning_rate": 0.00019721675824329354, + "loss": 1.7327, + "step": 133 + }, + { + "epoch": 0.24, + "grad_norm": 0.04857848584651947, + "learning_rate": 0.00019717153373309692, + "loss": 1.6647, + "step": 134 + }, + { + "epoch": 0.25, + "grad_norm": 0.047723885625600815, + "learning_rate": 0.00019712595002435861, + "loss": 1.7422, + "step": 135 + }, + { + "epoch": 0.25, + "grad_norm": 0.04413154348731041, + "learning_rate": 0.00019708000728558064, + "loss": 1.6943, + "step": 136 + }, + { + "epoch": 0.25, + "grad_norm": 0.043105412274599075, + "learning_rate": 0.00019703370568659225, + "loss": 1.7519, + "step": 137 + }, + { + "epoch": 0.25, + "eval_loss": 1.7284438610076904, + "eval_runtime": 76.3963, + "eval_samples_per_second": 65.448, + "eval_steps_per_second": 16.362, + "step": 137 + }, + { + "epoch": 0.25, + "grad_norm": 0.04300757125020027, + "learning_rate": 0.00019698704539854918, + "loss": 1.7341, + "step": 138 + }, + { + "epoch": 0.25, + "grad_norm": 0.043961744755506516, + "learning_rate": 0.00019694002659393305, + "loss": 1.777, + "step": 139 + }, + { + "epoch": 0.26, + "grad_norm": 0.04376057907938957, + "learning_rate": 0.00019689264944655084, + "loss": 1.7403, + "step": 140 + }, + { + "epoch": 0.26, + "grad_norm": 0.04482461139559746, + "learning_rate": 0.00019684491413153411, + "loss": 1.6852, + "step": 141 + }, + { + "epoch": 0.26, + "grad_norm": 0.045192863792181015, + "learning_rate": 0.0001967968208253384, + "loss": 1.7494, + "step": 142 + }, + { + "epoch": 0.26, + "grad_norm": 0.04361759498715401, + "learning_rate": 0.00019674836970574254, + "loss": 1.7331, + "step": 143 + }, + { + "epoch": 0.26, + "grad_norm": 0.04294734448194504, + "learning_rate": 0.0001966995609518481, + "loss": 1.6375, + "step": 144 + }, + { + "epoch": 0.26, + "grad_norm": 0.04528161138296127, + "learning_rate": 0.00019665039474407863, + "loss": 1.746, + "step": 145 + }, + { + "epoch": 0.27, + "grad_norm": 0.04510699212551117, + "learning_rate": 0.00019660087126417906, + "loss": 1.7053, + "step": 146 + }, + { + "epoch": 0.27, + "grad_norm": 0.042807720601558685, + "learning_rate": 0.00019655099069521486, + "loss": 1.6748, + "step": 147 + }, + { + "epoch": 0.27, + "grad_norm": 0.04657953232526779, + "learning_rate": 0.00019650075322157168, + "loss": 1.684, + "step": 148 + }, + { + "epoch": 0.27, + "grad_norm": 0.04593012481927872, + "learning_rate": 0.00019645015902895437, + "loss": 1.7076, + "step": 149 + }, + { + "epoch": 0.27, + "grad_norm": 0.04362139105796814, + "learning_rate": 0.0001963992083043864, + "loss": 1.6773, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 0.04773354157805443, + "learning_rate": 0.00019634790123620926, + "loss": 1.7107, + "step": 151 + }, + { + "epoch": 0.28, + "grad_norm": 0.05423569679260254, + "learning_rate": 0.00019629623801408155, + "loss": 1.7052, + "step": 152 + }, + { + "epoch": 0.28, + "grad_norm": 0.043550509959459305, + "learning_rate": 0.00019624421882897855, + "loss": 1.7151, + "step": 153 + }, + { + "epoch": 0.28, + "grad_norm": 0.04896851256489754, + "learning_rate": 0.00019619184387319123, + "loss": 1.6611, + "step": 154 + }, + { + "epoch": 0.28, + "grad_norm": 0.04392845928668976, + "learning_rate": 0.00019613911334032583, + "loss": 1.738, + "step": 155 + }, + { + "epoch": 0.28, + "grad_norm": 0.04582325741648674, + "learning_rate": 0.00019608602742530283, + "loss": 1.6885, + "step": 156 + }, + { + "epoch": 0.29, + "grad_norm": 0.045696284621953964, + "learning_rate": 0.00019603258632435656, + "loss": 1.7365, + "step": 157 + }, + { + "epoch": 0.29, + "grad_norm": 0.043873440474271774, + "learning_rate": 0.00019597879023503417, + "loss": 1.8094, + "step": 158 + }, + { + "epoch": 0.29, + "grad_norm": 0.05078018456697464, + "learning_rate": 0.00019592463935619517, + "loss": 1.7341, + "step": 159 + }, + { + "epoch": 0.29, + "grad_norm": 0.042483873665332794, + "learning_rate": 0.00019587013388801047, + "loss": 1.7351, + "step": 160 + }, + { + "epoch": 0.29, + "grad_norm": 0.045154914259910583, + "learning_rate": 0.00019581527403196168, + "loss": 1.6645, + "step": 161 + }, + { + "epoch": 0.3, + "grad_norm": 0.04563280567526817, + "learning_rate": 0.0001957600599908406, + "loss": 1.7069, + "step": 162 + }, + { + "epoch": 0.3, + "grad_norm": 0.0451313816010952, + "learning_rate": 0.00019570449196874815, + "loss": 1.7392, + "step": 163 + }, + { + "epoch": 0.3, + "grad_norm": 0.04682654142379761, + "learning_rate": 0.0001956485701710938, + "loss": 1.6987, + "step": 164 + }, + { + "epoch": 0.3, + "grad_norm": 0.04211273416876793, + "learning_rate": 0.00019559229480459474, + "loss": 1.6973, + "step": 165 + }, + { + "epoch": 0.3, + "grad_norm": 0.04460490494966507, + "learning_rate": 0.00019553566607727517, + "loss": 1.7233, + "step": 166 + }, + { + "epoch": 0.3, + "grad_norm": 0.044608812779188156, + "learning_rate": 0.00019547868419846548, + "loss": 1.7371, + "step": 167 + }, + { + "epoch": 0.31, + "grad_norm": 0.04518236592411995, + "learning_rate": 0.00019542134937880154, + "loss": 1.7257, + "step": 168 + }, + { + "epoch": 0.31, + "grad_norm": 0.04374237731099129, + "learning_rate": 0.00019536366183022384, + "loss": 1.7136, + "step": 169 + }, + { + "epoch": 0.31, + "grad_norm": 0.04429790750145912, + "learning_rate": 0.00019530562176597673, + "loss": 1.7216, + "step": 170 + }, + { + "epoch": 0.31, + "grad_norm": 0.04807354509830475, + "learning_rate": 0.0001952472294006077, + "loss": 1.6568, + "step": 171 + }, + { + "epoch": 0.31, + "grad_norm": 0.04785493016242981, + "learning_rate": 0.00019518848494996655, + "loss": 1.7272, + "step": 172 + }, + { + "epoch": 0.32, + "grad_norm": 0.04472104460000992, + "learning_rate": 0.0001951293886312045, + "loss": 1.7283, + "step": 173 + }, + { + "epoch": 0.32, + "grad_norm": 0.04852326214313507, + "learning_rate": 0.00019506994066277348, + "loss": 1.6968, + "step": 174 + }, + { + "epoch": 0.32, + "grad_norm": 0.04624422639608383, + "learning_rate": 0.0001950101412644254, + "loss": 1.758, + "step": 175 + }, + { + "epoch": 0.32, + "grad_norm": 0.044666189700365067, + "learning_rate": 0.00019494999065721108, + "loss": 1.6933, + "step": 176 + }, + { + "epoch": 0.32, + "grad_norm": 0.05367857217788696, + "learning_rate": 0.0001948894890634798, + "loss": 1.7328, + "step": 177 + }, + { + "epoch": 0.32, + "grad_norm": 0.046923939138650894, + "learning_rate": 0.0001948286367068781, + "loss": 1.7367, + "step": 178 + }, + { + "epoch": 0.33, + "grad_norm": 0.04480034112930298, + "learning_rate": 0.00019476743381234926, + "loss": 1.7677, + "step": 179 + }, + { + "epoch": 0.33, + "grad_norm": 0.045380428433418274, + "learning_rate": 0.00019470588060613222, + "loss": 1.7439, + "step": 180 + }, + { + "epoch": 0.33, + "grad_norm": 0.04550057277083397, + "learning_rate": 0.00019464397731576094, + "loss": 1.6895, + "step": 181 + }, + { + "epoch": 0.33, + "grad_norm": 0.049537234008312225, + "learning_rate": 0.00019458172417006347, + "loss": 1.7274, + "step": 182 + }, + { + "epoch": 0.33, + "grad_norm": 0.04696514084935188, + "learning_rate": 0.0001945191213991611, + "loss": 1.7121, + "step": 183 + }, + { + "epoch": 0.34, + "grad_norm": 0.04783783480525017, + "learning_rate": 0.00019445616923446755, + "loss": 1.6942, + "step": 184 + }, + { + "epoch": 0.34, + "grad_norm": 0.04514686018228531, + "learning_rate": 0.00019439286790868802, + "loss": 1.7219, + "step": 185 + }, + { + "epoch": 0.34, + "grad_norm": 0.045743513852357864, + "learning_rate": 0.00019432921765581847, + "loss": 1.76, + "step": 186 + }, + { + "epoch": 0.34, + "grad_norm": 0.04406295716762543, + "learning_rate": 0.00019426521871114468, + "loss": 1.7531, + "step": 187 + }, + { + "epoch": 0.34, + "grad_norm": 0.04445353150367737, + "learning_rate": 0.00019420087131124131, + "loss": 1.7742, + "step": 188 + }, + { + "epoch": 0.34, + "grad_norm": 0.04396241530776024, + "learning_rate": 0.0001941361756939712, + "loss": 1.7701, + "step": 189 + }, + { + "epoch": 0.35, + "grad_norm": 0.04415050894021988, + "learning_rate": 0.0001940711320984843, + "loss": 1.7062, + "step": 190 + }, + { + "epoch": 0.35, + "grad_norm": 0.04672138765454292, + "learning_rate": 0.00019400574076521693, + "loss": 1.754, + "step": 191 + }, + { + "epoch": 0.35, + "grad_norm": 0.04417939484119415, + "learning_rate": 0.00019394000193589088, + "loss": 1.7357, + "step": 192 + }, + { + "epoch": 0.35, + "grad_norm": 0.04567494988441467, + "learning_rate": 0.00019387391585351234, + "loss": 1.752, + "step": 193 + }, + { + "epoch": 0.35, + "grad_norm": 0.045080311596393585, + "learning_rate": 0.00019380748276237123, + "loss": 1.736, + "step": 194 + }, + { + "epoch": 0.36, + "grad_norm": 0.04506627842783928, + "learning_rate": 0.0001937407029080402, + "loss": 1.6726, + "step": 195 + }, + { + "epoch": 0.36, + "grad_norm": 0.04523961618542671, + "learning_rate": 0.0001936735765373737, + "loss": 1.7621, + "step": 196 + }, + { + "epoch": 0.36, + "grad_norm": 0.04326867312192917, + "learning_rate": 0.00019360610389850712, + "loss": 1.7341, + "step": 197 + }, + { + "epoch": 0.36, + "grad_norm": 0.05188523977994919, + "learning_rate": 0.00019353828524085577, + "loss": 1.7277, + "step": 198 + }, + { + "epoch": 0.36, + "grad_norm": 0.04654062166810036, + "learning_rate": 0.00019347012081511415, + "loss": 1.6845, + "step": 199 + }, + { + "epoch": 0.36, + "grad_norm": 0.044841405004262924, + "learning_rate": 0.0001934016108732548, + "loss": 1.6611, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 0.0941338911652565, + "learning_rate": 0.00019333275566852756, + "loss": 1.6978, + "step": 201 + }, + { + "epoch": 0.37, + "grad_norm": 0.05048836022615433, + "learning_rate": 0.00019326355545545845, + "loss": 1.7056, + "step": 202 + }, + { + "epoch": 0.37, + "grad_norm": 0.046358656138181686, + "learning_rate": 0.00019319401048984892, + "loss": 1.649, + "step": 203 + }, + { + "epoch": 0.37, + "grad_norm": 0.04557095095515251, + "learning_rate": 0.00019312412102877473, + "loss": 1.6793, + "step": 204 + }, + { + "epoch": 0.37, + "grad_norm": 0.04551040008664131, + "learning_rate": 0.0001930538873305852, + "loss": 1.7339, + "step": 205 + }, + { + "epoch": 0.38, + "grad_norm": 0.044258005917072296, + "learning_rate": 0.000192983309654902, + "loss": 1.6627, + "step": 206 + }, + { + "epoch": 0.38, + "grad_norm": 0.0485963337123394, + "learning_rate": 0.00019291238826261843, + "loss": 1.715, + "step": 207 + }, + { + "epoch": 0.38, + "grad_norm": 0.047103844583034515, + "learning_rate": 0.00019284112341589832, + "loss": 1.6855, + "step": 208 + }, + { + "epoch": 0.38, + "grad_norm": 0.045252177864313126, + "learning_rate": 0.000192769515378175, + "loss": 1.7557, + "step": 209 + }, + { + "epoch": 0.38, + "grad_norm": 0.049794841557741165, + "learning_rate": 0.00019269756441415062, + "loss": 1.7116, + "step": 210 + }, + { + "epoch": 0.38, + "grad_norm": 0.04380947723984718, + "learning_rate": 0.00019262527078979478, + "loss": 1.7663, + "step": 211 + }, + { + "epoch": 0.39, + "grad_norm": 0.046488065272569656, + "learning_rate": 0.00019255263477234381, + "loss": 1.6724, + "step": 212 + }, + { + "epoch": 0.39, + "grad_norm": 0.0422043539583683, + "learning_rate": 0.00019247965663029976, + "loss": 1.7345, + "step": 213 + }, + { + "epoch": 0.39, + "grad_norm": 0.05002991482615471, + "learning_rate": 0.0001924063366334293, + "loss": 1.7468, + "step": 214 + }, + { + "epoch": 0.39, + "grad_norm": 0.04376322776079178, + "learning_rate": 0.0001923326750527628, + "loss": 1.7748, + "step": 215 + }, + { + "epoch": 0.39, + "grad_norm": 0.04664807394146919, + "learning_rate": 0.00019225867216059325, + "loss": 1.7156, + "step": 216 + }, + { + "epoch": 0.4, + "grad_norm": 0.047952812165021896, + "learning_rate": 0.0001921843282304754, + "loss": 1.7247, + "step": 217 + }, + { + "epoch": 0.4, + "grad_norm": 0.045118216425180435, + "learning_rate": 0.00019210964353722464, + "loss": 1.7354, + "step": 218 + }, + { + "epoch": 0.4, + "grad_norm": 0.054903436452150345, + "learning_rate": 0.00019203461835691594, + "loss": 1.7241, + "step": 219 + }, + { + "epoch": 0.4, + "grad_norm": 0.04747498407959938, + "learning_rate": 0.000191959252966883, + "loss": 1.7498, + "step": 220 + }, + { + "epoch": 0.4, + "grad_norm": 0.04605628177523613, + "learning_rate": 0.000191883547645717, + "loss": 1.6889, + "step": 221 + }, + { + "epoch": 0.4, + "grad_norm": 0.04835960268974304, + "learning_rate": 0.00019180750267326578, + "loss": 1.715, + "step": 222 + }, + { + "epoch": 0.41, + "grad_norm": 0.04828386381268501, + "learning_rate": 0.00019173111833063273, + "loss": 1.6931, + "step": 223 + }, + { + "epoch": 0.41, + "grad_norm": 0.04604095220565796, + "learning_rate": 0.0001916543949001756, + "loss": 1.6717, + "step": 224 + }, + { + "epoch": 0.41, + "grad_norm": 0.049674633890390396, + "learning_rate": 0.00019157733266550575, + "loss": 1.7746, + "step": 225 + }, + { + "epoch": 0.41, + "grad_norm": 0.04439341649413109, + "learning_rate": 0.00019149993191148687, + "loss": 1.6925, + "step": 226 + }, + { + "epoch": 0.41, + "grad_norm": 0.04741811007261276, + "learning_rate": 0.00019142219292423395, + "loss": 1.7219, + "step": 227 + }, + { + "epoch": 0.42, + "grad_norm": 0.049409981817007065, + "learning_rate": 0.00019134411599111242, + "loss": 1.7306, + "step": 228 + }, + { + "epoch": 0.42, + "grad_norm": 0.04618163779377937, + "learning_rate": 0.00019126570140073676, + "loss": 1.7271, + "step": 229 + }, + { + "epoch": 0.42, + "grad_norm": 0.04557076469063759, + "learning_rate": 0.0001911869494429698, + "loss": 1.7188, + "step": 230 + }, + { + "epoch": 0.42, + "grad_norm": 0.04645569249987602, + "learning_rate": 0.0001911078604089213, + "loss": 1.7191, + "step": 231 + }, + { + "epoch": 0.42, + "grad_norm": 0.04584849998354912, + "learning_rate": 0.0001910284345909471, + "loss": 1.7592, + "step": 232 + }, + { + "epoch": 0.42, + "grad_norm": 0.045582644641399384, + "learning_rate": 0.000190948672282648, + "loss": 1.6902, + "step": 233 + }, + { + "epoch": 0.43, + "grad_norm": 0.04627401754260063, + "learning_rate": 0.00019086857377886865, + "loss": 1.6937, + "step": 234 + }, + { + "epoch": 0.43, + "grad_norm": 0.04470285400748253, + "learning_rate": 0.00019078813937569643, + "loss": 1.6977, + "step": 235 + }, + { + "epoch": 0.43, + "grad_norm": 0.05287547782063484, + "learning_rate": 0.00019070736937046035, + "loss": 1.7539, + "step": 236 + }, + { + "epoch": 0.43, + "grad_norm": 0.04990493878722191, + "learning_rate": 0.00019062626406173006, + "loss": 1.7469, + "step": 237 + }, + { + "epoch": 0.43, + "grad_norm": 0.048645589500665665, + "learning_rate": 0.00019054482374931467, + "loss": 1.7037, + "step": 238 + }, + { + "epoch": 0.44, + "grad_norm": 0.04730357602238655, + "learning_rate": 0.0001904630487342616, + "loss": 1.7388, + "step": 239 + }, + { + "epoch": 0.44, + "grad_norm": 0.04754168912768364, + "learning_rate": 0.00019038093931885553, + "loss": 1.7805, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 0.04760801047086716, + "learning_rate": 0.00019029849580661727, + "loss": 1.7383, + "step": 241 + }, + { + "epoch": 0.44, + "grad_norm": 0.048467203974723816, + "learning_rate": 0.0001902157185023026, + "loss": 1.7078, + "step": 242 + }, + { + "epoch": 0.44, + "grad_norm": 0.0522041916847229, + "learning_rate": 0.00019013260771190126, + "loss": 1.7052, + "step": 243 + }, + { + "epoch": 0.44, + "grad_norm": 0.0501788929104805, + "learning_rate": 0.00019004916374263563, + "loss": 1.7818, + "step": 244 + }, + { + "epoch": 0.45, + "grad_norm": 0.04538620635867119, + "learning_rate": 0.00018996538690295979, + "loss": 1.6589, + "step": 245 + }, + { + "epoch": 0.45, + "grad_norm": 0.04511679336428642, + "learning_rate": 0.00018988127750255824, + "loss": 1.7179, + "step": 246 + }, + { + "epoch": 0.45, + "grad_norm": 0.04756203666329384, + "learning_rate": 0.0001897968358523448, + "loss": 1.7333, + "step": 247 + }, + { + "epoch": 0.45, + "grad_norm": 0.05278336629271507, + "learning_rate": 0.00018971206226446147, + "loss": 1.7431, + "step": 248 + }, + { + "epoch": 0.45, + "grad_norm": 0.05926801264286041, + "learning_rate": 0.00018962695705227728, + "loss": 1.7768, + "step": 249 + }, + { + "epoch": 0.46, + "grad_norm": 0.049290940165519714, + "learning_rate": 0.00018954152053038712, + "loss": 1.7119, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 0.04777907952666283, + "learning_rate": 0.0001894557530146106, + "loss": 1.7559, + "step": 251 + }, + { + "epoch": 0.46, + "grad_norm": 0.04726920276880264, + "learning_rate": 0.00018936965482199084, + "loss": 1.7861, + "step": 252 + }, + { + "epoch": 0.46, + "grad_norm": 0.04677857458591461, + "learning_rate": 0.0001892832262707933, + "loss": 1.7039, + "step": 253 + }, + { + "epoch": 0.46, + "grad_norm": 0.04724700003862381, + "learning_rate": 0.00018919646768050468, + "loss": 1.6704, + "step": 254 + }, + { + "epoch": 0.46, + "grad_norm": 0.04969072341918945, + "learning_rate": 0.00018910937937183166, + "loss": 1.7168, + "step": 255 + }, + { + "epoch": 0.47, + "grad_norm": 0.04533353075385094, + "learning_rate": 0.0001890219616666997, + "loss": 1.6751, + "step": 256 + }, + { + "epoch": 0.47, + "grad_norm": 0.04647386819124222, + "learning_rate": 0.0001889342148882519, + "loss": 1.7146, + "step": 257 + }, + { + "epoch": 0.47, + "grad_norm": 0.047208696603775024, + "learning_rate": 0.00018884613936084784, + "loss": 1.7378, + "step": 258 + }, + { + "epoch": 0.47, + "grad_norm": 0.04841624200344086, + "learning_rate": 0.0001887577354100623, + "loss": 1.7128, + "step": 259 + }, + { + "epoch": 0.47, + "grad_norm": 0.05073019117116928, + "learning_rate": 0.00018866900336268408, + "loss": 1.7206, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 0.051456011831760406, + "learning_rate": 0.00018857994354671482, + "loss": 1.755, + "step": 261 + }, + { + "epoch": 0.48, + "grad_norm": 0.04637736827135086, + "learning_rate": 0.0001884905562913678, + "loss": 1.7395, + "step": 262 + }, + { + "epoch": 0.48, + "grad_norm": 0.061346374452114105, + "learning_rate": 0.00018840084192706658, + "loss": 1.674, + "step": 263 + }, + { + "epoch": 0.48, + "grad_norm": 0.04413258284330368, + "learning_rate": 0.00018831080078544402, + "loss": 1.7288, + "step": 264 + }, + { + "epoch": 0.48, + "grad_norm": 0.0531301349401474, + "learning_rate": 0.0001882204331993409, + "loss": 1.7625, + "step": 265 + }, + { + "epoch": 0.48, + "grad_norm": 0.05146196484565735, + "learning_rate": 0.00018812973950280468, + "loss": 1.6815, + "step": 266 + }, + { + "epoch": 0.49, + "grad_norm": 0.047678787261247635, + "learning_rate": 0.0001880387200310883, + "loss": 1.7278, + "step": 267 + }, + { + "epoch": 0.49, + "grad_norm": 0.0556582510471344, + "learning_rate": 0.0001879473751206489, + "loss": 1.74, + "step": 268 + }, + { + "epoch": 0.49, + "grad_norm": 0.047515787184238434, + "learning_rate": 0.00018785570510914678, + "loss": 1.7207, + "step": 269 + }, + { + "epoch": 0.49, + "grad_norm": 0.04592055827379227, + "learning_rate": 0.0001877637103354438, + "loss": 1.6589, + "step": 270 + }, + { + "epoch": 0.49, + "grad_norm": 0.04531411454081535, + "learning_rate": 0.0001876713911396024, + "loss": 1.706, + "step": 271 + }, + { + "epoch": 0.5, + "grad_norm": 0.04682420939207077, + "learning_rate": 0.0001875787478628843, + "loss": 1.7297, + "step": 272 + }, + { + "epoch": 0.5, + "grad_norm": 0.04545978829264641, + "learning_rate": 0.00018748578084774913, + "loss": 1.6572, + "step": 273 + }, + { + "epoch": 0.5, + "grad_norm": 0.04849430173635483, + "learning_rate": 0.00018739249043785324, + "loss": 1.7442, + "step": 274 + }, + { + "epoch": 0.5, + "eval_loss": 1.726025938987732, + "eval_runtime": 76.0967, + "eval_samples_per_second": 65.706, + "eval_steps_per_second": 16.426, + "step": 274 + }, + { + "epoch": 0.5, + "grad_norm": 0.04745488613843918, + "learning_rate": 0.00018729887697804847, + "loss": 1.7398, + "step": 275 + }, + { + "epoch": 0.5, + "grad_norm": 0.05489857494831085, + "learning_rate": 0.00018720494081438078, + "loss": 1.701, + "step": 276 + }, + { + "epoch": 0.51, + "grad_norm": 0.04818108305335045, + "learning_rate": 0.00018711068229408903, + "loss": 1.7068, + "step": 277 + }, + { + "epoch": 0.51, + "grad_norm": 0.04530555009841919, + "learning_rate": 0.0001870161017656037, + "loss": 1.6966, + "step": 278 + }, + { + "epoch": 0.51, + "grad_norm": 0.045606572180986404, + "learning_rate": 0.00018692119957854558, + "loss": 1.7086, + "step": 279 + }, + { + "epoch": 0.51, + "grad_norm": 0.04626869410276413, + "learning_rate": 0.00018682597608372445, + "loss": 1.6981, + "step": 280 + }, + { + "epoch": 0.51, + "grad_norm": 0.04752146080136299, + "learning_rate": 0.0001867304316331379, + "loss": 1.692, + "step": 281 + }, + { + "epoch": 0.51, + "grad_norm": 0.046230729669332504, + "learning_rate": 0.0001866345665799698, + "loss": 1.7338, + "step": 282 + }, + { + "epoch": 0.52, + "grad_norm": 0.04928119108080864, + "learning_rate": 0.00018653838127858933, + "loss": 1.738, + "step": 283 + }, + { + "epoch": 0.52, + "grad_norm": 0.04641352593898773, + "learning_rate": 0.00018644187608454936, + "loss": 1.6792, + "step": 284 + }, + { + "epoch": 0.52, + "grad_norm": 0.04860611632466316, + "learning_rate": 0.00018634505135458525, + "loss": 1.663, + "step": 285 + }, + { + "epoch": 0.52, + "grad_norm": 0.046515002846717834, + "learning_rate": 0.00018624790744661355, + "loss": 1.7327, + "step": 286 + }, + { + "epoch": 0.52, + "grad_norm": 0.04668186604976654, + "learning_rate": 0.00018615044471973074, + "loss": 1.6987, + "step": 287 + }, + { + "epoch": 0.53, + "grad_norm": 0.047913163900375366, + "learning_rate": 0.00018605266353421176, + "loss": 1.7953, + "step": 288 + }, + { + "epoch": 0.53, + "grad_norm": 0.04924839362502098, + "learning_rate": 0.00018595456425150872, + "loss": 1.7891, + "step": 289 + }, + { + "epoch": 0.53, + "grad_norm": 0.049241986125707626, + "learning_rate": 0.00018585614723424962, + "loss": 1.7451, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 0.05132036283612251, + "learning_rate": 0.00018575741284623703, + "loss": 1.7598, + "step": 291 + }, + { + "epoch": 0.53, + "grad_norm": 0.04659922048449516, + "learning_rate": 0.00018565836145244662, + "loss": 1.7331, + "step": 292 + }, + { + "epoch": 0.53, + "grad_norm": 0.0466977022588253, + "learning_rate": 0.0001855589934190259, + "loss": 1.7171, + "step": 293 + }, + { + "epoch": 0.54, + "grad_norm": 0.049368374049663544, + "learning_rate": 0.00018545930911329287, + "loss": 1.6929, + "step": 294 + }, + { + "epoch": 0.54, + "grad_norm": 0.04552480950951576, + "learning_rate": 0.00018535930890373466, + "loss": 1.753, + "step": 295 + }, + { + "epoch": 0.54, + "grad_norm": 0.04755065590143204, + "learning_rate": 0.00018525899316000608, + "loss": 1.7472, + "step": 296 + }, + { + "epoch": 0.54, + "grad_norm": 0.050540413707494736, + "learning_rate": 0.0001851583622529284, + "loss": 1.7585, + "step": 297 + }, + { + "epoch": 0.54, + "grad_norm": 0.04644971713423729, + "learning_rate": 0.00018505741655448792, + "loss": 1.7531, + "step": 298 + }, + { + "epoch": 0.55, + "grad_norm": 0.05085503309965134, + "learning_rate": 0.00018495615643783446, + "loss": 1.6954, + "step": 299 + }, + { + "epoch": 0.55, + "grad_norm": 0.0480993427336216, + "learning_rate": 0.0001848545822772802, + "loss": 1.6976, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 0.0487300269305706, + "learning_rate": 0.00018475269444829818, + "loss": 1.7642, + "step": 301 + }, + { + "epoch": 0.55, + "grad_norm": 0.04805615171790123, + "learning_rate": 0.0001846504933275209, + "loss": 1.6666, + "step": 302 + }, + { + "epoch": 0.55, + "grad_norm": 0.045554857701063156, + "learning_rate": 0.00018454797929273902, + "loss": 1.7259, + "step": 303 + }, + { + "epoch": 0.55, + "grad_norm": 0.04570743814110756, + "learning_rate": 0.00018444515272289982, + "loss": 1.7067, + "step": 304 + }, + { + "epoch": 0.56, + "grad_norm": 0.047652073204517365, + "learning_rate": 0.00018434201399810594, + "loss": 1.8147, + "step": 305 + }, + { + "epoch": 0.56, + "grad_norm": 0.046781569719314575, + "learning_rate": 0.00018423856349961384, + "loss": 1.7509, + "step": 306 + }, + { + "epoch": 0.56, + "grad_norm": 0.04698612168431282, + "learning_rate": 0.00018413480160983254, + "loss": 1.7074, + "step": 307 + }, + { + "epoch": 0.56, + "grad_norm": 0.04796341061592102, + "learning_rate": 0.0001840307287123221, + "loss": 1.7444, + "step": 308 + }, + { + "epoch": 0.56, + "grad_norm": 0.047553375363349915, + "learning_rate": 0.00018392634519179225, + "loss": 1.7103, + "step": 309 + }, + { + "epoch": 0.57, + "grad_norm": 0.046323925256729126, + "learning_rate": 0.00018382165143410092, + "loss": 1.716, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 0.04571986570954323, + "learning_rate": 0.00018371664782625287, + "loss": 1.7035, + "step": 311 + }, + { + "epoch": 0.57, + "grad_norm": 0.05170504003763199, + "learning_rate": 0.0001836113347563982, + "loss": 1.7151, + "step": 312 + }, + { + "epoch": 0.57, + "grad_norm": 0.047869808971881866, + "learning_rate": 0.000183505712613831, + "loss": 1.7223, + "step": 313 + }, + { + "epoch": 0.57, + "grad_norm": 0.0482964813709259, + "learning_rate": 0.0001833997817889878, + "loss": 1.6805, + "step": 314 + }, + { + "epoch": 0.57, + "grad_norm": 0.0486602708697319, + "learning_rate": 0.00018329354267344625, + "loss": 1.7303, + "step": 315 + }, + { + "epoch": 0.58, + "grad_norm": 0.046554964035749435, + "learning_rate": 0.00018318699565992357, + "loss": 1.7745, + "step": 316 + }, + { + "epoch": 0.58, + "grad_norm": 0.047917045652866364, + "learning_rate": 0.00018308014114227513, + "loss": 1.718, + "step": 317 + }, + { + "epoch": 0.58, + "grad_norm": 0.0479004867374897, + "learning_rate": 0.00018297297951549304, + "loss": 1.7707, + "step": 318 + }, + { + "epoch": 0.58, + "grad_norm": 0.04681101068854332, + "learning_rate": 0.0001828655111757046, + "loss": 1.7646, + "step": 319 + }, + { + "epoch": 0.58, + "grad_norm": 0.05201521888375282, + "learning_rate": 0.00018275773652017097, + "loss": 1.7479, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 0.04852493852376938, + "learning_rate": 0.00018264965594728548, + "loss": 1.7463, + "step": 321 + }, + { + "epoch": 0.59, + "grad_norm": 0.046121757477521896, + "learning_rate": 0.00018254126985657246, + "loss": 1.7444, + "step": 322 + }, + { + "epoch": 0.59, + "grad_norm": 0.05163992941379547, + "learning_rate": 0.00018243257864868548, + "loss": 1.7134, + "step": 323 + }, + { + "epoch": 0.59, + "grad_norm": 0.06267976760864258, + "learning_rate": 0.00018232358272540604, + "loss": 1.6712, + "step": 324 + }, + { + "epoch": 0.59, + "grad_norm": 0.04854287579655647, + "learning_rate": 0.00018221428248964202, + "loss": 1.6932, + "step": 325 + }, + { + "epoch": 0.59, + "grad_norm": 0.046650100499391556, + "learning_rate": 0.00018210467834542615, + "loss": 1.768, + "step": 326 + }, + { + "epoch": 0.6, + "grad_norm": 0.04779491573572159, + "learning_rate": 0.00018199477069791474, + "loss": 1.7109, + "step": 327 + }, + { + "epoch": 0.6, + "grad_norm": 0.05170130729675293, + "learning_rate": 0.0001818845599533858, + "loss": 1.6926, + "step": 328 + }, + { + "epoch": 0.6, + "grad_norm": 0.04867775738239288, + "learning_rate": 0.00018177404651923787, + "loss": 1.6908, + "step": 329 + }, + { + "epoch": 0.6, + "grad_norm": 0.04707460105419159, + "learning_rate": 0.00018166323080398835, + "loss": 1.7461, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 0.048908475786447525, + "learning_rate": 0.00018155211321727212, + "loss": 1.7214, + "step": 331 + }, + { + "epoch": 0.61, + "grad_norm": 0.04802173003554344, + "learning_rate": 0.00018144069416983985, + "loss": 1.7528, + "step": 332 + }, + { + "epoch": 0.61, + "grad_norm": 0.04747573658823967, + "learning_rate": 0.00018132897407355657, + "loss": 1.6726, + "step": 333 + }, + { + "epoch": 0.61, + "grad_norm": 0.049620069563388824, + "learning_rate": 0.00018121695334140017, + "loss": 1.7215, + "step": 334 + }, + { + "epoch": 0.61, + "grad_norm": 0.047733817249536514, + "learning_rate": 0.00018110463238745988, + "loss": 1.7538, + "step": 335 + }, + { + "epoch": 0.61, + "grad_norm": 0.04856455698609352, + "learning_rate": 0.00018099201162693476, + "loss": 1.6833, + "step": 336 + }, + { + "epoch": 0.61, + "grad_norm": 0.04885758087038994, + "learning_rate": 0.00018087909147613193, + "loss": 1.7141, + "step": 337 + }, + { + "epoch": 0.62, + "grad_norm": 0.047947369515895844, + "learning_rate": 0.0001807658723524654, + "loss": 1.733, + "step": 338 + }, + { + "epoch": 0.62, + "grad_norm": 0.0499010868370533, + "learning_rate": 0.0001806523546744543, + "loss": 1.6825, + "step": 339 + }, + { + "epoch": 0.62, + "grad_norm": 0.048193834722042084, + "learning_rate": 0.0001805385388617213, + "loss": 1.7282, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 0.05272866412997246, + "learning_rate": 0.00018042442533499123, + "loss": 1.7599, + "step": 341 + }, + { + "epoch": 0.62, + "grad_norm": 0.047657158225774765, + "learning_rate": 0.00018031001451608943, + "loss": 1.7292, + "step": 342 + }, + { + "epoch": 0.63, + "grad_norm": 0.0498197004199028, + "learning_rate": 0.00018019530682794014, + "loss": 1.7417, + "step": 343 + }, + { + "epoch": 0.63, + "grad_norm": 0.04958554729819298, + "learning_rate": 0.00018008030269456505, + "loss": 1.7274, + "step": 344 + }, + { + "epoch": 0.63, + "grad_norm": 0.04730832576751709, + "learning_rate": 0.00017996500254108152, + "loss": 1.778, + "step": 345 + }, + { + "epoch": 0.63, + "grad_norm": 0.050828639417886734, + "learning_rate": 0.0001798494067937014, + "loss": 1.7285, + "step": 346 + }, + { + "epoch": 0.63, + "grad_norm": 0.046292368322610855, + "learning_rate": 0.00017973351587972905, + "loss": 1.7334, + "step": 347 + }, + { + "epoch": 0.63, + "grad_norm": 0.04758565500378609, + "learning_rate": 0.00017961733022755992, + "loss": 1.6814, + "step": 348 + }, + { + "epoch": 0.64, + "grad_norm": 0.050507742911577225, + "learning_rate": 0.00017950085026667903, + "loss": 1.6949, + "step": 349 + }, + { + "epoch": 0.64, + "grad_norm": 0.04801836982369423, + "learning_rate": 0.00017938407642765938, + "loss": 1.6594, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 0.04616666957736015, + "learning_rate": 0.00017926700914216016, + "loss": 1.6969, + "step": 351 + }, + { + "epoch": 0.64, + "grad_norm": 0.048213839530944824, + "learning_rate": 0.00017914964884292544, + "loss": 1.6908, + "step": 352 + }, + { + "epoch": 0.64, + "grad_norm": 0.04909725859761238, + "learning_rate": 0.00017903199596378227, + "loss": 1.7213, + "step": 353 + }, + { + "epoch": 0.65, + "grad_norm": 0.050252340734004974, + "learning_rate": 0.00017891405093963938, + "loss": 1.7094, + "step": 354 + }, + { + "epoch": 0.65, + "grad_norm": 0.05401075631380081, + "learning_rate": 0.00017879581420648534, + "loss": 1.7163, + "step": 355 + }, + { + "epoch": 0.65, + "grad_norm": 0.05027545616030693, + "learning_rate": 0.00017867728620138708, + "loss": 1.7362, + "step": 356 + }, + { + "epoch": 0.65, + "grad_norm": 0.047479428350925446, + "learning_rate": 0.00017855846736248822, + "loss": 1.6785, + "step": 357 + }, + { + "epoch": 0.65, + "grad_norm": 0.05026884377002716, + "learning_rate": 0.0001784393581290074, + "loss": 1.7221, + "step": 358 + }, + { + "epoch": 0.65, + "grad_norm": 0.04901432618498802, + "learning_rate": 0.00017831995894123683, + "loss": 1.6401, + "step": 359 + }, + { + "epoch": 0.66, + "grad_norm": 0.04764765873551369, + "learning_rate": 0.00017820027024054044, + "loss": 1.7361, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 0.046871528029441833, + "learning_rate": 0.0001780802924693524, + "loss": 1.7986, + "step": 361 + }, + { + "epoch": 0.66, + "grad_norm": 0.05453401803970337, + "learning_rate": 0.00017796002607117545, + "loss": 1.7447, + "step": 362 + }, + { + "epoch": 0.66, + "grad_norm": 0.04958674684166908, + "learning_rate": 0.00017783947149057925, + "loss": 1.7091, + "step": 363 + }, + { + "epoch": 0.66, + "grad_norm": 0.053141675889492035, + "learning_rate": 0.0001777186291731987, + "loss": 1.6866, + "step": 364 + }, + { + "epoch": 0.67, + "grad_norm": 0.047340743243694305, + "learning_rate": 0.00017759749956573238, + "loss": 1.7191, + "step": 365 + }, + { + "epoch": 0.67, + "grad_norm": 0.051203418523073196, + "learning_rate": 0.00017747608311594087, + "loss": 1.7238, + "step": 366 + }, + { + "epoch": 0.67, + "grad_norm": 0.047188933938741684, + "learning_rate": 0.00017735438027264495, + "loss": 1.762, + "step": 367 + }, + { + "epoch": 0.67, + "grad_norm": 0.056479763239622116, + "learning_rate": 0.00017723239148572422, + "loss": 1.6587, + "step": 368 + }, + { + "epoch": 0.67, + "grad_norm": 0.04922572523355484, + "learning_rate": 0.00017711011720611514, + "loss": 1.6988, + "step": 369 + }, + { + "epoch": 0.67, + "grad_norm": 0.046839334070682526, + "learning_rate": 0.00017698755788580963, + "loss": 1.7092, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 0.0491393506526947, + "learning_rate": 0.0001768647139778532, + "loss": 1.7313, + "step": 371 + }, + { + "epoch": 0.68, + "grad_norm": 0.04811710864305496, + "learning_rate": 0.0001767415859363434, + "loss": 1.8071, + "step": 372 + }, + { + "epoch": 0.68, + "grad_norm": 0.04601633548736572, + "learning_rate": 0.00017661817421642804, + "loss": 1.7594, + "step": 373 + }, + { + "epoch": 0.68, + "grad_norm": 0.05098440870642662, + "learning_rate": 0.00017649447927430362, + "loss": 1.6524, + "step": 374 + }, + { + "epoch": 0.68, + "grad_norm": 0.04978582262992859, + "learning_rate": 0.00017637050156721346, + "loss": 1.7448, + "step": 375 + }, + { + "epoch": 0.69, + "grad_norm": 0.05097389221191406, + "learning_rate": 0.00017624624155344626, + "loss": 1.7362, + "step": 376 + }, + { + "epoch": 0.69, + "grad_norm": 0.05258944630622864, + "learning_rate": 0.00017612169969233424, + "loss": 1.7033, + "step": 377 + }, + { + "epoch": 0.69, + "grad_norm": 0.05384654179215431, + "learning_rate": 0.0001759968764442515, + "loss": 1.6349, + "step": 378 + }, + { + "epoch": 0.69, + "grad_norm": 0.047803860157728195, + "learning_rate": 0.00017587177227061226, + "loss": 1.6655, + "step": 379 + }, + { + "epoch": 0.69, + "grad_norm": 0.04812454432249069, + "learning_rate": 0.00017574638763386916, + "loss": 1.7064, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 0.04860275238752365, + "learning_rate": 0.00017562072299751163, + "loss": 1.6648, + "step": 381 + }, + { + "epoch": 0.7, + "grad_norm": 0.049836620688438416, + "learning_rate": 0.00017549477882606418, + "loss": 1.6957, + "step": 382 + }, + { + "epoch": 0.7, + "grad_norm": 0.05114325135946274, + "learning_rate": 0.00017536855558508458, + "loss": 1.6257, + "step": 383 + }, + { + "epoch": 0.7, + "grad_norm": 0.054609425365924835, + "learning_rate": 0.00017524205374116214, + "loss": 1.6854, + "step": 384 + }, + { + "epoch": 0.7, + "grad_norm": 0.04757620766758919, + "learning_rate": 0.00017511527376191618, + "loss": 1.7425, + "step": 385 + }, + { + "epoch": 0.7, + "grad_norm": 0.05384545028209686, + "learning_rate": 0.00017498821611599397, + "loss": 1.712, + "step": 386 + }, + { + "epoch": 0.71, + "grad_norm": 0.04726232588291168, + "learning_rate": 0.00017486088127306932, + "loss": 1.701, + "step": 387 + }, + { + "epoch": 0.71, + "grad_norm": 0.04885297268629074, + "learning_rate": 0.0001747332697038407, + "loss": 1.7227, + "step": 388 + }, + { + "epoch": 0.71, + "grad_norm": 0.04793693870306015, + "learning_rate": 0.00017460538188002946, + "loss": 1.7058, + "step": 389 + }, + { + "epoch": 0.71, + "grad_norm": 0.04942973330616951, + "learning_rate": 0.0001744772182743782, + "loss": 1.7443, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 0.05246872082352638, + "learning_rate": 0.00017434877936064886, + "loss": 1.6807, + "step": 391 + }, + { + "epoch": 0.71, + "grad_norm": 0.04894121363759041, + "learning_rate": 0.0001742200656136212, + "loss": 1.7963, + "step": 392 + }, + { + "epoch": 0.72, + "grad_norm": 0.05082324892282486, + "learning_rate": 0.00017409107750909078, + "loss": 1.7024, + "step": 393 + }, + { + "epoch": 0.72, + "grad_norm": 0.04718152433633804, + "learning_rate": 0.00017396181552386741, + "loss": 1.711, + "step": 394 + }, + { + "epoch": 0.72, + "grad_norm": 0.05174902826547623, + "learning_rate": 0.00017383228013577331, + "loss": 1.7362, + "step": 395 + }, + { + "epoch": 0.72, + "grad_norm": 0.048003047704696655, + "learning_rate": 0.0001737024718236413, + "loss": 1.6944, + "step": 396 + }, + { + "epoch": 0.72, + "grad_norm": 0.0462164506316185, + "learning_rate": 0.00017357239106731317, + "loss": 1.7297, + "step": 397 + }, + { + "epoch": 0.73, + "grad_norm": 0.04808316007256508, + "learning_rate": 0.0001734420383476377, + "loss": 1.6971, + "step": 398 + }, + { + "epoch": 0.73, + "grad_norm": 0.05553476884961128, + "learning_rate": 0.00017331141414646904, + "loss": 1.7262, + "step": 399 + }, + { + "epoch": 0.73, + "grad_norm": 0.046341411769390106, + "learning_rate": 0.00017318051894666487, + "loss": 1.7135, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 0.048155754804611206, + "learning_rate": 0.00017304935323208466, + "loss": 1.7377, + "step": 401 + }, + { + "epoch": 0.73, + "grad_norm": 0.05066389963030815, + "learning_rate": 0.00017291791748758785, + "loss": 1.6516, + "step": 402 + }, + { + "epoch": 0.73, + "grad_norm": 0.05046610161662102, + "learning_rate": 0.000172786212199032, + "loss": 1.7536, + "step": 403 + }, + { + "epoch": 0.74, + "grad_norm": 0.0542440302670002, + "learning_rate": 0.00017265423785327107, + "loss": 1.7857, + "step": 404 + }, + { + "epoch": 0.74, + "grad_norm": 0.04833053797483444, + "learning_rate": 0.0001725219949381537, + "loss": 1.7594, + "step": 405 + }, + { + "epoch": 0.74, + "grad_norm": 0.047335654497146606, + "learning_rate": 0.00017238948394252115, + "loss": 1.7495, + "step": 406 + }, + { + "epoch": 0.74, + "grad_norm": 0.04961543157696724, + "learning_rate": 0.00017225670535620576, + "loss": 1.7201, + "step": 407 + }, + { + "epoch": 0.74, + "grad_norm": 0.04761854186654091, + "learning_rate": 0.00017212365967002893, + "loss": 1.7522, + "step": 408 + }, + { + "epoch": 0.75, + "grad_norm": 0.05010442063212395, + "learning_rate": 0.0001719903473757996, + "loss": 1.7535, + "step": 409 + }, + { + "epoch": 0.75, + "grad_norm": 0.049323149025440216, + "learning_rate": 0.000171856768966312, + "loss": 1.6984, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 0.08661342412233353, + "learning_rate": 0.0001717229249353442, + "loss": 1.7182, + "step": 411 + }, + { + "epoch": 0.75, + "eval_loss": 1.724851131439209, + "eval_runtime": 76.3068, + "eval_samples_per_second": 65.525, + "eval_steps_per_second": 16.381, + "step": 411 + }, + { + "epoch": 0.75, + "grad_norm": 0.05118868127465248, + "learning_rate": 0.00017158881577765612, + "loss": 1.683, + "step": 412 + }, + { + "epoch": 0.75, + "grad_norm": 0.053089968860149384, + "learning_rate": 0.00017145444198898776, + "loss": 1.7162, + "step": 413 + }, + { + "epoch": 0.75, + "grad_norm": 0.05191902816295624, + "learning_rate": 0.0001713198040660573, + "loss": 1.7223, + "step": 414 + }, + { + "epoch": 0.76, + "grad_norm": 0.05995416268706322, + "learning_rate": 0.00017118490250655932, + "loss": 1.7148, + "step": 415 + }, + { + "epoch": 0.76, + "grad_norm": 0.04749016463756561, + "learning_rate": 0.00017104973780916294, + "loss": 1.7364, + "step": 416 + }, + { + "epoch": 0.76, + "grad_norm": 0.047870930284261703, + "learning_rate": 0.00017091431047351, + "loss": 1.7607, + "step": 417 + }, + { + "epoch": 0.76, + "grad_norm": 0.04802364483475685, + "learning_rate": 0.00017077862100021318, + "loss": 1.6957, + "step": 418 + }, + { + "epoch": 0.76, + "grad_norm": 0.04796374961733818, + "learning_rate": 0.00017064266989085412, + "loss": 1.6972, + "step": 419 + }, + { + "epoch": 0.77, + "grad_norm": 0.048874564468860626, + "learning_rate": 0.00017050645764798164, + "loss": 1.736, + "step": 420 + }, + { + "epoch": 0.77, + "grad_norm": 0.052477337419986725, + "learning_rate": 0.00017036998477510992, + "loss": 1.7447, + "step": 421 + }, + { + "epoch": 0.77, + "grad_norm": 0.049993280321359634, + "learning_rate": 0.00017023325177671647, + "loss": 1.7635, + "step": 422 + }, + { + "epoch": 0.77, + "grad_norm": 0.09700744599103928, + "learning_rate": 0.00017009625915824037, + "loss": 1.7402, + "step": 423 + }, + { + "epoch": 0.77, + "grad_norm": 0.048865802586078644, + "learning_rate": 0.0001699590074260805, + "loss": 1.7229, + "step": 424 + }, + { + "epoch": 0.77, + "grad_norm": 0.04994821920990944, + "learning_rate": 0.00016982149708759343, + "loss": 1.672, + "step": 425 + }, + { + "epoch": 0.78, + "grad_norm": 0.05008814111351967, + "learning_rate": 0.00016968372865109176, + "loss": 1.7338, + "step": 426 + }, + { + "epoch": 0.78, + "grad_norm": 0.04830687865614891, + "learning_rate": 0.00016954570262584214, + "loss": 1.7177, + "step": 427 + }, + { + "epoch": 0.78, + "grad_norm": 0.04781452193856239, + "learning_rate": 0.0001694074195220634, + "loss": 1.7628, + "step": 428 + }, + { + "epoch": 0.78, + "grad_norm": 0.04739667847752571, + "learning_rate": 0.00016926887985092468, + "loss": 1.7107, + "step": 429 + }, + { + "epoch": 0.78, + "grad_norm": 0.0481286458671093, + "learning_rate": 0.00016913008412454357, + "loss": 1.7646, + "step": 430 + }, + { + "epoch": 0.79, + "grad_norm": 0.06283537298440933, + "learning_rate": 0.0001689910328559841, + "loss": 1.6896, + "step": 431 + }, + { + "epoch": 0.79, + "grad_norm": 0.04944480583071709, + "learning_rate": 0.00016885172655925495, + "loss": 1.6931, + "step": 432 + }, + { + "epoch": 0.79, + "grad_norm": 0.05051645264029503, + "learning_rate": 0.00016871216574930754, + "loss": 1.7752, + "step": 433 + }, + { + "epoch": 0.79, + "grad_norm": 0.05406402051448822, + "learning_rate": 0.0001685723509420341, + "loss": 1.7203, + "step": 434 + }, + { + "epoch": 0.79, + "grad_norm": 0.0995137020945549, + "learning_rate": 0.00016843228265426584, + "loss": 1.6454, + "step": 435 + }, + { + "epoch": 0.79, + "grad_norm": 0.05356389284133911, + "learning_rate": 0.00016829196140377085, + "loss": 1.7327, + "step": 436 + }, + { + "epoch": 0.8, + "grad_norm": 0.04902141913771629, + "learning_rate": 0.0001681513877092523, + "loss": 1.7262, + "step": 437 + }, + { + "epoch": 0.8, + "grad_norm": 0.047820378094911575, + "learning_rate": 0.00016801056209034672, + "loss": 1.7294, + "step": 438 + }, + { + "epoch": 0.8, + "grad_norm": 0.048359643667936325, + "learning_rate": 0.00016786948506762164, + "loss": 1.6959, + "step": 439 + }, + { + "epoch": 0.8, + "grad_norm": 0.04830753803253174, + "learning_rate": 0.00016772815716257412, + "loss": 1.7714, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 0.05318046733736992, + "learning_rate": 0.0001675865788976285, + "loss": 1.7325, + "step": 441 + }, + { + "epoch": 0.81, + "grad_norm": 0.04992082715034485, + "learning_rate": 0.0001674447507961346, + "loss": 1.7866, + "step": 442 + }, + { + "epoch": 0.81, + "grad_norm": 0.05253741890192032, + "learning_rate": 0.0001673026733823658, + "loss": 1.7273, + "step": 443 + }, + { + "epoch": 0.81, + "grad_norm": 0.05121272802352905, + "learning_rate": 0.00016716034718151706, + "loss": 1.7063, + "step": 444 + }, + { + "epoch": 0.81, + "grad_norm": 0.04715156927704811, + "learning_rate": 0.000167017772719703, + "loss": 1.7575, + "step": 445 + }, + { + "epoch": 0.81, + "grad_norm": 0.05717930197715759, + "learning_rate": 0.00016687495052395595, + "loss": 1.7835, + "step": 446 + }, + { + "epoch": 0.81, + "grad_norm": 0.04992460459470749, + "learning_rate": 0.00016673188112222394, + "loss": 1.7218, + "step": 447 + }, + { + "epoch": 0.82, + "grad_norm": 0.0481155663728714, + "learning_rate": 0.0001665885650433689, + "loss": 1.7269, + "step": 448 + }, + { + "epoch": 0.82, + "grad_norm": 0.0485762394964695, + "learning_rate": 0.00016644500281716456, + "loss": 1.6857, + "step": 449 + }, + { + "epoch": 0.82, + "grad_norm": 0.04729575663805008, + "learning_rate": 0.00016630119497429457, + "loss": 1.7208, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 0.051819782704114914, + "learning_rate": 0.00016615714204635043, + "loss": 1.7117, + "step": 451 + }, + { + "epoch": 0.82, + "grad_norm": 0.052782051265239716, + "learning_rate": 0.0001660128445658297, + "loss": 1.7811, + "step": 452 + }, + { + "epoch": 0.83, + "grad_norm": 0.05251288414001465, + "learning_rate": 0.00016586830306613393, + "loss": 1.7517, + "step": 453 + }, + { + "epoch": 0.83, + "grad_norm": 0.047806352376937866, + "learning_rate": 0.00016572351808156666, + "loss": 1.7132, + "step": 454 + }, + { + "epoch": 0.83, + "grad_norm": 0.05114049091935158, + "learning_rate": 0.0001655784901473315, + "loss": 1.7729, + "step": 455 + }, + { + "epoch": 0.83, + "grad_norm": 0.04811178147792816, + "learning_rate": 0.00016543321979953007, + "loss": 1.7855, + "step": 456 + }, + { + "epoch": 0.83, + "grad_norm": 0.05107167363166809, + "learning_rate": 0.00016528770757516027, + "loss": 1.7331, + "step": 457 + }, + { + "epoch": 0.84, + "grad_norm": 0.04712466895580292, + "learning_rate": 0.00016514195401211388, + "loss": 1.7048, + "step": 458 + }, + { + "epoch": 0.84, + "grad_norm": 0.05438878387212753, + "learning_rate": 0.0001649959596491749, + "loss": 1.753, + "step": 459 + }, + { + "epoch": 0.84, + "grad_norm": 0.04884348064661026, + "learning_rate": 0.00016484972502601753, + "loss": 1.6734, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 0.0536276139318943, + "learning_rate": 0.00016470325068320392, + "loss": 1.711, + "step": 461 + }, + { + "epoch": 0.84, + "grad_norm": 0.05346493422985077, + "learning_rate": 0.00016455653716218252, + "loss": 1.7366, + "step": 462 + }, + { + "epoch": 0.84, + "grad_norm": 0.05044522508978844, + "learning_rate": 0.0001644095850052858, + "loss": 1.7269, + "step": 463 + }, + { + "epoch": 0.85, + "grad_norm": 0.05273488536477089, + "learning_rate": 0.00016426239475572852, + "loss": 1.7586, + "step": 464 + }, + { + "epoch": 0.85, + "grad_norm": 0.053452517837285995, + "learning_rate": 0.0001641149669576053, + "loss": 1.7379, + "step": 465 + }, + { + "epoch": 0.85, + "grad_norm": 0.047611016780138016, + "learning_rate": 0.00016396730215588915, + "loss": 1.7471, + "step": 466 + }, + { + "epoch": 0.85, + "grad_norm": 0.05317235738039017, + "learning_rate": 0.00016381940089642893, + "loss": 1.6925, + "step": 467 + }, + { + "epoch": 0.85, + "grad_norm": 0.049223560839891434, + "learning_rate": 0.00016367126372594774, + "loss": 1.7229, + "step": 468 + }, + { + "epoch": 0.86, + "grad_norm": 0.047821756452322006, + "learning_rate": 0.0001635228911920407, + "loss": 1.7484, + "step": 469 + }, + { + "epoch": 0.86, + "grad_norm": 0.05013042315840721, + "learning_rate": 0.00016337428384317288, + "loss": 1.7435, + "step": 470 + }, + { + "epoch": 0.86, + "grad_norm": 0.04820725694298744, + "learning_rate": 0.00016322544222867742, + "loss": 1.7594, + "step": 471 + }, + { + "epoch": 0.86, + "grad_norm": 0.04791193827986717, + "learning_rate": 0.00016307636689875347, + "loss": 1.644, + "step": 472 + }, + { + "epoch": 0.86, + "grad_norm": 0.04905365779995918, + "learning_rate": 0.00016292705840446404, + "loss": 1.7144, + "step": 473 + }, + { + "epoch": 0.86, + "grad_norm": 0.04875028133392334, + "learning_rate": 0.00016277751729773407, + "loss": 1.712, + "step": 474 + }, + { + "epoch": 0.87, + "grad_norm": 0.05170164629817009, + "learning_rate": 0.0001626277441313484, + "loss": 1.7367, + "step": 475 + }, + { + "epoch": 0.87, + "grad_norm": 0.05205371975898743, + "learning_rate": 0.00016247773945894962, + "loss": 1.689, + "step": 476 + }, + { + "epoch": 0.87, + "grad_norm": 0.0485403798520565, + "learning_rate": 0.00016232750383503617, + "loss": 1.706, + "step": 477 + }, + { + "epoch": 0.87, + "grad_norm": 0.0538201630115509, + "learning_rate": 0.0001621770378149601, + "loss": 1.7284, + "step": 478 + }, + { + "epoch": 0.87, + "grad_norm": 0.04828377440571785, + "learning_rate": 0.00016202634195492524, + "loss": 1.661, + "step": 479 + }, + { + "epoch": 0.88, + "grad_norm": 0.050310611724853516, + "learning_rate": 0.000161875416811985, + "loss": 1.6852, + "step": 480 + }, + { + "epoch": 0.88, + "grad_norm": 0.050804853439331055, + "learning_rate": 0.00016172426294404032, + "loss": 1.7358, + "step": 481 + }, + { + "epoch": 0.88, + "grad_norm": 0.051962971687316895, + "learning_rate": 0.00016157288090983763, + "loss": 1.6692, + "step": 482 + }, + { + "epoch": 0.88, + "grad_norm": 0.05179814621806145, + "learning_rate": 0.0001614212712689668, + "loss": 1.6983, + "step": 483 + }, + { + "epoch": 0.88, + "grad_norm": 0.05398216098546982, + "learning_rate": 0.00016126943458185907, + "loss": 1.7261, + "step": 484 + }, + { + "epoch": 0.88, + "grad_norm": 0.049869704991579056, + "learning_rate": 0.00016111737140978494, + "loss": 1.6951, + "step": 485 + }, + { + "epoch": 0.89, + "grad_norm": 0.048107776790857315, + "learning_rate": 0.00016096508231485217, + "loss": 1.6941, + "step": 486 + }, + { + "epoch": 0.89, + "grad_norm": 0.05527656897902489, + "learning_rate": 0.00016081256786000357, + "loss": 1.7054, + "step": 487 + }, + { + "epoch": 0.89, + "grad_norm": 0.05169270187616348, + "learning_rate": 0.00016065982860901504, + "loss": 1.7307, + "step": 488 + }, + { + "epoch": 0.89, + "grad_norm": 0.04972197115421295, + "learning_rate": 0.00016050686512649354, + "loss": 1.6955, + "step": 489 + }, + { + "epoch": 0.89, + "grad_norm": 0.05033208429813385, + "learning_rate": 0.00016035367797787476, + "loss": 1.7013, + "step": 490 + }, + { + "epoch": 0.9, + "grad_norm": 0.05073223263025284, + "learning_rate": 0.00016020026772942125, + "loss": 1.6831, + "step": 491 + }, + { + "epoch": 0.9, + "grad_norm": 0.056367356330156326, + "learning_rate": 0.00016004663494822028, + "loss": 1.6654, + "step": 492 + }, + { + "epoch": 0.9, + "grad_norm": 0.049483008682727814, + "learning_rate": 0.0001598927802021817, + "loss": 1.7285, + "step": 493 + }, + { + "epoch": 0.9, + "grad_norm": 0.052070703357458115, + "learning_rate": 0.00015973870406003578, + "loss": 1.7948, + "step": 494 + }, + { + "epoch": 0.9, + "grad_norm": 0.05687413364648819, + "learning_rate": 0.0001595844070913314, + "loss": 1.7336, + "step": 495 + }, + { + "epoch": 0.9, + "grad_norm": 0.048987727612257004, + "learning_rate": 0.00015942988986643352, + "loss": 1.6661, + "step": 496 + }, + { + "epoch": 0.91, + "grad_norm": 0.05027730017900467, + "learning_rate": 0.00015927515295652143, + "loss": 1.7364, + "step": 497 + }, + { + "epoch": 0.91, + "grad_norm": 0.048406291753053665, + "learning_rate": 0.00015912019693358636, + "loss": 1.6419, + "step": 498 + }, + { + "epoch": 0.91, + "grad_norm": 0.05071192979812622, + "learning_rate": 0.00015896502237042963, + "loss": 1.6301, + "step": 499 + }, + { + "epoch": 0.91, + "grad_norm": 0.05111885070800781, + "learning_rate": 0.00015880962984066036, + "loss": 1.7112, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 0.06297910958528519, + "learning_rate": 0.0001586540199186933, + "loss": 1.7438, + "step": 501 + }, + { + "epoch": 0.92, + "grad_norm": 0.04950469359755516, + "learning_rate": 0.00015849819317974694, + "loss": 1.6837, + "step": 502 + }, + { + "epoch": 0.92, + "grad_norm": 0.04900701716542244, + "learning_rate": 0.0001583421501998412, + "loss": 1.7432, + "step": 503 + }, + { + "epoch": 0.92, + "grad_norm": 0.04949019104242325, + "learning_rate": 0.0001581858915557953, + "loss": 1.688, + "step": 504 + }, + { + "epoch": 0.92, + "grad_norm": 0.05047097057104111, + "learning_rate": 0.00015802941782522569, + "loss": 1.7256, + "step": 505 + }, + { + "epoch": 0.92, + "grad_norm": 0.04921870306134224, + "learning_rate": 0.0001578727295865439, + "loss": 1.7723, + "step": 506 + }, + { + "epoch": 0.92, + "grad_norm": 0.04841122031211853, + "learning_rate": 0.0001577158274189544, + "loss": 1.71, + "step": 507 + }, + { + "epoch": 0.93, + "grad_norm": 0.04886234924197197, + "learning_rate": 0.00015755871190245251, + "loss": 1.6622, + "step": 508 + }, + { + "epoch": 0.93, + "grad_norm": 0.04966573417186737, + "learning_rate": 0.00015740138361782207, + "loss": 1.7357, + "step": 509 + }, + { + "epoch": 0.93, + "grad_norm": 0.050070296972990036, + "learning_rate": 0.0001572438431466336, + "loss": 1.6803, + "step": 510 + }, + { + "epoch": 0.93, + "grad_norm": 0.054121073335409164, + "learning_rate": 0.00015708609107124177, + "loss": 1.7659, + "step": 511 + }, + { + "epoch": 0.93, + "grad_norm": 0.05084529519081116, + "learning_rate": 0.00015692812797478368, + "loss": 1.6943, + "step": 512 + }, + { + "epoch": 0.94, + "grad_norm": 0.056926507502794266, + "learning_rate": 0.0001567699544411763, + "loss": 1.6562, + "step": 513 + }, + { + "epoch": 0.94, + "grad_norm": 0.05053721368312836, + "learning_rate": 0.00015661157105511457, + "loss": 1.7624, + "step": 514 + }, + { + "epoch": 0.94, + "grad_norm": 0.048727016896009445, + "learning_rate": 0.00015645297840206915, + "loss": 1.7364, + "step": 515 + }, + { + "epoch": 0.94, + "grad_norm": 0.051376283168792725, + "learning_rate": 0.00015629417706828423, + "loss": 1.699, + "step": 516 + }, + { + "epoch": 0.94, + "grad_norm": 0.05029591917991638, + "learning_rate": 0.00015613516764077548, + "loss": 1.6972, + "step": 517 + }, + { + "epoch": 0.94, + "grad_norm": 0.053968969732522964, + "learning_rate": 0.00015597595070732765, + "loss": 1.7128, + "step": 518 + }, + { + "epoch": 0.95, + "grad_norm": 0.050694871693849564, + "learning_rate": 0.00015581652685649276, + "loss": 1.7681, + "step": 519 + }, + { + "epoch": 0.95, + "grad_norm": 0.052369993180036545, + "learning_rate": 0.00015565689667758746, + "loss": 1.7321, + "step": 520 + }, + { + "epoch": 0.95, + "grad_norm": 0.04850650206208229, + "learning_rate": 0.00015549706076069128, + "loss": 1.7162, + "step": 521 + }, + { + "epoch": 0.95, + "grad_norm": 0.04979635775089264, + "learning_rate": 0.00015533701969664424, + "loss": 1.7429, + "step": 522 + }, + { + "epoch": 0.95, + "grad_norm": 0.04920853301882744, + "learning_rate": 0.0001551767740770446, + "loss": 1.7103, + "step": 523 + }, + { + "epoch": 0.96, + "grad_norm": 0.05081456899642944, + "learning_rate": 0.0001550163244942469, + "loss": 1.7781, + "step": 524 + }, + { + "epoch": 0.96, + "grad_norm": 0.050754062831401825, + "learning_rate": 0.00015485567154135952, + "loss": 1.7496, + "step": 525 + }, + { + "epoch": 0.96, + "grad_norm": 0.050315603613853455, + "learning_rate": 0.00015469481581224272, + "loss": 1.7303, + "step": 526 + }, + { + "epoch": 0.96, + "grad_norm": 0.05050061643123627, + "learning_rate": 0.00015453375790150617, + "loss": 1.679, + "step": 527 + }, + { + "epoch": 0.96, + "grad_norm": 0.06212810054421425, + "learning_rate": 0.00015437249840450715, + "loss": 1.713, + "step": 528 + }, + { + "epoch": 0.96, + "grad_norm": 0.050966355949640274, + "learning_rate": 0.00015421103791734786, + "loss": 1.7551, + "step": 529 + }, + { + "epoch": 0.97, + "grad_norm": 0.04892159253358841, + "learning_rate": 0.00015404937703687363, + "loss": 1.6758, + "step": 530 + }, + { + "epoch": 0.97, + "grad_norm": 0.05551762133836746, + "learning_rate": 0.00015388751636067052, + "loss": 1.703, + "step": 531 + }, + { + "epoch": 0.97, + "grad_norm": 0.0516047477722168, + "learning_rate": 0.00015372545648706306, + "loss": 1.7407, + "step": 532 + }, + { + "epoch": 0.97, + "grad_norm": 0.05094458907842636, + "learning_rate": 0.0001535631980151123, + "loss": 1.6534, + "step": 533 + }, + { + "epoch": 0.97, + "grad_norm": 0.05045678839087486, + "learning_rate": 0.00015340074154461316, + "loss": 1.7335, + "step": 534 + }, + { + "epoch": 0.98, + "grad_norm": 0.05067756026983261, + "learning_rate": 0.00015323808767609277, + "loss": 1.7169, + "step": 535 + }, + { + "epoch": 0.98, + "grad_norm": 0.05005278438329697, + "learning_rate": 0.00015307523701080768, + "loss": 1.7778, + "step": 536 + }, + { + "epoch": 0.98, + "grad_norm": 0.04952746629714966, + "learning_rate": 0.0001529121901507421, + "loss": 1.7199, + "step": 537 + }, + { + "epoch": 0.98, + "grad_norm": 0.04711218178272247, + "learning_rate": 0.00015274894769860538, + "loss": 1.734, + "step": 538 + }, + { + "epoch": 0.98, + "grad_norm": 0.05313078686594963, + "learning_rate": 0.0001525855102578299, + "loss": 1.7733, + "step": 539 + }, + { + "epoch": 0.98, + "grad_norm": 0.04977120831608772, + "learning_rate": 0.0001524218784325688, + "loss": 1.731, + "step": 540 + }, + { + "epoch": 0.99, + "grad_norm": 0.05076899752020836, + "learning_rate": 0.00015225805282769383, + "loss": 1.7277, + "step": 541 + }, + { + "epoch": 0.99, + "grad_norm": 0.049164701253175735, + "learning_rate": 0.00015209403404879303, + "loss": 1.7032, + "step": 542 + }, + { + "epoch": 0.99, + "grad_norm": 0.0488349013030529, + "learning_rate": 0.00015192982270216854, + "loss": 1.765, + "step": 543 + }, + { + "epoch": 0.99, + "grad_norm": 0.04831582307815552, + "learning_rate": 0.0001517654193948343, + "loss": 1.7548, + "step": 544 + }, + { + "epoch": 0.99, + "grad_norm": 0.052940741181373596, + "learning_rate": 0.00015160082473451378, + "loss": 1.7209, + "step": 545 + }, + { + "epoch": 1.0, + "grad_norm": 0.056908875703811646, + "learning_rate": 0.00015143603932963795, + "loss": 1.6537, + "step": 546 + }, + { + "epoch": 1.0, + "grad_norm": 0.0509711354970932, + "learning_rate": 0.00015127106378934273, + "loss": 1.7151, + "step": 547 + }, + { + "epoch": 1.0, + "grad_norm": 0.04795239865779877, + "learning_rate": 0.000151105898723467, + "loss": 1.743, + "step": 548 + }, + { + "epoch": 1.0, + "eval_loss": 1.7236659526824951, + "eval_runtime": 76.6784, + "eval_samples_per_second": 65.207, + "eval_steps_per_second": 16.302, + "step": 548 + }, + { + "epoch": 1.0, + "grad_norm": 0.05828290060162544, + "learning_rate": 0.00015094054474255007, + "loss": 1.7014, + "step": 549 + }, + { + "epoch": 1.0, + "grad_norm": 0.04827438294887543, + "learning_rate": 0.00015077500245782978, + "loss": 1.7124, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 0.04962700232863426, + "learning_rate": 0.0001506092724812399, + "loss": 1.7496, + "step": 551 + }, + { + "epoch": 1.01, + "grad_norm": 0.05015181377530098, + "learning_rate": 0.00015044335542540804, + "loss": 1.6653, + "step": 552 + }, + { + "epoch": 1.01, + "grad_norm": 0.07125337421894073, + "learning_rate": 0.0001502772519036534, + "loss": 1.6938, + "step": 553 + }, + { + "epoch": 1.01, + "grad_norm": 0.05031266435980797, + "learning_rate": 0.0001501109625299844, + "loss": 1.7782, + "step": 554 + }, + { + "epoch": 1.01, + "grad_norm": 0.0487028993666172, + "learning_rate": 0.00014994448791909656, + "loss": 1.7202, + "step": 555 + }, + { + "epoch": 1.0, + "grad_norm": 0.06726840883493423, + "learning_rate": 0.00014977782868636999, + "loss": 1.7504, + "step": 556 + }, + { + "epoch": 1.0, + "grad_norm": 0.06244590878486633, + "learning_rate": 0.00014961098544786743, + "loss": 1.6834, + "step": 557 + }, + { + "epoch": 1.01, + "grad_norm": 0.04934772849082947, + "learning_rate": 0.00014944395882033167, + "loss": 1.6822, + "step": 558 + }, + { + "epoch": 1.01, + "grad_norm": 0.050311822444200516, + "learning_rate": 0.00014927674942118345, + "loss": 1.747, + "step": 559 + }, + { + "epoch": 1.01, + "grad_norm": 0.051862068474292755, + "learning_rate": 0.00014910935786851919, + "loss": 1.7355, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 0.049238841980695724, + "learning_rate": 0.00014894178478110857, + "loss": 1.6973, + "step": 561 + }, + { + "epoch": 1.01, + "grad_norm": 0.05033009499311447, + "learning_rate": 0.00014877403077839235, + "loss": 1.6718, + "step": 562 + }, + { + "epoch": 1.01, + "grad_norm": 0.04922296851873398, + "learning_rate": 0.00014860609648048004, + "loss": 1.7236, + "step": 563 + }, + { + "epoch": 1.02, + "grad_norm": 0.05257139354944229, + "learning_rate": 0.0001484379825081476, + "loss": 1.6868, + "step": 564 + }, + { + "epoch": 1.02, + "grad_norm": 0.05213212966918945, + "learning_rate": 0.0001482696894828353, + "loss": 1.726, + "step": 565 + }, + { + "epoch": 1.02, + "grad_norm": 0.053737424314022064, + "learning_rate": 0.00014810121802664512, + "loss": 1.7046, + "step": 566 + }, + { + "epoch": 1.02, + "grad_norm": 0.054125770926475525, + "learning_rate": 0.0001479325687623386, + "loss": 1.6106, + "step": 567 + }, + { + "epoch": 1.02, + "grad_norm": 0.051876723766326904, + "learning_rate": 0.00014776374231333477, + "loss": 1.7354, + "step": 568 + }, + { + "epoch": 1.03, + "grad_norm": 0.050595056265592575, + "learning_rate": 0.00014759473930370736, + "loss": 1.6947, + "step": 569 + }, + { + "epoch": 1.03, + "grad_norm": 0.06360866129398346, + "learning_rate": 0.00014742556035818297, + "loss": 1.7379, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 0.05476611480116844, + "learning_rate": 0.0001472562061021385, + "loss": 1.6392, + "step": 571 + }, + { + "epoch": 1.03, + "grad_norm": 0.051338374614715576, + "learning_rate": 0.0001470866771615988, + "loss": 1.687, + "step": 572 + }, + { + "epoch": 1.03, + "grad_norm": 0.05180288851261139, + "learning_rate": 0.00014691697416323454, + "loss": 1.6942, + "step": 573 + }, + { + "epoch": 1.03, + "grad_norm": 0.05175211653113365, + "learning_rate": 0.00014674709773435983, + "loss": 1.6648, + "step": 574 + }, + { + "epoch": 1.04, + "grad_norm": 0.055275119841098785, + "learning_rate": 0.00014657704850292976, + "loss": 1.7311, + "step": 575 + }, + { + "epoch": 1.04, + "grad_norm": 0.053508460521698, + "learning_rate": 0.00014640682709753832, + "loss": 1.7118, + "step": 576 + }, + { + "epoch": 1.04, + "grad_norm": 0.05283378064632416, + "learning_rate": 0.00014623643414741585, + "loss": 1.6675, + "step": 577 + }, + { + "epoch": 1.04, + "grad_norm": 0.05684136226773262, + "learning_rate": 0.00014606587028242682, + "loss": 1.709, + "step": 578 + }, + { + "epoch": 1.04, + "grad_norm": 0.0515415295958519, + "learning_rate": 0.0001458951361330676, + "loss": 1.653, + "step": 579 + }, + { + "epoch": 1.05, + "grad_norm": 0.052131347358226776, + "learning_rate": 0.00014572423233046386, + "loss": 1.6497, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 0.05229787901043892, + "learning_rate": 0.00014555315950636854, + "loss": 1.6209, + "step": 581 + }, + { + "epoch": 1.05, + "grad_norm": 0.058796849101781845, + "learning_rate": 0.00014538191829315927, + "loss": 1.6907, + "step": 582 + }, + { + "epoch": 1.05, + "grad_norm": 0.0535275973379612, + "learning_rate": 0.00014521050932383625, + "loss": 1.6765, + "step": 583 + }, + { + "epoch": 1.05, + "grad_norm": 0.06131954491138458, + "learning_rate": 0.00014503893323201966, + "loss": 1.6963, + "step": 584 + }, + { + "epoch": 1.05, + "grad_norm": 0.05318441987037659, + "learning_rate": 0.00014486719065194757, + "loss": 1.6693, + "step": 585 + }, + { + "epoch": 1.06, + "grad_norm": 0.053547151386737823, + "learning_rate": 0.00014469528221847344, + "loss": 1.6265, + "step": 586 + }, + { + "epoch": 1.06, + "grad_norm": 0.05694759264588356, + "learning_rate": 0.00014452320856706382, + "loss": 1.6998, + "step": 587 + }, + { + "epoch": 1.06, + "grad_norm": 0.053848620504140854, + "learning_rate": 0.00014435097033379596, + "loss": 1.7248, + "step": 588 + }, + { + "epoch": 1.06, + "grad_norm": 0.05272265151143074, + "learning_rate": 0.00014417856815535554, + "loss": 1.6973, + "step": 589 + }, + { + "epoch": 1.06, + "grad_norm": 0.05548195540904999, + "learning_rate": 0.00014400600266903423, + "loss": 1.6912, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 0.05391455814242363, + "learning_rate": 0.00014383327451272744, + "loss": 1.6507, + "step": 591 + }, + { + "epoch": 1.07, + "grad_norm": 0.05697217211127281, + "learning_rate": 0.00014366038432493181, + "loss": 1.7277, + "step": 592 + }, + { + "epoch": 1.07, + "grad_norm": 0.054713811725378036, + "learning_rate": 0.000143487332744743, + "loss": 1.7225, + "step": 593 + }, + { + "epoch": 1.07, + "grad_norm": 0.05515265092253685, + "learning_rate": 0.00014331412041185322, + "loss": 1.6838, + "step": 594 + }, + { + "epoch": 1.07, + "grad_norm": 0.054941218346357346, + "learning_rate": 0.00014314074796654896, + "loss": 1.6913, + "step": 595 + }, + { + "epoch": 1.07, + "grad_norm": 0.05448353663086891, + "learning_rate": 0.0001429672160497085, + "loss": 1.6685, + "step": 596 + }, + { + "epoch": 1.08, + "grad_norm": 0.058499112725257874, + "learning_rate": 0.0001427935253027997, + "loss": 1.6637, + "step": 597 + }, + { + "epoch": 1.08, + "grad_norm": 0.0628763735294342, + "learning_rate": 0.00014261967636787747, + "loss": 1.7139, + "step": 598 + }, + { + "epoch": 1.08, + "grad_norm": 0.05447819083929062, + "learning_rate": 0.00014244566988758152, + "loss": 1.6984, + "step": 599 + }, + { + "epoch": 1.08, + "grad_norm": 0.05434316396713257, + "learning_rate": 0.0001422715065051339, + "loss": 1.6688, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 0.052557747811079025, + "learning_rate": 0.00014209718686433663, + "loss": 1.7169, + "step": 601 + }, + { + "epoch": 1.09, + "grad_norm": 0.054510824382305145, + "learning_rate": 0.00014192271160956942, + "loss": 1.6186, + "step": 602 + }, + { + "epoch": 1.09, + "grad_norm": 0.0586363822221756, + "learning_rate": 0.00014174808138578713, + "loss": 1.7364, + "step": 603 + }, + { + "epoch": 1.09, + "grad_norm": 0.05653434619307518, + "learning_rate": 0.0001415732968385176, + "loss": 1.77, + "step": 604 + }, + { + "epoch": 1.09, + "grad_norm": 0.052821431308984756, + "learning_rate": 0.00014139835861385892, + "loss": 1.6599, + "step": 605 + }, + { + "epoch": 1.09, + "grad_norm": 0.054437246173620224, + "learning_rate": 0.00014122326735847748, + "loss": 1.7026, + "step": 606 + }, + { + "epoch": 1.09, + "grad_norm": 0.056837234646081924, + "learning_rate": 0.00014104802371960523, + "loss": 1.6475, + "step": 607 + }, + { + "epoch": 1.1, + "grad_norm": 0.06032341718673706, + "learning_rate": 0.0001408726283450374, + "loss": 1.7482, + "step": 608 + }, + { + "epoch": 1.1, + "grad_norm": 0.05582507699728012, + "learning_rate": 0.00014069708188313017, + "loss": 1.7046, + "step": 609 + }, + { + "epoch": 1.1, + "grad_norm": 0.05785200744867325, + "learning_rate": 0.00014052138498279828, + "loss": 1.7234, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 0.05540376156568527, + "learning_rate": 0.00014034553829351236, + "loss": 1.7157, + "step": 611 + }, + { + "epoch": 1.1, + "grad_norm": 0.05743914842605591, + "learning_rate": 0.00014016954246529696, + "loss": 1.7548, + "step": 612 + }, + { + "epoch": 1.11, + "grad_norm": 0.05496819317340851, + "learning_rate": 0.00013999339814872784, + "loss": 1.6913, + "step": 613 + }, + { + "epoch": 1.11, + "grad_norm": 0.05739595368504524, + "learning_rate": 0.00013981710599492964, + "loss": 1.7232, + "step": 614 + }, + { + "epoch": 1.11, + "grad_norm": 0.05653569847345352, + "learning_rate": 0.00013964066665557348, + "loss": 1.6953, + "step": 615 + }, + { + "epoch": 1.11, + "grad_norm": 0.05570907145738602, + "learning_rate": 0.00013946408078287462, + "loss": 1.6858, + "step": 616 + }, + { + "epoch": 1.11, + "grad_norm": 0.054925207048654556, + "learning_rate": 0.00013928734902958996, + "loss": 1.6248, + "step": 617 + }, + { + "epoch": 1.11, + "grad_norm": 0.05743985250592232, + "learning_rate": 0.0001391104720490156, + "loss": 1.6627, + "step": 618 + }, + { + "epoch": 1.12, + "grad_norm": 0.05516685172915459, + "learning_rate": 0.00013893345049498457, + "loss": 1.6714, + "step": 619 + }, + { + "epoch": 1.12, + "grad_norm": 0.05717911571264267, + "learning_rate": 0.0001387562850218642, + "loss": 1.7124, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 0.05529535561800003, + "learning_rate": 0.00013857897628455397, + "loss": 1.6451, + "step": 621 + }, + { + "epoch": 1.12, + "grad_norm": 0.05724070221185684, + "learning_rate": 0.00013840152493848284, + "loss": 1.7274, + "step": 622 + }, + { + "epoch": 1.12, + "grad_norm": 0.05622214823961258, + "learning_rate": 0.0001382239316396069, + "loss": 1.6506, + "step": 623 + }, + { + "epoch": 1.13, + "grad_norm": 0.05893300846219063, + "learning_rate": 0.00013804619704440714, + "loss": 1.7037, + "step": 624 + }, + { + "epoch": 1.13, + "grad_norm": 0.05549685284495354, + "learning_rate": 0.00013786832180988666, + "loss": 1.6894, + "step": 625 + }, + { + "epoch": 1.13, + "grad_norm": 0.05931728705763817, + "learning_rate": 0.00013769030659356853, + "loss": 1.7189, + "step": 626 + }, + { + "epoch": 1.13, + "grad_norm": 0.05465949699282646, + "learning_rate": 0.0001375121520534933, + "loss": 1.7016, + "step": 627 + }, + { + "epoch": 1.13, + "grad_norm": 0.056453317403793335, + "learning_rate": 0.00013733385884821648, + "loss": 1.6711, + "step": 628 + }, + { + "epoch": 1.13, + "grad_norm": 0.054540056735277176, + "learning_rate": 0.00013715542763680623, + "loss": 1.6638, + "step": 629 + }, + { + "epoch": 1.14, + "grad_norm": 0.05919068679213524, + "learning_rate": 0.00013697685907884072, + "loss": 1.7241, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 0.05730579420924187, + "learning_rate": 0.00013679815383440603, + "loss": 1.6946, + "step": 631 + }, + { + "epoch": 1.14, + "grad_norm": 0.05658195540308952, + "learning_rate": 0.00013661931256409325, + "loss": 1.7038, + "step": 632 + }, + { + "epoch": 1.14, + "grad_norm": 0.057528719305992126, + "learning_rate": 0.00013644033592899658, + "loss": 1.6853, + "step": 633 + }, + { + "epoch": 1.14, + "grad_norm": 0.062490735203027725, + "learning_rate": 0.00013626122459071033, + "loss": 1.6733, + "step": 634 + }, + { + "epoch": 1.15, + "grad_norm": 0.05776170268654823, + "learning_rate": 0.00013608197921132696, + "loss": 1.7351, + "step": 635 + }, + { + "epoch": 1.15, + "grad_norm": 0.06134483963251114, + "learning_rate": 0.00013590260045343432, + "loss": 1.6203, + "step": 636 + }, + { + "epoch": 1.15, + "grad_norm": 0.061270635575056076, + "learning_rate": 0.0001357230889801133, + "loss": 1.7268, + "step": 637 + }, + { + "epoch": 1.15, + "grad_norm": 0.056105442345142365, + "learning_rate": 0.00013554344545493535, + "loss": 1.7171, + "step": 638 + }, + { + "epoch": 1.15, + "grad_norm": 0.05647943168878555, + "learning_rate": 0.0001353636705419602, + "loss": 1.713, + "step": 639 + }, + { + "epoch": 1.15, + "grad_norm": 0.05758386850357056, + "learning_rate": 0.00013518376490573306, + "loss": 1.6991, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 0.05906842276453972, + "learning_rate": 0.0001350037292112825, + "loss": 1.6387, + "step": 641 + }, + { + "epoch": 1.16, + "grad_norm": 0.06219753623008728, + "learning_rate": 0.00013482356412411781, + "loss": 1.7145, + "step": 642 + }, + { + "epoch": 1.16, + "grad_norm": 0.05719519779086113, + "learning_rate": 0.00013464327031022659, + "loss": 1.7399, + "step": 643 + }, + { + "epoch": 1.16, + "grad_norm": 0.08058752119541168, + "learning_rate": 0.00013446284843607225, + "loss": 1.6275, + "step": 644 + }, + { + "epoch": 1.16, + "grad_norm": 0.06629724055528641, + "learning_rate": 0.00013428229916859167, + "loss": 1.6582, + "step": 645 + }, + { + "epoch": 1.17, + "grad_norm": 0.05791241303086281, + "learning_rate": 0.00013410162317519257, + "loss": 1.6599, + "step": 646 + }, + { + "epoch": 1.17, + "grad_norm": 0.06143872067332268, + "learning_rate": 0.0001339208211237511, + "loss": 1.6634, + "step": 647 + }, + { + "epoch": 1.17, + "grad_norm": 0.06067274510860443, + "learning_rate": 0.00013373989368260948, + "loss": 1.6869, + "step": 648 + }, + { + "epoch": 1.17, + "grad_norm": 0.06446303427219391, + "learning_rate": 0.00013355884152057334, + "loss": 1.6658, + "step": 649 + }, + { + "epoch": 1.17, + "grad_norm": 0.05910011753439903, + "learning_rate": 0.00013337766530690943, + "loss": 1.683, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 0.06423602253198624, + "learning_rate": 0.00013319636571134297, + "loss": 1.7058, + "step": 651 + }, + { + "epoch": 1.18, + "grad_norm": 0.05743340775370598, + "learning_rate": 0.00013301494340405535, + "loss": 1.6491, + "step": 652 + }, + { + "epoch": 1.18, + "grad_norm": 0.05755629763007164, + "learning_rate": 0.00013283339905568157, + "loss": 1.6606, + "step": 653 + }, + { + "epoch": 1.18, + "grad_norm": 0.05766105651855469, + "learning_rate": 0.00013265173333730764, + "loss": 1.6855, + "step": 654 + }, + { + "epoch": 1.18, + "grad_norm": 0.05892917141318321, + "learning_rate": 0.00013246994692046836, + "loss": 1.6398, + "step": 655 + }, + { + "epoch": 1.18, + "grad_norm": 0.05860791355371475, + "learning_rate": 0.00013228804047714463, + "loss": 1.7089, + "step": 656 + }, + { + "epoch": 1.19, + "grad_norm": 0.059190504252910614, + "learning_rate": 0.00013210601467976104, + "loss": 1.6703, + "step": 657 + }, + { + "epoch": 1.19, + "grad_norm": 0.05735331028699875, + "learning_rate": 0.0001319238702011834, + "loss": 1.73, + "step": 658 + }, + { + "epoch": 1.19, + "grad_norm": 0.05985163152217865, + "learning_rate": 0.0001317416077147162, + "loss": 1.6864, + "step": 659 + }, + { + "epoch": 1.19, + "grad_norm": 0.05826161056756973, + "learning_rate": 0.00013155922789410016, + "loss": 1.6419, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 0.059993255883455276, + "learning_rate": 0.00013137673141350972, + "loss": 1.7027, + "step": 661 + }, + { + "epoch": 1.19, + "grad_norm": 0.06040223315358162, + "learning_rate": 0.00013119411894755063, + "loss": 1.7584, + "step": 662 + }, + { + "epoch": 1.2, + "grad_norm": 0.056883446872234344, + "learning_rate": 0.00013101139117125722, + "loss": 1.6971, + "step": 663 + }, + { + "epoch": 1.2, + "grad_norm": 0.05828433483839035, + "learning_rate": 0.0001308285487600903, + "loss": 1.6797, + "step": 664 + }, + { + "epoch": 1.2, + "grad_norm": 0.0568573996424675, + "learning_rate": 0.0001306455923899342, + "loss": 1.6967, + "step": 665 + }, + { + "epoch": 1.2, + "grad_norm": 0.05763811990618706, + "learning_rate": 0.00013046252273709468, + "loss": 1.7189, + "step": 666 + }, + { + "epoch": 1.2, + "grad_norm": 0.05759183317422867, + "learning_rate": 0.00013027934047829616, + "loss": 1.7293, + "step": 667 + }, + { + "epoch": 1.21, + "grad_norm": 0.06087080016732216, + "learning_rate": 0.00013009604629067933, + "loss": 1.7287, + "step": 668 + }, + { + "epoch": 1.21, + "grad_norm": 0.05685460940003395, + "learning_rate": 0.00012991264085179864, + "loss": 1.6717, + "step": 669 + }, + { + "epoch": 1.21, + "grad_norm": 0.06102333217859268, + "learning_rate": 0.00012972912483961982, + "loss": 1.7911, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 0.05811255797743797, + "learning_rate": 0.00012954549893251724, + "loss": 1.7057, + "step": 671 + }, + { + "epoch": 1.21, + "grad_norm": 0.05935278907418251, + "learning_rate": 0.00012936176380927162, + "loss": 1.6678, + "step": 672 + }, + { + "epoch": 1.21, + "grad_norm": 0.06539764255285263, + "learning_rate": 0.00012917792014906733, + "loss": 1.6305, + "step": 673 + }, + { + "epoch": 1.22, + "grad_norm": 0.059705205261707306, + "learning_rate": 0.00012899396863148995, + "loss": 1.7273, + "step": 674 + }, + { + "epoch": 1.22, + "grad_norm": 0.05784007906913757, + "learning_rate": 0.00012880990993652377, + "loss": 1.6549, + "step": 675 + }, + { + "epoch": 1.22, + "grad_norm": 0.07344791293144226, + "learning_rate": 0.00012862574474454928, + "loss": 1.6809, + "step": 676 + }, + { + "epoch": 1.22, + "grad_norm": 0.06028100103139877, + "learning_rate": 0.00012844147373634066, + "loss": 1.6852, + "step": 677 + }, + { + "epoch": 1.22, + "grad_norm": 0.06096576154232025, + "learning_rate": 0.00012825709759306316, + "loss": 1.7256, + "step": 678 + }, + { + "epoch": 1.23, + "grad_norm": 0.060117993503808975, + "learning_rate": 0.00012807261699627077, + "loss": 1.7094, + "step": 679 + }, + { + "epoch": 1.23, + "grad_norm": 0.06428851187229156, + "learning_rate": 0.0001278880326279035, + "loss": 1.6538, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 0.060511935502290726, + "learning_rate": 0.00012770334517028505, + "loss": 1.6631, + "step": 681 + }, + { + "epoch": 1.23, + "grad_norm": 0.05897079408168793, + "learning_rate": 0.00012751855530612012, + "loss": 1.6732, + "step": 682 + }, + { + "epoch": 1.23, + "grad_norm": 0.05949567258358002, + "learning_rate": 0.00012733366371849201, + "loss": 1.6989, + "step": 683 + }, + { + "epoch": 1.23, + "grad_norm": 0.05985894054174423, + "learning_rate": 0.00012714867109086, + "loss": 1.6983, + "step": 684 + }, + { + "epoch": 1.24, + "grad_norm": 0.061160728335380554, + "learning_rate": 0.0001269635781070569, + "loss": 1.7075, + "step": 685 + }, + { + "epoch": 1.24, + "eval_loss": 1.7264653444290161, + "eval_runtime": 76.4445, + "eval_samples_per_second": 65.407, + "eval_steps_per_second": 16.352, + "step": 685 + }, + { + "epoch": 1.24, + "grad_norm": 0.0652250349521637, + "learning_rate": 0.00012677838545128647, + "loss": 1.6851, + "step": 686 + }, + { + "epoch": 1.24, + "grad_norm": 0.060404662042856216, + "learning_rate": 0.00012659309380812092, + "loss": 1.6539, + "step": 687 + }, + { + "epoch": 1.24, + "grad_norm": 0.05635406821966171, + "learning_rate": 0.0001264077038624984, + "loss": 1.678, + "step": 688 + }, + { + "epoch": 1.24, + "grad_norm": 0.06129194051027298, + "learning_rate": 0.00012622221629972043, + "loss": 1.6455, + "step": 689 + }, + { + "epoch": 1.25, + "grad_norm": 0.06195101514458656, + "learning_rate": 0.0001260366318054493, + "loss": 1.7009, + "step": 690 + }, + { + "epoch": 1.25, + "grad_norm": 0.06593389809131622, + "learning_rate": 0.0001258509510657057, + "loss": 1.6897, + "step": 691 + }, + { + "epoch": 1.25, + "grad_norm": 0.0664474368095398, + "learning_rate": 0.00012566517476686606, + "loss": 1.6847, + "step": 692 + }, + { + "epoch": 1.25, + "grad_norm": 0.06081750988960266, + "learning_rate": 0.00012547930359566007, + "loss": 1.6126, + "step": 693 + }, + { + "epoch": 1.25, + "grad_norm": 0.06048804894089699, + "learning_rate": 0.00012529333823916807, + "loss": 1.7086, + "step": 694 + }, + { + "epoch": 1.25, + "grad_norm": 0.06522712111473083, + "learning_rate": 0.00012510727938481865, + "loss": 1.6931, + "step": 695 + }, + { + "epoch": 1.26, + "grad_norm": 0.0614117830991745, + "learning_rate": 0.0001249211277203859, + "loss": 1.7362, + "step": 696 + }, + { + "epoch": 1.26, + "grad_norm": 0.05812584608793259, + "learning_rate": 0.00012473488393398706, + "loss": 1.7052, + "step": 697 + }, + { + "epoch": 1.26, + "grad_norm": 0.059068553149700165, + "learning_rate": 0.00012454854871407994, + "loss": 1.6872, + "step": 698 + }, + { + "epoch": 1.26, + "grad_norm": 0.06033525615930557, + "learning_rate": 0.0001243621227494602, + "loss": 1.6954, + "step": 699 + }, + { + "epoch": 1.26, + "grad_norm": 0.06032804027199745, + "learning_rate": 0.00012417560672925912, + "loss": 1.6571, + "step": 700 + }, + { + "epoch": 1.27, + "grad_norm": 0.06035863235592842, + "learning_rate": 0.00012398900134294073, + "loss": 1.6894, + "step": 701 + }, + { + "epoch": 1.27, + "grad_norm": 0.059223804622888565, + "learning_rate": 0.00012380230728029946, + "loss": 1.711, + "step": 702 + }, + { + "epoch": 1.27, + "grad_norm": 0.061139173805713654, + "learning_rate": 0.00012361552523145757, + "loss": 1.626, + "step": 703 + }, + { + "epoch": 1.27, + "grad_norm": 0.06459489464759827, + "learning_rate": 0.0001234286558868625, + "loss": 1.7467, + "step": 704 + }, + { + "epoch": 1.27, + "grad_norm": 0.06497075408697128, + "learning_rate": 0.00012324169993728438, + "loss": 1.7419, + "step": 705 + }, + { + "epoch": 1.27, + "grad_norm": 0.06115833297371864, + "learning_rate": 0.0001230546580738136, + "loss": 1.6781, + "step": 706 + }, + { + "epoch": 1.28, + "grad_norm": 0.06160350516438484, + "learning_rate": 0.00012286753098785796, + "loss": 1.6907, + "step": 707 + }, + { + "epoch": 1.28, + "grad_norm": 0.06168088689446449, + "learning_rate": 0.00012268031937114044, + "loss": 1.7265, + "step": 708 + }, + { + "epoch": 1.28, + "grad_norm": 0.06278149783611298, + "learning_rate": 0.00012249302391569638, + "loss": 1.7023, + "step": 709 + }, + { + "epoch": 1.28, + "grad_norm": 0.06181812658905983, + "learning_rate": 0.00012230564531387107, + "loss": 1.6897, + "step": 710 + }, + { + "epoch": 1.28, + "grad_norm": 0.05875727906823158, + "learning_rate": 0.00012211818425831718, + "loss": 1.644, + "step": 711 + }, + { + "epoch": 1.29, + "grad_norm": 0.061242878437042236, + "learning_rate": 0.00012193064144199218, + "loss": 1.7256, + "step": 712 + }, + { + "epoch": 1.29, + "grad_norm": 0.060726381838321686, + "learning_rate": 0.00012174301755815571, + "loss": 1.6871, + "step": 713 + }, + { + "epoch": 1.29, + "grad_norm": 0.06219150498509407, + "learning_rate": 0.00012155531330036712, + "loss": 1.7048, + "step": 714 + }, + { + "epoch": 1.29, + "grad_norm": 0.06084437295794487, + "learning_rate": 0.0001213675293624829, + "loss": 1.6888, + "step": 715 + }, + { + "epoch": 1.29, + "grad_norm": 0.06178005784749985, + "learning_rate": 0.00012117966643865398, + "loss": 1.6791, + "step": 716 + }, + { + "epoch": 1.29, + "grad_norm": 0.05991113558411598, + "learning_rate": 0.00012099172522332338, + "loss": 1.7318, + "step": 717 + }, + { + "epoch": 1.3, + "grad_norm": 0.06223401054739952, + "learning_rate": 0.00012080370641122345, + "loss": 1.6417, + "step": 718 + }, + { + "epoch": 1.3, + "grad_norm": 0.062392983585596085, + "learning_rate": 0.00012061561069737343, + "loss": 1.6411, + "step": 719 + }, + { + "epoch": 1.3, + "grad_norm": 0.060492224991321564, + "learning_rate": 0.00012042743877707678, + "loss": 1.6717, + "step": 720 + }, + { + "epoch": 1.3, + "grad_norm": 0.06418413668870926, + "learning_rate": 0.0001202391913459187, + "loss": 1.6751, + "step": 721 + }, + { + "epoch": 1.3, + "grad_norm": 0.060530129820108414, + "learning_rate": 0.0001200508690997635, + "loss": 1.7175, + "step": 722 + }, + { + "epoch": 1.31, + "grad_norm": 0.06409049779176712, + "learning_rate": 0.00011986247273475206, + "loss": 1.6953, + "step": 723 + }, + { + "epoch": 1.31, + "grad_norm": 0.05866590142250061, + "learning_rate": 0.0001196740029472992, + "loss": 1.6935, + "step": 724 + }, + { + "epoch": 1.31, + "grad_norm": 0.06476990133523941, + "learning_rate": 0.00011948546043409123, + "loss": 1.7017, + "step": 725 + }, + { + "epoch": 1.31, + "grad_norm": 0.06523357331752777, + "learning_rate": 0.00011929684589208326, + "loss": 1.7183, + "step": 726 + }, + { + "epoch": 1.31, + "grad_norm": 0.060969460755586624, + "learning_rate": 0.00011910816001849654, + "loss": 1.6887, + "step": 727 + }, + { + "epoch": 1.31, + "grad_norm": 0.11310483515262604, + "learning_rate": 0.00011891940351081625, + "loss": 1.6816, + "step": 728 + }, + { + "epoch": 1.32, + "grad_norm": 0.059255216270685196, + "learning_rate": 0.00011873057706678843, + "loss": 1.6554, + "step": 729 + }, + { + "epoch": 1.32, + "grad_norm": 0.062034714967012405, + "learning_rate": 0.00011854168138441775, + "loss": 1.668, + "step": 730 + }, + { + "epoch": 1.32, + "grad_norm": 0.06186864525079727, + "learning_rate": 0.00011835271716196486, + "loss": 1.6806, + "step": 731 + }, + { + "epoch": 1.32, + "grad_norm": 0.06105494871735573, + "learning_rate": 0.00011816368509794364, + "loss": 1.6615, + "step": 732 + }, + { + "epoch": 1.32, + "grad_norm": 0.06231169030070305, + "learning_rate": 0.00011797458589111894, + "loss": 1.6588, + "step": 733 + }, + { + "epoch": 1.33, + "grad_norm": 0.06832422316074371, + "learning_rate": 0.00011778542024050361, + "loss": 1.6758, + "step": 734 + }, + { + "epoch": 1.33, + "grad_norm": 0.06158846989274025, + "learning_rate": 0.00011759618884535624, + "loss": 1.7025, + "step": 735 + }, + { + "epoch": 1.33, + "grad_norm": 0.07147394865751266, + "learning_rate": 0.00011740689240517837, + "loss": 1.6691, + "step": 736 + }, + { + "epoch": 1.33, + "grad_norm": 0.06047786399722099, + "learning_rate": 0.00011721753161971212, + "loss": 1.6968, + "step": 737 + }, + { + "epoch": 1.33, + "grad_norm": 0.0623675100505352, + "learning_rate": 0.00011702810718893722, + "loss": 1.7372, + "step": 738 + }, + { + "epoch": 1.34, + "grad_norm": 0.06291418522596359, + "learning_rate": 0.00011683861981306893, + "loss": 1.7083, + "step": 739 + }, + { + "epoch": 1.34, + "grad_norm": 0.059522755444049835, + "learning_rate": 0.00011664907019255502, + "loss": 1.6533, + "step": 740 + }, + { + "epoch": 1.34, + "grad_norm": 0.060890860855579376, + "learning_rate": 0.00011645945902807341, + "loss": 1.6875, + "step": 741 + }, + { + "epoch": 1.34, + "grad_norm": 0.060426972806453705, + "learning_rate": 0.00011626978702052948, + "loss": 1.6463, + "step": 742 + }, + { + "epoch": 1.34, + "grad_norm": 0.062305621802806854, + "learning_rate": 0.00011608005487105362, + "loss": 1.6785, + "step": 743 + }, + { + "epoch": 1.34, + "grad_norm": 0.06419097632169724, + "learning_rate": 0.00011589026328099839, + "loss": 1.6679, + "step": 744 + }, + { + "epoch": 1.35, + "grad_norm": 0.06365741044282913, + "learning_rate": 0.00011570041295193622, + "loss": 1.6668, + "step": 745 + }, + { + "epoch": 1.35, + "grad_norm": 0.0642697736620903, + "learning_rate": 0.00011551050458565658, + "loss": 1.7095, + "step": 746 + }, + { + "epoch": 1.35, + "grad_norm": 0.06443499773740768, + "learning_rate": 0.00011532053888416343, + "loss": 1.6586, + "step": 747 + }, + { + "epoch": 1.35, + "grad_norm": 0.06351306289434433, + "learning_rate": 0.00011513051654967286, + "loss": 1.6776, + "step": 748 + }, + { + "epoch": 1.35, + "grad_norm": 0.06554794311523438, + "learning_rate": 0.00011494043828461007, + "loss": 1.7105, + "step": 749 + }, + { + "epoch": 1.36, + "grad_norm": 0.10256826132535934, + "learning_rate": 0.00011475030479160725, + "loss": 1.7046, + "step": 750 + }, + { + "epoch": 1.36, + "grad_norm": 0.06379935145378113, + "learning_rate": 0.00011456011677350051, + "loss": 1.711, + "step": 751 + }, + { + "epoch": 1.36, + "grad_norm": 0.06044677272439003, + "learning_rate": 0.00011436987493332767, + "loss": 1.7186, + "step": 752 + }, + { + "epoch": 1.36, + "grad_norm": 0.06297197192907333, + "learning_rate": 0.00011417957997432546, + "loss": 1.6453, + "step": 753 + }, + { + "epoch": 1.36, + "grad_norm": 0.06677673757076263, + "learning_rate": 0.00011398923259992697, + "loss": 1.6443, + "step": 754 + }, + { + "epoch": 1.36, + "grad_norm": 0.062335170805454254, + "learning_rate": 0.00011379883351375901, + "loss": 1.6738, + "step": 755 + }, + { + "epoch": 1.37, + "grad_norm": 0.06286536902189255, + "learning_rate": 0.00011360838341963964, + "loss": 1.7081, + "step": 756 + }, + { + "epoch": 1.37, + "grad_norm": 0.07303211838006973, + "learning_rate": 0.00011341788302157536, + "loss": 1.6906, + "step": 757 + }, + { + "epoch": 1.37, + "grad_norm": 0.06304056942462921, + "learning_rate": 0.00011322733302375863, + "loss": 1.6783, + "step": 758 + }, + { + "epoch": 1.37, + "grad_norm": 0.07291906327009201, + "learning_rate": 0.00011303673413056541, + "loss": 1.7162, + "step": 759 + }, + { + "epoch": 1.37, + "grad_norm": 0.061802685260772705, + "learning_rate": 0.00011284608704655215, + "loss": 1.7375, + "step": 760 + }, + { + "epoch": 1.38, + "grad_norm": 0.06205203756690025, + "learning_rate": 0.00011265539247645373, + "loss": 1.6617, + "step": 761 + }, + { + "epoch": 1.38, + "grad_norm": 0.06457790732383728, + "learning_rate": 0.0001124646511251803, + "loss": 1.6395, + "step": 762 + }, + { + "epoch": 1.38, + "grad_norm": 0.06102142482995987, + "learning_rate": 0.00011227386369781508, + "loss": 1.7031, + "step": 763 + }, + { + "epoch": 1.38, + "grad_norm": 0.062267519533634186, + "learning_rate": 0.00011208303089961161, + "loss": 1.6889, + "step": 764 + }, + { + "epoch": 1.38, + "grad_norm": 0.06354745477437973, + "learning_rate": 0.00011189215343599109, + "loss": 1.7099, + "step": 765 + }, + { + "epoch": 1.38, + "grad_norm": 0.06255058199167252, + "learning_rate": 0.00011170123201253986, + "loss": 1.7092, + "step": 766 + }, + { + "epoch": 1.39, + "grad_norm": 0.06354597955942154, + "learning_rate": 0.00011151026733500677, + "loss": 1.6462, + "step": 767 + }, + { + "epoch": 1.39, + "grad_norm": 0.06314928829669952, + "learning_rate": 0.00011131926010930058, + "loss": 1.6377, + "step": 768 + }, + { + "epoch": 1.39, + "grad_norm": 0.06911808252334595, + "learning_rate": 0.00011112821104148723, + "loss": 1.6787, + "step": 769 + }, + { + "epoch": 1.39, + "grad_norm": 0.06356338411569595, + "learning_rate": 0.00011093712083778746, + "loss": 1.6657, + "step": 770 + }, + { + "epoch": 1.39, + "grad_norm": 0.06266220659017563, + "learning_rate": 0.00011074599020457395, + "loss": 1.7108, + "step": 771 + }, + { + "epoch": 1.4, + "grad_norm": 0.06397093832492828, + "learning_rate": 0.00011055481984836893, + "loss": 1.715, + "step": 772 + }, + { + "epoch": 1.4, + "grad_norm": 0.06519615650177002, + "learning_rate": 0.00011036361047584143, + "loss": 1.6625, + "step": 773 + }, + { + "epoch": 1.4, + "grad_norm": 0.06543872505426407, + "learning_rate": 0.00011017236279380467, + "loss": 1.6611, + "step": 774 + }, + { + "epoch": 1.4, + "grad_norm": 0.06356982886791229, + "learning_rate": 0.00010998107750921354, + "loss": 1.6366, + "step": 775 + }, + { + "epoch": 1.4, + "grad_norm": 0.06404688209295273, + "learning_rate": 0.00010978975532916189, + "loss": 1.689, + "step": 776 + }, + { + "epoch": 1.4, + "grad_norm": 0.06206212565302849, + "learning_rate": 0.00010959839696088001, + "loss": 1.6853, + "step": 777 + }, + { + "epoch": 1.41, + "grad_norm": 0.0640236884355545, + "learning_rate": 0.00010940700311173184, + "loss": 1.6874, + "step": 778 + }, + { + "epoch": 1.41, + "grad_norm": 0.06289862096309662, + "learning_rate": 0.00010921557448921267, + "loss": 1.7187, + "step": 779 + }, + { + "epoch": 1.41, + "grad_norm": 0.06534165889024734, + "learning_rate": 0.00010902411180094607, + "loss": 1.6285, + "step": 780 + }, + { + "epoch": 1.41, + "grad_norm": 0.06411545723676682, + "learning_rate": 0.00010883261575468184, + "loss": 1.6932, + "step": 781 + }, + { + "epoch": 1.41, + "grad_norm": 0.06283684074878693, + "learning_rate": 0.00010864108705829282, + "loss": 1.7544, + "step": 782 + }, + { + "epoch": 1.42, + "grad_norm": 0.06294089555740356, + "learning_rate": 0.00010844952641977273, + "loss": 1.695, + "step": 783 + }, + { + "epoch": 1.42, + "grad_norm": 0.06469050794839859, + "learning_rate": 0.00010825793454723325, + "loss": 1.654, + "step": 784 + }, + { + "epoch": 1.42, + "grad_norm": 0.06504753977060318, + "learning_rate": 0.00010806631214890155, + "loss": 1.6641, + "step": 785 + }, + { + "epoch": 1.42, + "grad_norm": 0.06289339065551758, + "learning_rate": 0.00010787465993311768, + "loss": 1.7246, + "step": 786 + }, + { + "epoch": 1.42, + "grad_norm": 0.07044830918312073, + "learning_rate": 0.00010768297860833185, + "loss": 1.6784, + "step": 787 + }, + { + "epoch": 1.42, + "grad_norm": 0.06241421401500702, + "learning_rate": 0.00010749126888310197, + "loss": 1.7413, + "step": 788 + }, + { + "epoch": 1.43, + "grad_norm": 0.061875198036432266, + "learning_rate": 0.00010729953146609076, + "loss": 1.6837, + "step": 789 + }, + { + "epoch": 1.43, + "grad_norm": 0.06335246562957764, + "learning_rate": 0.00010710776706606349, + "loss": 1.6713, + "step": 790 + }, + { + "epoch": 1.43, + "grad_norm": 0.06218186393380165, + "learning_rate": 0.00010691597639188507, + "loss": 1.6563, + "step": 791 + }, + { + "epoch": 1.43, + "grad_norm": 0.06283168494701385, + "learning_rate": 0.00010672416015251757, + "loss": 1.6672, + "step": 792 + }, + { + "epoch": 1.43, + "grad_norm": 0.06283591687679291, + "learning_rate": 0.00010653231905701748, + "loss": 1.6719, + "step": 793 + }, + { + "epoch": 1.44, + "grad_norm": 0.0629267543554306, + "learning_rate": 0.00010634045381453337, + "loss": 1.6764, + "step": 794 + }, + { + "epoch": 1.44, + "grad_norm": 0.06264865398406982, + "learning_rate": 0.00010614856513430284, + "loss": 1.6874, + "step": 795 + }, + { + "epoch": 1.44, + "grad_norm": 0.06411181390285492, + "learning_rate": 0.00010595665372565027, + "loss": 1.7095, + "step": 796 + }, + { + "epoch": 1.44, + "grad_norm": 0.06262548267841339, + "learning_rate": 0.00010576472029798399, + "loss": 1.6898, + "step": 797 + }, + { + "epoch": 1.44, + "grad_norm": 0.06278496235609055, + "learning_rate": 0.00010557276556079378, + "loss": 1.6055, + "step": 798 + }, + { + "epoch": 1.44, + "grad_norm": 0.06674374639987946, + "learning_rate": 0.00010538079022364819, + "loss": 1.7226, + "step": 799 + }, + { + "epoch": 1.45, + "grad_norm": 0.06753117591142654, + "learning_rate": 0.00010518879499619181, + "loss": 1.7008, + "step": 800 + }, + { + "epoch": 1.45, + "grad_norm": 0.07137101143598557, + "learning_rate": 0.0001049967805881429, + "loss": 1.6945, + "step": 801 + }, + { + "epoch": 1.45, + "grad_norm": 0.06417196989059448, + "learning_rate": 0.00010480474770929054, + "loss": 1.6662, + "step": 802 + }, + { + "epoch": 1.45, + "grad_norm": 0.064505934715271, + "learning_rate": 0.00010461269706949213, + "loss": 1.6914, + "step": 803 + }, + { + "epoch": 1.45, + "grad_norm": 0.06325452029705048, + "learning_rate": 0.00010442062937867063, + "loss": 1.6703, + "step": 804 + }, + { + "epoch": 1.46, + "grad_norm": 0.0945320799946785, + "learning_rate": 0.00010422854534681219, + "loss": 1.6595, + "step": 805 + }, + { + "epoch": 1.46, + "grad_norm": 0.07015063613653183, + "learning_rate": 0.00010403644568396322, + "loss": 1.7153, + "step": 806 + }, + { + "epoch": 1.46, + "grad_norm": 0.06436234712600708, + "learning_rate": 0.000103844331100228, + "loss": 1.6767, + "step": 807 + }, + { + "epoch": 1.46, + "grad_norm": 0.06437043845653534, + "learning_rate": 0.0001036522023057659, + "loss": 1.7026, + "step": 808 + }, + { + "epoch": 1.46, + "grad_norm": 0.06160353124141693, + "learning_rate": 0.00010346006001078885, + "loss": 1.7112, + "step": 809 + }, + { + "epoch": 1.46, + "grad_norm": 0.06519316881895065, + "learning_rate": 0.00010326790492555876, + "loss": 1.6611, + "step": 810 + }, + { + "epoch": 1.47, + "grad_norm": 0.06452979147434235, + "learning_rate": 0.00010307573776038462, + "loss": 1.6291, + "step": 811 + }, + { + "epoch": 1.47, + "grad_norm": 0.06813566386699677, + "learning_rate": 0.00010288355922562034, + "loss": 1.6432, + "step": 812 + }, + { + "epoch": 1.47, + "grad_norm": 0.06800167262554169, + "learning_rate": 0.0001026913700316616, + "loss": 1.6739, + "step": 813 + }, + { + "epoch": 1.47, + "grad_norm": 0.062173567712306976, + "learning_rate": 0.0001024991708889437, + "loss": 1.7207, + "step": 814 + }, + { + "epoch": 1.47, + "grad_norm": 0.06301440298557281, + "learning_rate": 0.00010230696250793856, + "loss": 1.6348, + "step": 815 + }, + { + "epoch": 1.48, + "grad_norm": 0.06262702494859695, + "learning_rate": 0.00010211474559915233, + "loss": 1.6982, + "step": 816 + }, + { + "epoch": 1.48, + "grad_norm": 0.06448613107204437, + "learning_rate": 0.00010192252087312265, + "loss": 1.7004, + "step": 817 + }, + { + "epoch": 1.48, + "grad_norm": 0.06269077211618423, + "learning_rate": 0.00010173028904041606, + "loss": 1.6981, + "step": 818 + }, + { + "epoch": 1.48, + "grad_norm": 0.06326784938573837, + "learning_rate": 0.00010153805081162539, + "loss": 1.718, + "step": 819 + }, + { + "epoch": 1.48, + "grad_norm": 0.06502313911914825, + "learning_rate": 0.0001013458068973671, + "loss": 1.6669, + "step": 820 + }, + { + "epoch": 1.48, + "grad_norm": 0.06869412958621979, + "learning_rate": 0.0001011535580082787, + "loss": 1.6237, + "step": 821 + }, + { + "epoch": 1.49, + "grad_norm": 0.0637192502617836, + "learning_rate": 0.00010096130485501598, + "loss": 1.7264, + "step": 822 + }, + { + "epoch": 1.49, + "eval_loss": 1.7267118692398071, + "eval_runtime": 76.2251, + "eval_samples_per_second": 65.595, + "eval_steps_per_second": 16.399, + "step": 822 + }, + { + "epoch": 1.49, + "grad_norm": 0.06338479369878769, + "learning_rate": 0.00010076904814825066, + "loss": 1.66, + "step": 823 + }, + { + "epoch": 1.49, + "grad_norm": 0.0718810185790062, + "learning_rate": 0.0001005767885986674, + "loss": 1.7044, + "step": 824 + }, + { + "epoch": 1.49, + "grad_norm": 0.06428621709346771, + "learning_rate": 0.00010038452691696161, + "loss": 1.6375, + "step": 825 + }, + { + "epoch": 1.49, + "grad_norm": 0.06198599189519882, + "learning_rate": 0.00010019226381383633, + "loss": 1.644, + "step": 826 + }, + { + "epoch": 1.5, + "grad_norm": 0.0649799108505249, + "learning_rate": 0.0001, + "loss": 1.6751, + "step": 827 + }, + { + "epoch": 1.5, + "grad_norm": 0.06546121090650558, + "learning_rate": 9.980773618616371e-05, + "loss": 1.6728, + "step": 828 + }, + { + "epoch": 1.5, + "grad_norm": 0.0744151845574379, + "learning_rate": 9.961547308303844e-05, + "loss": 1.7465, + "step": 829 + }, + { + "epoch": 1.5, + "grad_norm": 0.06264037638902664, + "learning_rate": 9.942321140133261e-05, + "loss": 1.6005, + "step": 830 + }, + { + "epoch": 1.5, + "grad_norm": 0.06265675276517868, + "learning_rate": 9.923095185174938e-05, + "loss": 1.7181, + "step": 831 + }, + { + "epoch": 1.5, + "grad_norm": 0.06809694319963455, + "learning_rate": 9.903869514498402e-05, + "loss": 1.6345, + "step": 832 + }, + { + "epoch": 1.51, + "grad_norm": 0.06538775563240051, + "learning_rate": 9.884644199172135e-05, + "loss": 1.7251, + "step": 833 + }, + { + "epoch": 1.51, + "grad_norm": 0.06529638916254044, + "learning_rate": 9.865419310263292e-05, + "loss": 1.6418, + "step": 834 + }, + { + "epoch": 1.51, + "grad_norm": 0.08285729587078094, + "learning_rate": 9.846194918837462e-05, + "loss": 1.6837, + "step": 835 + }, + { + "epoch": 1.51, + "grad_norm": 0.06490971148014069, + "learning_rate": 9.826971095958395e-05, + "loss": 1.6723, + "step": 836 + }, + { + "epoch": 1.51, + "grad_norm": 0.06375712156295776, + "learning_rate": 9.807747912687739e-05, + "loss": 1.6838, + "step": 837 + }, + { + "epoch": 1.52, + "grad_norm": 0.06696437299251556, + "learning_rate": 9.788525440084771e-05, + "loss": 1.6579, + "step": 838 + }, + { + "epoch": 1.52, + "grad_norm": 0.06473565846681595, + "learning_rate": 9.769303749206146e-05, + "loss": 1.6489, + "step": 839 + }, + { + "epoch": 1.52, + "grad_norm": 0.07211591303348541, + "learning_rate": 9.750082911105634e-05, + "loss": 1.7435, + "step": 840 + }, + { + "epoch": 1.52, + "grad_norm": 0.06550677120685577, + "learning_rate": 9.730862996833841e-05, + "loss": 1.6935, + "step": 841 + }, + { + "epoch": 1.52, + "grad_norm": 0.06820110231637955, + "learning_rate": 9.711644077437968e-05, + "loss": 1.6759, + "step": 842 + }, + { + "epoch": 1.52, + "grad_norm": 0.06783100217580795, + "learning_rate": 9.692426223961537e-05, + "loss": 1.7081, + "step": 843 + }, + { + "epoch": 1.53, + "grad_norm": 0.06615381687879562, + "learning_rate": 9.67320950744413e-05, + "loss": 1.7375, + "step": 844 + }, + { + "epoch": 1.53, + "grad_norm": 0.0648663192987442, + "learning_rate": 9.653993998921118e-05, + "loss": 1.6836, + "step": 845 + }, + { + "epoch": 1.53, + "grad_norm": 0.0639321506023407, + "learning_rate": 9.63477976942341e-05, + "loss": 1.6319, + "step": 846 + }, + { + "epoch": 1.53, + "grad_norm": 0.06528212130069733, + "learning_rate": 9.615566889977201e-05, + "loss": 1.6675, + "step": 847 + }, + { + "epoch": 1.53, + "grad_norm": 0.06574473530054092, + "learning_rate": 9.59635543160368e-05, + "loss": 1.6442, + "step": 848 + }, + { + "epoch": 1.54, + "grad_norm": 0.06326039880514145, + "learning_rate": 9.577145465318783e-05, + "loss": 1.639, + "step": 849 + }, + { + "epoch": 1.54, + "grad_norm": 0.06851720809936523, + "learning_rate": 9.557937062132938e-05, + "loss": 1.7044, + "step": 850 + }, + { + "epoch": 1.54, + "grad_norm": 0.06546233594417572, + "learning_rate": 9.538730293050792e-05, + "loss": 1.7091, + "step": 851 + }, + { + "epoch": 1.54, + "grad_norm": 0.0674884095788002, + "learning_rate": 9.51952522907095e-05, + "loss": 1.6572, + "step": 852 + }, + { + "epoch": 1.54, + "grad_norm": 0.06366416811943054, + "learning_rate": 9.50032194118571e-05, + "loss": 1.6913, + "step": 853 + }, + { + "epoch": 1.54, + "grad_norm": 0.065780408680439, + "learning_rate": 9.481120500380818e-05, + "loss": 1.7106, + "step": 854 + }, + { + "epoch": 1.55, + "grad_norm": 0.06662867218255997, + "learning_rate": 9.461920977635184e-05, + "loss": 1.6486, + "step": 855 + }, + { + "epoch": 1.55, + "grad_norm": 0.06339140236377716, + "learning_rate": 9.442723443920623e-05, + "loss": 1.6799, + "step": 856 + }, + { + "epoch": 1.55, + "grad_norm": 0.06222783029079437, + "learning_rate": 9.423527970201602e-05, + "loss": 1.72, + "step": 857 + }, + { + "epoch": 1.55, + "grad_norm": 0.06612752377986908, + "learning_rate": 9.404334627434974e-05, + "loss": 1.7294, + "step": 858 + }, + { + "epoch": 1.55, + "grad_norm": 0.06335198134183884, + "learning_rate": 9.385143486569718e-05, + "loss": 1.6978, + "step": 859 + }, + { + "epoch": 1.56, + "grad_norm": 0.0652630627155304, + "learning_rate": 9.365954618546665e-05, + "loss": 1.6808, + "step": 860 + }, + { + "epoch": 1.56, + "grad_norm": 0.08252695202827454, + "learning_rate": 9.346768094298252e-05, + "loss": 1.7117, + "step": 861 + }, + { + "epoch": 1.56, + "grad_norm": 0.0695163905620575, + "learning_rate": 9.327583984748248e-05, + "loss": 1.6948, + "step": 862 + }, + { + "epoch": 1.56, + "grad_norm": 0.06612583249807358, + "learning_rate": 9.308402360811497e-05, + "loss": 1.705, + "step": 863 + }, + { + "epoch": 1.56, + "grad_norm": 0.06415654718875885, + "learning_rate": 9.289223293393652e-05, + "loss": 1.6796, + "step": 864 + }, + { + "epoch": 1.56, + "grad_norm": 0.06522924453020096, + "learning_rate": 9.270046853390925e-05, + "loss": 1.6783, + "step": 865 + }, + { + "epoch": 1.57, + "grad_norm": 0.06422727555036545, + "learning_rate": 9.250873111689808e-05, + "loss": 1.709, + "step": 866 + }, + { + "epoch": 1.57, + "grad_norm": 0.06485796719789505, + "learning_rate": 9.231702139166816e-05, + "loss": 1.6323, + "step": 867 + }, + { + "epoch": 1.57, + "grad_norm": 0.06597612798213959, + "learning_rate": 9.212534006688233e-05, + "loss": 1.6578, + "step": 868 + }, + { + "epoch": 1.57, + "grad_norm": 0.06861060112714767, + "learning_rate": 9.193368785109844e-05, + "loss": 1.6711, + "step": 869 + }, + { + "epoch": 1.57, + "grad_norm": 0.07582002878189087, + "learning_rate": 9.174206545276677e-05, + "loss": 1.666, + "step": 870 + }, + { + "epoch": 1.58, + "grad_norm": 0.06606924533843994, + "learning_rate": 9.15504735802273e-05, + "loss": 1.7304, + "step": 871 + }, + { + "epoch": 1.58, + "grad_norm": 0.06642486900091171, + "learning_rate": 9.135891294170718e-05, + "loss": 1.7082, + "step": 872 + }, + { + "epoch": 1.58, + "grad_norm": 0.072264164686203, + "learning_rate": 9.11673842453182e-05, + "loss": 1.6355, + "step": 873 + }, + { + "epoch": 1.58, + "grad_norm": 0.06571400165557861, + "learning_rate": 9.097588819905394e-05, + "loss": 1.6383, + "step": 874 + }, + { + "epoch": 1.58, + "grad_norm": 0.062258243560791016, + "learning_rate": 9.078442551078736e-05, + "loss": 1.6676, + "step": 875 + }, + { + "epoch": 1.58, + "grad_norm": 0.06381349265575409, + "learning_rate": 9.059299688826816e-05, + "loss": 1.699, + "step": 876 + }, + { + "epoch": 1.59, + "grad_norm": 0.06702978163957596, + "learning_rate": 9.040160303912003e-05, + "loss": 1.7245, + "step": 877 + }, + { + "epoch": 1.59, + "grad_norm": 0.0637059286236763, + "learning_rate": 9.021024467083812e-05, + "loss": 1.6478, + "step": 878 + }, + { + "epoch": 1.59, + "grad_norm": 0.0654047429561615, + "learning_rate": 9.001892249078648e-05, + "loss": 1.7275, + "step": 879 + }, + { + "epoch": 1.59, + "grad_norm": 0.06602399051189423, + "learning_rate": 8.982763720619533e-05, + "loss": 1.7712, + "step": 880 + }, + { + "epoch": 1.59, + "grad_norm": 0.06693969666957855, + "learning_rate": 8.96363895241586e-05, + "loss": 1.6684, + "step": 881 + }, + { + "epoch": 1.6, + "grad_norm": 0.06519246846437454, + "learning_rate": 8.944518015163108e-05, + "loss": 1.6698, + "step": 882 + }, + { + "epoch": 1.6, + "grad_norm": 0.06838595122098923, + "learning_rate": 8.925400979542606e-05, + "loss": 1.655, + "step": 883 + }, + { + "epoch": 1.6, + "grad_norm": 0.06535571813583374, + "learning_rate": 8.906287916221259e-05, + "loss": 1.6858, + "step": 884 + }, + { + "epoch": 1.6, + "grad_norm": 0.06805121898651123, + "learning_rate": 8.887178895851279e-05, + "loss": 1.6746, + "step": 885 + }, + { + "epoch": 1.6, + "grad_norm": 0.0715852826833725, + "learning_rate": 8.868073989069943e-05, + "loss": 1.7676, + "step": 886 + }, + { + "epoch": 1.6, + "grad_norm": 0.06408550590276718, + "learning_rate": 8.848973266499322e-05, + "loss": 1.6434, + "step": 887 + }, + { + "epoch": 1.61, + "grad_norm": 0.0682334452867508, + "learning_rate": 8.829876798746017e-05, + "loss": 1.6663, + "step": 888 + }, + { + "epoch": 1.61, + "grad_norm": 0.06532958894968033, + "learning_rate": 8.810784656400895e-05, + "loss": 1.6914, + "step": 889 + }, + { + "epoch": 1.61, + "grad_norm": 0.06579031050205231, + "learning_rate": 8.791696910038843e-05, + "loss": 1.6359, + "step": 890 + }, + { + "epoch": 1.61, + "grad_norm": 0.0659404769539833, + "learning_rate": 8.772613630218492e-05, + "loss": 1.7121, + "step": 891 + }, + { + "epoch": 1.61, + "grad_norm": 0.06567792594432831, + "learning_rate": 8.753534887481976e-05, + "loss": 1.6565, + "step": 892 + }, + { + "epoch": 1.62, + "grad_norm": 0.07625501602888107, + "learning_rate": 8.734460752354629e-05, + "loss": 1.6743, + "step": 893 + }, + { + "epoch": 1.62, + "grad_norm": 0.06591348350048065, + "learning_rate": 8.715391295344784e-05, + "loss": 1.6265, + "step": 894 + }, + { + "epoch": 1.62, + "grad_norm": 0.06538601964712143, + "learning_rate": 8.696326586943464e-05, + "loss": 1.7139, + "step": 895 + }, + { + "epoch": 1.62, + "grad_norm": 0.06885919719934464, + "learning_rate": 8.677266697624138e-05, + "loss": 1.6884, + "step": 896 + }, + { + "epoch": 1.62, + "grad_norm": 0.06452605873346329, + "learning_rate": 8.658211697842466e-05, + "loss": 1.6894, + "step": 897 + }, + { + "epoch": 1.62, + "grad_norm": 0.06521788239479065, + "learning_rate": 8.639161658036037e-05, + "loss": 1.6943, + "step": 898 + }, + { + "epoch": 1.63, + "grad_norm": 0.06771497428417206, + "learning_rate": 8.6201166486241e-05, + "loss": 1.6718, + "step": 899 + }, + { + "epoch": 1.63, + "grad_norm": 0.0637250766158104, + "learning_rate": 8.601076740007305e-05, + "loss": 1.6842, + "step": 900 + }, + { + "epoch": 1.63, + "grad_norm": 0.0656089335680008, + "learning_rate": 8.582042002567456e-05, + "loss": 1.6649, + "step": 901 + }, + { + "epoch": 1.63, + "grad_norm": 0.06827680766582489, + "learning_rate": 8.563012506667233e-05, + "loss": 1.7095, + "step": 902 + }, + { + "epoch": 1.63, + "grad_norm": 0.06502600759267807, + "learning_rate": 8.543988322649954e-05, + "loss": 1.6368, + "step": 903 + }, + { + "epoch": 1.64, + "grad_norm": 0.06803898513317108, + "learning_rate": 8.524969520839279e-05, + "loss": 1.657, + "step": 904 + }, + { + "epoch": 1.64, + "grad_norm": 0.06632059067487717, + "learning_rate": 8.505956171538994e-05, + "loss": 1.7279, + "step": 905 + }, + { + "epoch": 1.64, + "grad_norm": 0.06838211417198181, + "learning_rate": 8.486948345032719e-05, + "loss": 1.6318, + "step": 906 + }, + { + "epoch": 1.64, + "grad_norm": 0.0652574896812439, + "learning_rate": 8.46794611158366e-05, + "loss": 1.6307, + "step": 907 + }, + { + "epoch": 1.64, + "grad_norm": 0.0648072361946106, + "learning_rate": 8.448949541434346e-05, + "loss": 1.6517, + "step": 908 + }, + { + "epoch": 1.64, + "grad_norm": 0.06592056900262833, + "learning_rate": 8.429958704806379e-05, + "loss": 1.6958, + "step": 909 + }, + { + "epoch": 1.65, + "grad_norm": 0.06285024434328079, + "learning_rate": 8.410973671900162e-05, + "loss": 1.666, + "step": 910 + }, + { + "epoch": 1.65, + "grad_norm": 0.06529216468334198, + "learning_rate": 8.391994512894641e-05, + "loss": 1.6919, + "step": 911 + }, + { + "epoch": 1.65, + "grad_norm": 0.06455468386411667, + "learning_rate": 8.373021297947053e-05, + "loss": 1.6217, + "step": 912 + }, + { + "epoch": 1.65, + "grad_norm": 0.06522978842258453, + "learning_rate": 8.35405409719266e-05, + "loss": 1.6729, + "step": 913 + }, + { + "epoch": 1.65, + "grad_norm": 0.06686036288738251, + "learning_rate": 8.335092980744502e-05, + "loss": 1.6324, + "step": 914 + }, + { + "epoch": 1.66, + "grad_norm": 0.06648086756467819, + "learning_rate": 8.316138018693108e-05, + "loss": 1.6052, + "step": 915 + }, + { + "epoch": 1.66, + "grad_norm": 0.06622032076120377, + "learning_rate": 8.297189281106278e-05, + "loss": 1.7219, + "step": 916 + }, + { + "epoch": 1.66, + "grad_norm": 0.07183654606342316, + "learning_rate": 8.278246838028793e-05, + "loss": 1.7633, + "step": 917 + }, + { + "epoch": 1.66, + "grad_norm": 0.06654607504606247, + "learning_rate": 8.259310759482164e-05, + "loss": 1.7602, + "step": 918 + }, + { + "epoch": 1.66, + "grad_norm": 0.06768395006656647, + "learning_rate": 8.240381115464377e-05, + "loss": 1.678, + "step": 919 + }, + { + "epoch": 1.66, + "grad_norm": 0.0649079754948616, + "learning_rate": 8.22145797594964e-05, + "loss": 1.7013, + "step": 920 + }, + { + "epoch": 1.67, + "grad_norm": 0.06565246731042862, + "learning_rate": 8.20254141088811e-05, + "loss": 1.7064, + "step": 921 + }, + { + "epoch": 1.67, + "grad_norm": 0.06477197259664536, + "learning_rate": 8.183631490205637e-05, + "loss": 1.7219, + "step": 922 + }, + { + "epoch": 1.67, + "grad_norm": 0.06408128142356873, + "learning_rate": 8.164728283803518e-05, + "loss": 1.7337, + "step": 923 + }, + { + "epoch": 1.67, + "grad_norm": 0.06464950740337372, + "learning_rate": 8.145831861558225e-05, + "loss": 1.6853, + "step": 924 + }, + { + "epoch": 1.67, + "grad_norm": 0.06401928514242172, + "learning_rate": 8.126942293321162e-05, + "loss": 1.6587, + "step": 925 + }, + { + "epoch": 1.68, + "grad_norm": 0.06978955864906311, + "learning_rate": 8.108059648918377e-05, + "loss": 1.7083, + "step": 926 + }, + { + "epoch": 1.68, + "grad_norm": 0.06544001400470734, + "learning_rate": 8.089183998150344e-05, + "loss": 1.6318, + "step": 927 + }, + { + "epoch": 1.68, + "grad_norm": 0.06558380275964737, + "learning_rate": 8.070315410791679e-05, + "loss": 1.6897, + "step": 928 + }, + { + "epoch": 1.68, + "grad_norm": 0.06930231302976608, + "learning_rate": 8.051453956590878e-05, + "loss": 1.6266, + "step": 929 + }, + { + "epoch": 1.68, + "grad_norm": 0.06593599915504456, + "learning_rate": 8.03259970527008e-05, + "loss": 1.7096, + "step": 930 + }, + { + "epoch": 1.69, + "grad_norm": 0.06622833758592606, + "learning_rate": 8.013752726524795e-05, + "loss": 1.5817, + "step": 931 + }, + { + "epoch": 1.69, + "grad_norm": 0.06626243144273758, + "learning_rate": 7.994913090023651e-05, + "loss": 1.6525, + "step": 932 + }, + { + "epoch": 1.69, + "grad_norm": 0.0677393451333046, + "learning_rate": 7.976080865408131e-05, + "loss": 1.7158, + "step": 933 + }, + { + "epoch": 1.69, + "grad_norm": 0.06529498845338821, + "learning_rate": 7.957256122292323e-05, + "loss": 1.7317, + "step": 934 + }, + { + "epoch": 1.69, + "grad_norm": 0.07396451383829117, + "learning_rate": 7.938438930262656e-05, + "loss": 1.6791, + "step": 935 + }, + { + "epoch": 1.69, + "grad_norm": 0.07032353430986404, + "learning_rate": 7.919629358877657e-05, + "loss": 1.7024, + "step": 936 + }, + { + "epoch": 1.7, + "grad_norm": 0.06451990455389023, + "learning_rate": 7.900827477667663e-05, + "loss": 1.7266, + "step": 937 + }, + { + "epoch": 1.7, + "grad_norm": 0.06694858521223068, + "learning_rate": 7.882033356134603e-05, + "loss": 1.6612, + "step": 938 + }, + { + "epoch": 1.7, + "grad_norm": 0.06609500199556351, + "learning_rate": 7.863247063751715e-05, + "loss": 1.713, + "step": 939 + }, + { + "epoch": 1.7, + "grad_norm": 0.06344272941350937, + "learning_rate": 7.844468669963289e-05, + "loss": 1.6219, + "step": 940 + }, + { + "epoch": 1.7, + "grad_norm": 0.06307589262723923, + "learning_rate": 7.825698244184431e-05, + "loss": 1.7042, + "step": 941 + }, + { + "epoch": 1.71, + "grad_norm": 0.06659837812185287, + "learning_rate": 7.806935855800782e-05, + "loss": 1.6993, + "step": 942 + }, + { + "epoch": 1.71, + "grad_norm": 0.06524292379617691, + "learning_rate": 7.788181574168283e-05, + "loss": 1.6687, + "step": 943 + }, + { + "epoch": 1.71, + "grad_norm": 0.06560816615819931, + "learning_rate": 7.769435468612896e-05, + "loss": 1.7081, + "step": 944 + }, + { + "epoch": 1.71, + "grad_norm": 0.06725630909204483, + "learning_rate": 7.750697608430365e-05, + "loss": 1.7001, + "step": 945 + }, + { + "epoch": 1.71, + "grad_norm": 0.06650066375732422, + "learning_rate": 7.731968062885956e-05, + "loss": 1.7225, + "step": 946 + }, + { + "epoch": 1.71, + "grad_norm": 0.06517896801233292, + "learning_rate": 7.713246901214206e-05, + "loss": 1.6299, + "step": 947 + }, + { + "epoch": 1.72, + "grad_norm": 0.06807747483253479, + "learning_rate": 7.694534192618641e-05, + "loss": 1.695, + "step": 948 + }, + { + "epoch": 1.72, + "grad_norm": 0.06809186935424805, + "learning_rate": 7.67583000627156e-05, + "loss": 1.6611, + "step": 949 + }, + { + "epoch": 1.72, + "grad_norm": 0.06693090498447418, + "learning_rate": 7.657134411313753e-05, + "loss": 1.6603, + "step": 950 + }, + { + "epoch": 1.72, + "grad_norm": 0.06553305685520172, + "learning_rate": 7.638447476854245e-05, + "loss": 1.7036, + "step": 951 + }, + { + "epoch": 1.72, + "grad_norm": 0.06823913007974625, + "learning_rate": 7.619769271970056e-05, + "loss": 1.6848, + "step": 952 + }, + { + "epoch": 1.73, + "grad_norm": 0.0652228444814682, + "learning_rate": 7.601099865705927e-05, + "loss": 1.6893, + "step": 953 + }, + { + "epoch": 1.73, + "grad_norm": 0.07233775407075882, + "learning_rate": 7.58243932707409e-05, + "loss": 1.6777, + "step": 954 + }, + { + "epoch": 1.73, + "grad_norm": 0.07119675725698471, + "learning_rate": 7.563787725053981e-05, + "loss": 1.706, + "step": 955 + }, + { + "epoch": 1.73, + "grad_norm": 0.06489936262369156, + "learning_rate": 7.54514512859201e-05, + "loss": 1.6538, + "step": 956 + }, + { + "epoch": 1.73, + "grad_norm": 0.06696008145809174, + "learning_rate": 7.526511606601293e-05, + "loss": 1.6862, + "step": 957 + }, + { + "epoch": 1.73, + "grad_norm": 0.06405473500490189, + "learning_rate": 7.507887227961414e-05, + "loss": 1.662, + "step": 958 + }, + { + "epoch": 1.74, + "grad_norm": 0.06998445093631744, + "learning_rate": 7.489272061518136e-05, + "loss": 1.6604, + "step": 959 + }, + { + "epoch": 1.74, + "eval_loss": 1.726022481918335, + "eval_runtime": 76.3141, + "eval_samples_per_second": 65.519, + "eval_steps_per_second": 16.38, + "step": 959 + }, + { + "epoch": 1.74, + "grad_norm": 0.06673965603113174, + "learning_rate": 7.470666176083192e-05, + "loss": 1.7049, + "step": 960 + }, + { + "epoch": 1.74, + "grad_norm": 0.06746464222669601, + "learning_rate": 7.452069640433997e-05, + "loss": 1.6803, + "step": 961 + }, + { + "epoch": 1.74, + "grad_norm": 0.06396359950304031, + "learning_rate": 7.433482523313395e-05, + "loss": 1.7104, + "step": 962 + }, + { + "epoch": 1.74, + "grad_norm": 0.066098153591156, + "learning_rate": 7.414904893429433e-05, + "loss": 1.6527, + "step": 963 + }, + { + "epoch": 1.75, + "grad_norm": 0.06473662704229355, + "learning_rate": 7.39633681945507e-05, + "loss": 1.6891, + "step": 964 + }, + { + "epoch": 1.75, + "grad_norm": 0.07003339380025864, + "learning_rate": 7.377778370027962e-05, + "loss": 1.676, + "step": 965 + }, + { + "epoch": 1.75, + "grad_norm": 0.06654497236013412, + "learning_rate": 7.35922961375016e-05, + "loss": 1.6601, + "step": 966 + }, + { + "epoch": 1.75, + "grad_norm": 0.06775406002998352, + "learning_rate": 7.340690619187908e-05, + "loss": 1.6391, + "step": 967 + }, + { + "epoch": 1.75, + "grad_norm": 0.06764483451843262, + "learning_rate": 7.322161454871356e-05, + "loss": 1.7057, + "step": 968 + }, + { + "epoch": 1.75, + "grad_norm": 0.0728226825594902, + "learning_rate": 7.303642189294316e-05, + "loss": 1.6793, + "step": 969 + }, + { + "epoch": 1.76, + "grad_norm": 0.06543935835361481, + "learning_rate": 7.285132890914002e-05, + "loss": 1.6962, + "step": 970 + }, + { + "epoch": 1.76, + "grad_norm": 0.06830572336912155, + "learning_rate": 7.266633628150801e-05, + "loss": 1.6774, + "step": 971 + }, + { + "epoch": 1.76, + "grad_norm": 0.07373080402612686, + "learning_rate": 7.248144469387992e-05, + "loss": 1.6815, + "step": 972 + }, + { + "epoch": 1.76, + "grad_norm": 0.06465107947587967, + "learning_rate": 7.229665482971499e-05, + "loss": 1.6572, + "step": 973 + }, + { + "epoch": 1.76, + "grad_norm": 0.06544660031795502, + "learning_rate": 7.211196737209653e-05, + "loss": 1.6841, + "step": 974 + }, + { + "epoch": 1.77, + "grad_norm": 0.06559861451387405, + "learning_rate": 7.192738300372925e-05, + "loss": 1.6835, + "step": 975 + }, + { + "epoch": 1.77, + "grad_norm": 0.06756362318992615, + "learning_rate": 7.174290240693689e-05, + "loss": 1.5912, + "step": 976 + }, + { + "epoch": 1.77, + "grad_norm": 0.06515438854694366, + "learning_rate": 7.155852626365938e-05, + "loss": 1.6586, + "step": 977 + }, + { + "epoch": 1.77, + "grad_norm": 0.06673271209001541, + "learning_rate": 7.137425525545074e-05, + "loss": 1.67, + "step": 978 + }, + { + "epoch": 1.77, + "grad_norm": 0.06732840090990067, + "learning_rate": 7.119009006347625e-05, + "loss": 1.6262, + "step": 979 + }, + { + "epoch": 1.77, + "grad_norm": 0.0666419267654419, + "learning_rate": 7.100603136851009e-05, + "loss": 1.6963, + "step": 980 + }, + { + "epoch": 1.78, + "grad_norm": 0.07527624070644379, + "learning_rate": 7.082207985093268e-05, + "loss": 1.6903, + "step": 981 + }, + { + "epoch": 1.78, + "grad_norm": 0.06989062577486038, + "learning_rate": 7.063823619072838e-05, + "loss": 1.6497, + "step": 982 + }, + { + "epoch": 1.78, + "grad_norm": 0.0654689222574234, + "learning_rate": 7.045450106748277e-05, + "loss": 1.6782, + "step": 983 + }, + { + "epoch": 1.78, + "grad_norm": 0.06511061638593674, + "learning_rate": 7.027087516038022e-05, + "loss": 1.6824, + "step": 984 + }, + { + "epoch": 1.78, + "grad_norm": 0.06674464046955109, + "learning_rate": 7.008735914820138e-05, + "loss": 1.7367, + "step": 985 + }, + { + "epoch": 1.79, + "grad_norm": 0.06592298299074173, + "learning_rate": 6.990395370932068e-05, + "loss": 1.6879, + "step": 986 + }, + { + "epoch": 1.79, + "grad_norm": 0.06826543807983398, + "learning_rate": 6.97206595217039e-05, + "loss": 1.6682, + "step": 987 + }, + { + "epoch": 1.79, + "grad_norm": 0.06695631891489029, + "learning_rate": 6.953747726290535e-05, + "loss": 1.7181, + "step": 988 + }, + { + "epoch": 1.79, + "grad_norm": 0.06656961888074875, + "learning_rate": 6.935440761006582e-05, + "loss": 1.6778, + "step": 989 + }, + { + "epoch": 1.79, + "grad_norm": 0.06611720472574234, + "learning_rate": 6.917145123990973e-05, + "loss": 1.6467, + "step": 990 + }, + { + "epoch": 1.79, + "grad_norm": 0.06846632063388824, + "learning_rate": 6.898860882874279e-05, + "loss": 1.7165, + "step": 991 + }, + { + "epoch": 1.8, + "grad_norm": 0.06631824374198914, + "learning_rate": 6.88058810524494e-05, + "loss": 1.7042, + "step": 992 + }, + { + "epoch": 1.8, + "grad_norm": 0.06761027872562408, + "learning_rate": 6.862326858649026e-05, + "loss": 1.6822, + "step": 993 + }, + { + "epoch": 1.8, + "grad_norm": 0.06898529827594757, + "learning_rate": 6.844077210589986e-05, + "loss": 1.6635, + "step": 994 + }, + { + "epoch": 1.8, + "grad_norm": 0.06683610379695892, + "learning_rate": 6.825839228528382e-05, + "loss": 1.6949, + "step": 995 + }, + { + "epoch": 1.8, + "grad_norm": 0.06670662760734558, + "learning_rate": 6.807612979881661e-05, + "loss": 1.6724, + "step": 996 + }, + { + "epoch": 1.81, + "grad_norm": 0.19084873795509338, + "learning_rate": 6.789398532023894e-05, + "loss": 1.7499, + "step": 997 + }, + { + "epoch": 1.81, + "grad_norm": 0.06561749428510666, + "learning_rate": 6.77119595228554e-05, + "loss": 1.6733, + "step": 998 + }, + { + "epoch": 1.81, + "grad_norm": 0.07371030747890472, + "learning_rate": 6.753005307953167e-05, + "loss": 1.6607, + "step": 999 + }, + { + "epoch": 1.81, + "grad_norm": 0.0679875835776329, + "learning_rate": 6.734826666269238e-05, + "loss": 1.6233, + "step": 1000 + }, + { + "epoch": 1.81, + "grad_norm": 0.0667947381734848, + "learning_rate": 6.716660094431846e-05, + "loss": 1.6186, + "step": 1001 + }, + { + "epoch": 1.81, + "grad_norm": 0.06578990817070007, + "learning_rate": 6.698505659594466e-05, + "loss": 1.6997, + "step": 1002 + }, + { + "epoch": 1.82, + "grad_norm": 0.07320542633533478, + "learning_rate": 6.680363428865704e-05, + "loss": 1.6729, + "step": 1003 + }, + { + "epoch": 1.82, + "grad_norm": 0.06879616528749466, + "learning_rate": 6.662233469309058e-05, + "loss": 1.6982, + "step": 1004 + }, + { + "epoch": 1.82, + "grad_norm": 0.06353451311588287, + "learning_rate": 6.644115847942667e-05, + "loss": 1.6698, + "step": 1005 + }, + { + "epoch": 1.82, + "grad_norm": 0.06664732843637466, + "learning_rate": 6.626010631739054e-05, + "loss": 1.6225, + "step": 1006 + }, + { + "epoch": 1.82, + "grad_norm": 0.0662289708852768, + "learning_rate": 6.60791788762489e-05, + "loss": 1.713, + "step": 1007 + }, + { + "epoch": 1.83, + "grad_norm": 0.06735072284936905, + "learning_rate": 6.589837682480744e-05, + "loss": 1.6431, + "step": 1008 + }, + { + "epoch": 1.83, + "grad_norm": 0.06567612290382385, + "learning_rate": 6.571770083140836e-05, + "loss": 1.6972, + "step": 1009 + }, + { + "epoch": 1.83, + "grad_norm": 0.06742958724498749, + "learning_rate": 6.553715156392776e-05, + "loss": 1.6439, + "step": 1010 + }, + { + "epoch": 1.83, + "grad_norm": 0.06748675554990768, + "learning_rate": 6.535672968977345e-05, + "loss": 1.6711, + "step": 1011 + }, + { + "epoch": 1.83, + "grad_norm": 0.07259120792150497, + "learning_rate": 6.517643587588221e-05, + "loss": 1.7223, + "step": 1012 + }, + { + "epoch": 1.83, + "grad_norm": 0.07579007744789124, + "learning_rate": 6.499627078871753e-05, + "loss": 1.6614, + "step": 1013 + }, + { + "epoch": 1.84, + "grad_norm": 0.07152054458856583, + "learning_rate": 6.481623509426697e-05, + "loss": 1.7038, + "step": 1014 + }, + { + "epoch": 1.84, + "grad_norm": 0.06873390078544617, + "learning_rate": 6.463632945803981e-05, + "loss": 1.6602, + "step": 1015 + }, + { + "epoch": 1.84, + "grad_norm": 0.0664227306842804, + "learning_rate": 6.445655454506465e-05, + "loss": 1.6916, + "step": 1016 + }, + { + "epoch": 1.84, + "grad_norm": 0.06599757075309753, + "learning_rate": 6.427691101988673e-05, + "loss": 1.605, + "step": 1017 + }, + { + "epoch": 1.84, + "grad_norm": 0.06476866453886032, + "learning_rate": 6.40973995465657e-05, + "loss": 1.6309, + "step": 1018 + }, + { + "epoch": 1.85, + "grad_norm": 0.06668147444725037, + "learning_rate": 6.391802078867304e-05, + "loss": 1.684, + "step": 1019 + }, + { + "epoch": 1.85, + "grad_norm": 0.06579145044088364, + "learning_rate": 6.373877540928972e-05, + "loss": 1.6277, + "step": 1020 + }, + { + "epoch": 1.85, + "grad_norm": 0.06740958243608475, + "learning_rate": 6.355966407100346e-05, + "loss": 1.728, + "step": 1021 + }, + { + "epoch": 1.85, + "grad_norm": 0.07092586159706116, + "learning_rate": 6.338068743590676e-05, + "loss": 1.7091, + "step": 1022 + }, + { + "epoch": 1.85, + "grad_norm": 0.06797771900892258, + "learning_rate": 6.320184616559402e-05, + "loss": 1.6962, + "step": 1023 + }, + { + "epoch": 1.85, + "grad_norm": 0.06833136081695557, + "learning_rate": 6.30231409211593e-05, + "loss": 1.6981, + "step": 1024 + }, + { + "epoch": 1.86, + "grad_norm": 0.06703907996416092, + "learning_rate": 6.284457236319381e-05, + "loss": 1.7082, + "step": 1025 + }, + { + "epoch": 1.86, + "grad_norm": 0.0666668489575386, + "learning_rate": 6.266614115178351e-05, + "loss": 1.6198, + "step": 1026 + }, + { + "epoch": 1.86, + "grad_norm": 0.07242632657289505, + "learning_rate": 6.248784794650672e-05, + "loss": 1.705, + "step": 1027 + }, + { + "epoch": 1.86, + "grad_norm": 0.06651555746793747, + "learning_rate": 6.230969340643149e-05, + "loss": 1.6417, + "step": 1028 + }, + { + "epoch": 1.86, + "grad_norm": 0.06552428007125854, + "learning_rate": 6.213167819011338e-05, + "loss": 1.6917, + "step": 1029 + }, + { + "epoch": 1.87, + "grad_norm": 0.06741311401128769, + "learning_rate": 6.195380295559288e-05, + "loss": 1.7241, + "step": 1030 + }, + { + "epoch": 1.87, + "grad_norm": 0.06656550616025925, + "learning_rate": 6.177606836039311e-05, + "loss": 1.646, + "step": 1031 + }, + { + "epoch": 1.87, + "grad_norm": 0.06896986067295074, + "learning_rate": 6.159847506151719e-05, + "loss": 1.6708, + "step": 1032 + }, + { + "epoch": 1.87, + "grad_norm": 0.06811494380235672, + "learning_rate": 6.142102371544604e-05, + "loss": 1.6927, + "step": 1033 + }, + { + "epoch": 1.87, + "grad_norm": 0.06616541743278503, + "learning_rate": 6.124371497813582e-05, + "loss": 1.6175, + "step": 1034 + }, + { + "epoch": 1.87, + "grad_norm": 0.06697241216897964, + "learning_rate": 6.106654950501547e-05, + "loss": 1.6848, + "step": 1035 + }, + { + "epoch": 1.88, + "grad_norm": 0.06779171526432037, + "learning_rate": 6.0889527950984416e-05, + "loss": 1.6566, + "step": 1036 + }, + { + "epoch": 1.88, + "grad_norm": 0.0683891773223877, + "learning_rate": 6.071265097041005e-05, + "loss": 1.6258, + "step": 1037 + }, + { + "epoch": 1.88, + "grad_norm": 0.06936081498861313, + "learning_rate": 6.053591921712541e-05, + "loss": 1.6115, + "step": 1038 + }, + { + "epoch": 1.88, + "grad_norm": 0.0856877937912941, + "learning_rate": 6.035933334442654e-05, + "loss": 1.6742, + "step": 1039 + }, + { + "epoch": 1.88, + "grad_norm": 0.07240041345357895, + "learning_rate": 6.01828940050704e-05, + "loss": 1.6901, + "step": 1040 + }, + { + "epoch": 1.89, + "grad_norm": 0.0770583376288414, + "learning_rate": 6.000660185127219e-05, + "loss": 1.6803, + "step": 1041 + }, + { + "epoch": 1.89, + "grad_norm": 0.06806863099336624, + "learning_rate": 5.983045753470308e-05, + "loss": 1.6561, + "step": 1042 + }, + { + "epoch": 1.89, + "grad_norm": 0.06816756725311279, + "learning_rate": 5.965446170648765e-05, + "loss": 1.6635, + "step": 1043 + }, + { + "epoch": 1.89, + "grad_norm": 0.06543378531932831, + "learning_rate": 5.947861501720175e-05, + "loss": 1.7153, + "step": 1044 + }, + { + "epoch": 1.89, + "grad_norm": 0.06688012927770615, + "learning_rate": 5.930291811686983e-05, + "loss": 1.7142, + "step": 1045 + }, + { + "epoch": 1.89, + "grad_norm": 0.071477010846138, + "learning_rate": 5.9127371654962615e-05, + "loss": 1.6804, + "step": 1046 + }, + { + "epoch": 1.9, + "grad_norm": 0.06843505799770355, + "learning_rate": 5.8951976280394795e-05, + "loss": 1.7476, + "step": 1047 + }, + { + "epoch": 1.9, + "grad_norm": 0.06697747856378555, + "learning_rate": 5.8776732641522503e-05, + "loss": 1.662, + "step": 1048 + }, + { + "epoch": 1.9, + "grad_norm": 0.06771202385425568, + "learning_rate": 5.86016413861411e-05, + "loss": 1.655, + "step": 1049 + }, + { + "epoch": 1.9, + "grad_norm": 0.07092612236738205, + "learning_rate": 5.842670316148244e-05, + "loss": 1.707, + "step": 1050 + }, + { + "epoch": 1.9, + "grad_norm": 0.06740372627973557, + "learning_rate": 5.825191861421285e-05, + "loss": 1.673, + "step": 1051 + }, + { + "epoch": 1.91, + "grad_norm": 0.06587556004524231, + "learning_rate": 5.807728839043061e-05, + "loss": 1.6879, + "step": 1052 + }, + { + "epoch": 1.91, + "grad_norm": 0.06834732741117477, + "learning_rate": 5.790281313566341e-05, + "loss": 1.7233, + "step": 1053 + }, + { + "epoch": 1.91, + "grad_norm": 0.06691209226846695, + "learning_rate": 5.7728493494866134e-05, + "loss": 1.6966, + "step": 1054 + }, + { + "epoch": 1.91, + "grad_norm": 0.06715382635593414, + "learning_rate": 5.755433011241851e-05, + "loss": 1.7185, + "step": 1055 + }, + { + "epoch": 1.91, + "grad_norm": 0.06831709295511246, + "learning_rate": 5.738032363212258e-05, + "loss": 1.6529, + "step": 1056 + }, + { + "epoch": 1.91, + "grad_norm": 0.06592843681573868, + "learning_rate": 5.720647469720033e-05, + "loss": 1.6939, + "step": 1057 + }, + { + "epoch": 1.92, + "grad_norm": 0.06575801223516464, + "learning_rate": 5.70327839502915e-05, + "loss": 1.6642, + "step": 1058 + }, + { + "epoch": 1.92, + "grad_norm": 0.07193956524133682, + "learning_rate": 5.685925203345108e-05, + "loss": 1.6675, + "step": 1059 + }, + { + "epoch": 1.92, + "grad_norm": 0.0670444443821907, + "learning_rate": 5.6685879588146815e-05, + "loss": 1.7136, + "step": 1060 + }, + { + "epoch": 1.92, + "grad_norm": 0.07206844538450241, + "learning_rate": 5.651266725525703e-05, + "loss": 1.6999, + "step": 1061 + }, + { + "epoch": 1.92, + "grad_norm": 0.0692375898361206, + "learning_rate": 5.633961567506819e-05, + "loss": 1.6782, + "step": 1062 + }, + { + "epoch": 1.93, + "grad_norm": 0.06483175605535507, + "learning_rate": 5.6166725487272576e-05, + "loss": 1.6448, + "step": 1063 + }, + { + "epoch": 1.93, + "grad_norm": 0.0667993351817131, + "learning_rate": 5.5993997330965796e-05, + "loss": 1.6683, + "step": 1064 + }, + { + "epoch": 1.93, + "grad_norm": 0.0673048198223114, + "learning_rate": 5.5821431844644476e-05, + "loss": 1.6534, + "step": 1065 + }, + { + "epoch": 1.93, + "grad_norm": 0.07212254405021667, + "learning_rate": 5.564902966620408e-05, + "loss": 1.7084, + "step": 1066 + }, + { + "epoch": 1.93, + "grad_norm": 0.06697355955839157, + "learning_rate": 5.547679143293624e-05, + "loss": 1.7029, + "step": 1067 + }, + { + "epoch": 1.93, + "grad_norm": 0.07669904828071594, + "learning_rate": 5.530471778152658e-05, + "loss": 1.7153, + "step": 1068 + }, + { + "epoch": 1.94, + "grad_norm": 0.07381530106067657, + "learning_rate": 5.513280934805243e-05, + "loss": 1.6769, + "step": 1069 + }, + { + "epoch": 1.94, + "grad_norm": 0.068946473300457, + "learning_rate": 5.4961066767980363e-05, + "loss": 1.6799, + "step": 1070 + }, + { + "epoch": 1.94, + "grad_norm": 0.06763108819723129, + "learning_rate": 5.478949067616381e-05, + "loss": 1.7185, + "step": 1071 + }, + { + "epoch": 1.94, + "grad_norm": 0.06624120473861694, + "learning_rate": 5.4618081706840754e-05, + "loss": 1.6972, + "step": 1072 + }, + { + "epoch": 1.94, + "grad_norm": 0.06670323014259338, + "learning_rate": 5.444684049363147e-05, + "loss": 1.6826, + "step": 1073 + }, + { + "epoch": 1.95, + "grad_norm": 0.06699904054403305, + "learning_rate": 5.4275767669536146e-05, + "loss": 1.643, + "step": 1074 + }, + { + "epoch": 1.95, + "grad_norm": 0.07036450505256653, + "learning_rate": 5.410486386693243e-05, + "loss": 1.6719, + "step": 1075 + }, + { + "epoch": 1.95, + "grad_norm": 0.06482276320457458, + "learning_rate": 5.3934129717573165e-05, + "loss": 1.6756, + "step": 1076 + }, + { + "epoch": 1.95, + "grad_norm": 0.06716746836900711, + "learning_rate": 5.3763565852584177e-05, + "loss": 1.6995, + "step": 1077 + }, + { + "epoch": 1.95, + "grad_norm": 0.06743574887514114, + "learning_rate": 5.3593172902461717e-05, + "loss": 1.7064, + "step": 1078 + }, + { + "epoch": 1.95, + "grad_norm": 0.06770848482847214, + "learning_rate": 5.342295149707025e-05, + "loss": 1.6588, + "step": 1079 + }, + { + "epoch": 1.96, + "grad_norm": 0.06666205823421478, + "learning_rate": 5.325290226564017e-05, + "loss": 1.6215, + "step": 1080 + }, + { + "epoch": 1.96, + "grad_norm": 0.0728970617055893, + "learning_rate": 5.308302583676548e-05, + "loss": 1.6878, + "step": 1081 + }, + { + "epoch": 1.96, + "grad_norm": 0.06758435070514679, + "learning_rate": 5.291332283840125e-05, + "loss": 1.6422, + "step": 1082 + }, + { + "epoch": 1.96, + "grad_norm": 0.06901335716247559, + "learning_rate": 5.274379389786154e-05, + "loss": 1.7208, + "step": 1083 + }, + { + "epoch": 1.96, + "grad_norm": 0.06578974425792694, + "learning_rate": 5.2574439641817006e-05, + "loss": 1.6822, + "step": 1084 + }, + { + "epoch": 1.97, + "grad_norm": 0.08507327735424042, + "learning_rate": 5.240526069629265e-05, + "loss": 1.6697, + "step": 1085 + }, + { + "epoch": 1.97, + "grad_norm": 0.06818517297506332, + "learning_rate": 5.223625768666528e-05, + "loss": 1.7514, + "step": 1086 + }, + { + "epoch": 1.97, + "grad_norm": 0.06869194656610489, + "learning_rate": 5.206743123766139e-05, + "loss": 1.6667, + "step": 1087 + }, + { + "epoch": 1.97, + "grad_norm": 0.06622481346130371, + "learning_rate": 5.1898781973354914e-05, + "loss": 1.6807, + "step": 1088 + }, + { + "epoch": 1.97, + "grad_norm": 0.07047388702630997, + "learning_rate": 5.173031051716472e-05, + "loss": 1.7118, + "step": 1089 + }, + { + "epoch": 1.97, + "grad_norm": 0.0671396255493164, + "learning_rate": 5.1562017491852387e-05, + "loss": 1.641, + "step": 1090 + }, + { + "epoch": 1.98, + "grad_norm": 0.06699879467487335, + "learning_rate": 5.139390351951997e-05, + "loss": 1.689, + "step": 1091 + }, + { + "epoch": 1.98, + "grad_norm": 0.06538563221693039, + "learning_rate": 5.122596922160768e-05, + "loss": 1.6552, + "step": 1092 + }, + { + "epoch": 1.98, + "grad_norm": 0.06701681017875671, + "learning_rate": 5.105821521889147e-05, + "loss": 1.6229, + "step": 1093 + }, + { + "epoch": 1.98, + "grad_norm": 0.06672403961420059, + "learning_rate": 5.089064213148082e-05, + "loss": 1.695, + "step": 1094 + }, + { + "epoch": 1.98, + "grad_norm": 0.06800191104412079, + "learning_rate": 5.0723250578816576e-05, + "loss": 1.6773, + "step": 1095 + }, + { + "epoch": 1.99, + "grad_norm": 0.066898874938488, + "learning_rate": 5.0556041179668354e-05, + "loss": 1.6562, + "step": 1096 + }, + { + "epoch": 1.99, + "eval_loss": 1.7255171537399292, + "eval_runtime": 76.5349, + "eval_samples_per_second": 65.33, + "eval_steps_per_second": 16.332, + "step": 1096 + } + ], + "logging_steps": 1, + "max_steps": 1644, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 548, + "total_flos": 3.2705098222896415e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}