diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,41321 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999152183128445, + "eval_steps": 500, + "global_step": 5897, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001695633743111488, + "grad_norm": 8.009743100188633, + "learning_rate": 1.1299435028248588e-07, + "loss": 1.6644, + "step": 1 + }, + { + "epoch": 0.0003391267486222976, + "grad_norm": 8.120057966565689, + "learning_rate": 2.2598870056497177e-07, + "loss": 1.654, + "step": 2 + }, + { + "epoch": 0.0005086901229334464, + "grad_norm": 9.015060821761788, + "learning_rate": 3.3898305084745766e-07, + "loss": 1.7052, + "step": 3 + }, + { + "epoch": 0.0006782534972445952, + "grad_norm": 8.497781543486818, + "learning_rate": 4.5197740112994353e-07, + "loss": 1.6626, + "step": 4 + }, + { + "epoch": 0.000847816871555744, + "grad_norm": 7.334467732528096, + "learning_rate": 5.649717514124295e-07, + "loss": 1.6283, + "step": 5 + }, + { + "epoch": 0.0010173802458668928, + "grad_norm": 7.352918044177938, + "learning_rate": 6.779661016949153e-07, + "loss": 1.6305, + "step": 6 + }, + { + "epoch": 0.0011869436201780415, + "grad_norm": 5.8656279822628425, + "learning_rate": 7.909604519774013e-07, + "loss": 1.5736, + "step": 7 + }, + { + "epoch": 0.0013565069944891904, + "grad_norm": 7.660884079016951, + "learning_rate": 9.039548022598871e-07, + "loss": 1.6539, + "step": 8 + }, + { + "epoch": 0.001526070368800339, + "grad_norm": 5.833344012009003, + "learning_rate": 1.016949152542373e-06, + "loss": 1.6351, + "step": 9 + }, + { + "epoch": 0.001695633743111488, + "grad_norm": 4.43504014266901, + "learning_rate": 1.129943502824859e-06, + "loss": 1.5749, + "step": 10 + }, + { + "epoch": 0.0018651971174226368, + "grad_norm": 6.305339473012175, + "learning_rate": 1.2429378531073449e-06, + "loss": 1.6133, + "step": 11 + }, + { + "epoch": 0.0020347604917337857, + "grad_norm": 5.444325833139642, + "learning_rate": 1.3559322033898307e-06, + "loss": 1.5851, + "step": 12 + }, + { + "epoch": 0.0022043238660449343, + "grad_norm": 5.920785386190525, + "learning_rate": 1.4689265536723166e-06, + "loss": 1.5609, + "step": 13 + }, + { + "epoch": 0.002373887240356083, + "grad_norm": 3.784131296672084, + "learning_rate": 1.5819209039548026e-06, + "loss": 1.4846, + "step": 14 + }, + { + "epoch": 0.002543450614667232, + "grad_norm": 5.108565414877564, + "learning_rate": 1.6949152542372882e-06, + "loss": 1.4518, + "step": 15 + }, + { + "epoch": 0.0027130139889783808, + "grad_norm": 4.659212542594555, + "learning_rate": 1.8079096045197741e-06, + "loss": 1.4776, + "step": 16 + }, + { + "epoch": 0.0028825773632895294, + "grad_norm": 4.500699647023348, + "learning_rate": 1.92090395480226e-06, + "loss": 1.4471, + "step": 17 + }, + { + "epoch": 0.003052140737600678, + "grad_norm": 4.359881151805078, + "learning_rate": 2.033898305084746e-06, + "loss": 1.4122, + "step": 18 + }, + { + "epoch": 0.003221704111911827, + "grad_norm": 2.9457532035155882, + "learning_rate": 2.146892655367232e-06, + "loss": 1.3402, + "step": 19 + }, + { + "epoch": 0.003391267486222976, + "grad_norm": 2.3289467248402285, + "learning_rate": 2.259887005649718e-06, + "loss": 1.3148, + "step": 20 + }, + { + "epoch": 0.0035608308605341245, + "grad_norm": 2.129911559733815, + "learning_rate": 2.372881355932204e-06, + "loss": 1.3119, + "step": 21 + }, + { + "epoch": 0.0037303942348452736, + "grad_norm": 1.9608178460181482, + "learning_rate": 2.4858757062146898e-06, + "loss": 1.3232, + "step": 22 + }, + { + "epoch": 0.0038999576091564223, + "grad_norm": 2.0375618273846356, + "learning_rate": 2.5988700564971753e-06, + "loss": 1.2713, + "step": 23 + }, + { + "epoch": 0.004069520983467571, + "grad_norm": 1.793874599018799, + "learning_rate": 2.7118644067796613e-06, + "loss": 1.288, + "step": 24 + }, + { + "epoch": 0.00423908435777872, + "grad_norm": 2.004063156040502, + "learning_rate": 2.8248587570621473e-06, + "loss": 1.2909, + "step": 25 + }, + { + "epoch": 0.004408647732089869, + "grad_norm": 1.9905323823337757, + "learning_rate": 2.9378531073446333e-06, + "loss": 1.3221, + "step": 26 + }, + { + "epoch": 0.004578211106401018, + "grad_norm": 2.005543582552709, + "learning_rate": 3.0508474576271192e-06, + "loss": 1.317, + "step": 27 + }, + { + "epoch": 0.004747774480712166, + "grad_norm": 1.8209655307101105, + "learning_rate": 3.163841807909605e-06, + "loss": 1.2674, + "step": 28 + }, + { + "epoch": 0.004917337855023315, + "grad_norm": 1.9415017758256143, + "learning_rate": 3.2768361581920903e-06, + "loss": 1.3413, + "step": 29 + }, + { + "epoch": 0.005086901229334464, + "grad_norm": 1.8110879769344066, + "learning_rate": 3.3898305084745763e-06, + "loss": 1.3151, + "step": 30 + }, + { + "epoch": 0.005256464603645612, + "grad_norm": 1.724928822547298, + "learning_rate": 3.5028248587570623e-06, + "loss": 1.2628, + "step": 31 + }, + { + "epoch": 0.0054260279779567615, + "grad_norm": 1.6661841508413606, + "learning_rate": 3.6158192090395483e-06, + "loss": 1.3117, + "step": 32 + }, + { + "epoch": 0.00559559135226791, + "grad_norm": 1.5534064831870749, + "learning_rate": 3.7288135593220342e-06, + "loss": 1.2836, + "step": 33 + }, + { + "epoch": 0.005765154726579059, + "grad_norm": 1.7046938776680567, + "learning_rate": 3.84180790960452e-06, + "loss": 1.285, + "step": 34 + }, + { + "epoch": 0.005934718100890208, + "grad_norm": 1.612846525855917, + "learning_rate": 3.954802259887006e-06, + "loss": 1.2809, + "step": 35 + }, + { + "epoch": 0.006104281475201356, + "grad_norm": 1.687964664224191, + "learning_rate": 4.067796610169492e-06, + "loss": 1.2731, + "step": 36 + }, + { + "epoch": 0.006273844849512505, + "grad_norm": 1.6687480915272355, + "learning_rate": 4.180790960451978e-06, + "loss": 1.2763, + "step": 37 + }, + { + "epoch": 0.006443408223823654, + "grad_norm": 1.4888760594468662, + "learning_rate": 4.293785310734464e-06, + "loss": 1.2137, + "step": 38 + }, + { + "epoch": 0.006612971598134803, + "grad_norm": 1.8803849291882477, + "learning_rate": 4.40677966101695e-06, + "loss": 0.9621, + "step": 39 + }, + { + "epoch": 0.006782534972445952, + "grad_norm": 1.4823709104222176, + "learning_rate": 4.519774011299436e-06, + "loss": 1.2617, + "step": 40 + }, + { + "epoch": 0.006952098346757101, + "grad_norm": 1.4914333671052835, + "learning_rate": 4.632768361581922e-06, + "loss": 1.2139, + "step": 41 + }, + { + "epoch": 0.007121661721068249, + "grad_norm": 1.3974249005302837, + "learning_rate": 4.745762711864408e-06, + "loss": 1.2326, + "step": 42 + }, + { + "epoch": 0.007291225095379398, + "grad_norm": 1.5303511649936188, + "learning_rate": 4.8587570621468936e-06, + "loss": 1.1972, + "step": 43 + }, + { + "epoch": 0.007460788469690547, + "grad_norm": 1.7150929659071426, + "learning_rate": 4.9717514124293796e-06, + "loss": 1.2443, + "step": 44 + }, + { + "epoch": 0.007630351844001695, + "grad_norm": 1.4264128386170565, + "learning_rate": 5.084745762711865e-06, + "loss": 1.2487, + "step": 45 + }, + { + "epoch": 0.0077999152183128445, + "grad_norm": 1.6364577845075876, + "learning_rate": 5.197740112994351e-06, + "loss": 1.2417, + "step": 46 + }, + { + "epoch": 0.007969478592623994, + "grad_norm": 1.4332553821939178, + "learning_rate": 5.310734463276837e-06, + "loss": 1.1973, + "step": 47 + }, + { + "epoch": 0.008139041966935143, + "grad_norm": 1.3592115062353742, + "learning_rate": 5.423728813559323e-06, + "loss": 1.1863, + "step": 48 + }, + { + "epoch": 0.00830860534124629, + "grad_norm": 1.3738333468550614, + "learning_rate": 5.536723163841809e-06, + "loss": 1.1696, + "step": 49 + }, + { + "epoch": 0.00847816871555744, + "grad_norm": 1.4768014324641212, + "learning_rate": 5.6497175141242946e-06, + "loss": 1.245, + "step": 50 + }, + { + "epoch": 0.008647732089868588, + "grad_norm": 1.4675179858171783, + "learning_rate": 5.7627118644067805e-06, + "loss": 1.1864, + "step": 51 + }, + { + "epoch": 0.008817295464179737, + "grad_norm": 1.4910780908907362, + "learning_rate": 5.8757062146892665e-06, + "loss": 1.2223, + "step": 52 + }, + { + "epoch": 0.008986858838490886, + "grad_norm": 1.472831051111739, + "learning_rate": 5.9887005649717525e-06, + "loss": 1.2381, + "step": 53 + }, + { + "epoch": 0.009156422212802036, + "grad_norm": 1.5872347424548234, + "learning_rate": 6.1016949152542385e-06, + "loss": 1.2539, + "step": 54 + }, + { + "epoch": 0.009325985587113183, + "grad_norm": 1.5061858194714863, + "learning_rate": 6.2146892655367244e-06, + "loss": 1.2216, + "step": 55 + }, + { + "epoch": 0.009495548961424332, + "grad_norm": 1.3283158228134124, + "learning_rate": 6.32768361581921e-06, + "loss": 1.1677, + "step": 56 + }, + { + "epoch": 0.009665112335735481, + "grad_norm": 1.4486033239394485, + "learning_rate": 6.440677966101695e-06, + "loss": 1.2341, + "step": 57 + }, + { + "epoch": 0.00983467571004663, + "grad_norm": 1.4316644916597505, + "learning_rate": 6.553672316384181e-06, + "loss": 1.2112, + "step": 58 + }, + { + "epoch": 0.01000423908435778, + "grad_norm": 1.3755463463393165, + "learning_rate": 6.666666666666667e-06, + "loss": 1.1726, + "step": 59 + }, + { + "epoch": 0.010173802458668928, + "grad_norm": 1.426073730303385, + "learning_rate": 6.779661016949153e-06, + "loss": 1.1728, + "step": 60 + }, + { + "epoch": 0.010343365832980076, + "grad_norm": 1.4224049111949, + "learning_rate": 6.892655367231639e-06, + "loss": 1.2123, + "step": 61 + }, + { + "epoch": 0.010512929207291225, + "grad_norm": 1.41522704459736, + "learning_rate": 7.0056497175141246e-06, + "loss": 1.1711, + "step": 62 + }, + { + "epoch": 0.010682492581602374, + "grad_norm": 1.3791845008388997, + "learning_rate": 7.1186440677966106e-06, + "loss": 1.1764, + "step": 63 + }, + { + "epoch": 0.010852055955913523, + "grad_norm": 1.4115609164022564, + "learning_rate": 7.2316384180790965e-06, + "loss": 1.1833, + "step": 64 + }, + { + "epoch": 0.011021619330224672, + "grad_norm": 1.49155460296025, + "learning_rate": 7.3446327683615825e-06, + "loss": 1.2249, + "step": 65 + }, + { + "epoch": 0.01119118270453582, + "grad_norm": 1.4550519804646345, + "learning_rate": 7.4576271186440685e-06, + "loss": 1.1723, + "step": 66 + }, + { + "epoch": 0.011360746078846969, + "grad_norm": 1.3630110124605177, + "learning_rate": 7.5706214689265545e-06, + "loss": 1.1441, + "step": 67 + }, + { + "epoch": 0.011530309453158118, + "grad_norm": 1.3421295802070998, + "learning_rate": 7.68361581920904e-06, + "loss": 1.1751, + "step": 68 + }, + { + "epoch": 0.011699872827469267, + "grad_norm": 1.4571892679954532, + "learning_rate": 7.796610169491526e-06, + "loss": 1.179, + "step": 69 + }, + { + "epoch": 0.011869436201780416, + "grad_norm": 1.5214067362659185, + "learning_rate": 7.909604519774012e-06, + "loss": 1.204, + "step": 70 + }, + { + "epoch": 0.012038999576091565, + "grad_norm": 1.4484248517716496, + "learning_rate": 8.022598870056498e-06, + "loss": 1.1954, + "step": 71 + }, + { + "epoch": 0.012208562950402712, + "grad_norm": 1.5414616013254128, + "learning_rate": 8.135593220338983e-06, + "loss": 1.1514, + "step": 72 + }, + { + "epoch": 0.012378126324713861, + "grad_norm": 1.5454130785927713, + "learning_rate": 8.248587570621469e-06, + "loss": 1.1606, + "step": 73 + }, + { + "epoch": 0.01254768969902501, + "grad_norm": 1.34697348323352, + "learning_rate": 8.361581920903955e-06, + "loss": 1.1995, + "step": 74 + }, + { + "epoch": 0.01271725307333616, + "grad_norm": 1.3774590095095847, + "learning_rate": 8.47457627118644e-06, + "loss": 1.2001, + "step": 75 + }, + { + "epoch": 0.012886816447647309, + "grad_norm": 1.3143471880877364, + "learning_rate": 8.587570621468927e-06, + "loss": 1.1454, + "step": 76 + }, + { + "epoch": 0.013056379821958458, + "grad_norm": 1.375366668851842, + "learning_rate": 8.700564971751413e-06, + "loss": 1.1438, + "step": 77 + }, + { + "epoch": 0.013225943196269605, + "grad_norm": 1.441944270436558, + "learning_rate": 8.8135593220339e-06, + "loss": 1.1993, + "step": 78 + }, + { + "epoch": 0.013395506570580754, + "grad_norm": 1.2946879133856595, + "learning_rate": 8.926553672316384e-06, + "loss": 1.1533, + "step": 79 + }, + { + "epoch": 0.013565069944891903, + "grad_norm": 1.4715500438889428, + "learning_rate": 9.039548022598871e-06, + "loss": 1.2228, + "step": 80 + }, + { + "epoch": 0.013734633319203052, + "grad_norm": 1.3443740933148, + "learning_rate": 9.152542372881356e-06, + "loss": 1.1242, + "step": 81 + }, + { + "epoch": 0.013904196693514202, + "grad_norm": 1.4263901354504167, + "learning_rate": 9.265536723163843e-06, + "loss": 1.1791, + "step": 82 + }, + { + "epoch": 0.014073760067825349, + "grad_norm": 1.382684270849743, + "learning_rate": 9.378531073446328e-06, + "loss": 1.186, + "step": 83 + }, + { + "epoch": 0.014243323442136498, + "grad_norm": 1.24687716463306, + "learning_rate": 9.491525423728815e-06, + "loss": 1.165, + "step": 84 + }, + { + "epoch": 0.014412886816447647, + "grad_norm": 1.4046061864477641, + "learning_rate": 9.6045197740113e-06, + "loss": 1.1556, + "step": 85 + }, + { + "epoch": 0.014582450190758796, + "grad_norm": 1.4201710759611585, + "learning_rate": 9.717514124293787e-06, + "loss": 1.2076, + "step": 86 + }, + { + "epoch": 0.014752013565069945, + "grad_norm": 1.4052705160068086, + "learning_rate": 9.830508474576272e-06, + "loss": 1.1808, + "step": 87 + }, + { + "epoch": 0.014921576939381094, + "grad_norm": 1.299029989336803, + "learning_rate": 9.943502824858759e-06, + "loss": 1.1539, + "step": 88 + }, + { + "epoch": 0.015091140313692242, + "grad_norm": 1.4569947609058904, + "learning_rate": 1.0056497175141244e-05, + "loss": 1.1797, + "step": 89 + }, + { + "epoch": 0.01526070368800339, + "grad_norm": 1.4143057551195373, + "learning_rate": 1.016949152542373e-05, + "loss": 1.1338, + "step": 90 + }, + { + "epoch": 0.01543026706231454, + "grad_norm": 1.3602375190007165, + "learning_rate": 1.0282485875706216e-05, + "loss": 1.1706, + "step": 91 + }, + { + "epoch": 0.015599830436625689, + "grad_norm": 1.5644532153484685, + "learning_rate": 1.0395480225988701e-05, + "loss": 1.2033, + "step": 92 + }, + { + "epoch": 0.015769393810936838, + "grad_norm": 1.3546546367249224, + "learning_rate": 1.0508474576271188e-05, + "loss": 1.1561, + "step": 93 + }, + { + "epoch": 0.015938957185247987, + "grad_norm": 1.3725561300593538, + "learning_rate": 1.0621468926553673e-05, + "loss": 1.1325, + "step": 94 + }, + { + "epoch": 0.016108520559559136, + "grad_norm": 1.388499463375127, + "learning_rate": 1.073446327683616e-05, + "loss": 1.1288, + "step": 95 + }, + { + "epoch": 0.016278083933870285, + "grad_norm": 1.4526522323723448, + "learning_rate": 1.0847457627118645e-05, + "loss": 1.1949, + "step": 96 + }, + { + "epoch": 0.016447647308181435, + "grad_norm": 1.3638352643302796, + "learning_rate": 1.096045197740113e-05, + "loss": 1.1829, + "step": 97 + }, + { + "epoch": 0.01661721068249258, + "grad_norm": 1.33892437598582, + "learning_rate": 1.1073446327683617e-05, + "loss": 1.1647, + "step": 98 + }, + { + "epoch": 0.01678677405680373, + "grad_norm": 1.3625780186000709, + "learning_rate": 1.1186440677966102e-05, + "loss": 1.1446, + "step": 99 + }, + { + "epoch": 0.01695633743111488, + "grad_norm": 1.4666896724472283, + "learning_rate": 1.1299435028248589e-05, + "loss": 1.1551, + "step": 100 + }, + { + "epoch": 0.017125900805426027, + "grad_norm": 1.418947637976201, + "learning_rate": 1.1412429378531074e-05, + "loss": 1.1432, + "step": 101 + }, + { + "epoch": 0.017295464179737177, + "grad_norm": 1.4582583979962045, + "learning_rate": 1.1525423728813561e-05, + "loss": 1.1372, + "step": 102 + }, + { + "epoch": 0.017465027554048326, + "grad_norm": 1.5049126363000913, + "learning_rate": 1.1638418079096046e-05, + "loss": 1.1349, + "step": 103 + }, + { + "epoch": 0.017634590928359475, + "grad_norm": 1.3732226340301854, + "learning_rate": 1.1751412429378533e-05, + "loss": 1.0922, + "step": 104 + }, + { + "epoch": 0.017804154302670624, + "grad_norm": 1.3539434227531597, + "learning_rate": 1.1864406779661018e-05, + "loss": 1.1458, + "step": 105 + }, + { + "epoch": 0.017973717676981773, + "grad_norm": 1.4197267694015603, + "learning_rate": 1.1977401129943505e-05, + "loss": 1.1086, + "step": 106 + }, + { + "epoch": 0.018143281051292922, + "grad_norm": 1.438729038046718, + "learning_rate": 1.209039548022599e-05, + "loss": 1.0988, + "step": 107 + }, + { + "epoch": 0.01831284442560407, + "grad_norm": 1.633758894428501, + "learning_rate": 1.2203389830508477e-05, + "loss": 1.1674, + "step": 108 + }, + { + "epoch": 0.018482407799915217, + "grad_norm": 1.357554823293788, + "learning_rate": 1.2316384180790962e-05, + "loss": 1.1606, + "step": 109 + }, + { + "epoch": 0.018651971174226366, + "grad_norm": 1.618537712329157, + "learning_rate": 1.2429378531073449e-05, + "loss": 1.1679, + "step": 110 + }, + { + "epoch": 0.018821534548537515, + "grad_norm": 1.474783316466239, + "learning_rate": 1.2542372881355932e-05, + "loss": 1.1391, + "step": 111 + }, + { + "epoch": 0.018991097922848664, + "grad_norm": 1.4025286069410725, + "learning_rate": 1.265536723163842e-05, + "loss": 1.1741, + "step": 112 + }, + { + "epoch": 0.019160661297159813, + "grad_norm": 1.4202582137753894, + "learning_rate": 1.2768361581920904e-05, + "loss": 1.1488, + "step": 113 + }, + { + "epoch": 0.019330224671470962, + "grad_norm": 1.3729347919531196, + "learning_rate": 1.288135593220339e-05, + "loss": 1.1339, + "step": 114 + }, + { + "epoch": 0.01949978804578211, + "grad_norm": 1.4684487410646836, + "learning_rate": 1.2994350282485876e-05, + "loss": 1.1968, + "step": 115 + }, + { + "epoch": 0.01966935142009326, + "grad_norm": 1.389502113861768, + "learning_rate": 1.3107344632768361e-05, + "loss": 1.1839, + "step": 116 + }, + { + "epoch": 0.01983891479440441, + "grad_norm": 1.4360527100049258, + "learning_rate": 1.3220338983050848e-05, + "loss": 1.1367, + "step": 117 + }, + { + "epoch": 0.02000847816871556, + "grad_norm": 1.418750605257581, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.1746, + "step": 118 + }, + { + "epoch": 0.020178041543026708, + "grad_norm": 1.549593645625235, + "learning_rate": 1.344632768361582e-05, + "loss": 1.2052, + "step": 119 + }, + { + "epoch": 0.020347604917337857, + "grad_norm": 1.439169982956848, + "learning_rate": 1.3559322033898305e-05, + "loss": 1.1231, + "step": 120 + }, + { + "epoch": 0.020517168291649002, + "grad_norm": 1.5599134557216332, + "learning_rate": 1.3672316384180792e-05, + "loss": 1.1337, + "step": 121 + }, + { + "epoch": 0.02068673166596015, + "grad_norm": 1.3808742250679915, + "learning_rate": 1.3785310734463277e-05, + "loss": 1.1385, + "step": 122 + }, + { + "epoch": 0.0208562950402713, + "grad_norm": 1.4567621392474348, + "learning_rate": 1.3898305084745764e-05, + "loss": 1.1281, + "step": 123 + }, + { + "epoch": 0.02102585841458245, + "grad_norm": 1.4210796371692003, + "learning_rate": 1.4011299435028249e-05, + "loss": 1.1479, + "step": 124 + }, + { + "epoch": 0.0211954217888936, + "grad_norm": 1.2766383464713438, + "learning_rate": 1.4124293785310736e-05, + "loss": 1.1319, + "step": 125 + }, + { + "epoch": 0.021364985163204748, + "grad_norm": 1.4041737686093716, + "learning_rate": 1.4237288135593221e-05, + "loss": 1.1349, + "step": 126 + }, + { + "epoch": 0.021534548537515897, + "grad_norm": 1.4698887894871435, + "learning_rate": 1.4350282485875708e-05, + "loss": 1.1594, + "step": 127 + }, + { + "epoch": 0.021704111911827046, + "grad_norm": 1.2737750466404523, + "learning_rate": 1.4463276836158193e-05, + "loss": 1.0648, + "step": 128 + }, + { + "epoch": 0.021873675286138195, + "grad_norm": 1.4613251955693345, + "learning_rate": 1.4576271186440678e-05, + "loss": 1.1833, + "step": 129 + }, + { + "epoch": 0.022043238660449344, + "grad_norm": 1.358530679449396, + "learning_rate": 1.4689265536723165e-05, + "loss": 1.0942, + "step": 130 + }, + { + "epoch": 0.022212802034760493, + "grad_norm": 1.4911629429567874, + "learning_rate": 1.480225988700565e-05, + "loss": 1.1287, + "step": 131 + }, + { + "epoch": 0.02238236540907164, + "grad_norm": 1.3954468734206023, + "learning_rate": 1.4915254237288137e-05, + "loss": 1.1282, + "step": 132 + }, + { + "epoch": 0.022551928783382788, + "grad_norm": 1.362654148285321, + "learning_rate": 1.5028248587570622e-05, + "loss": 1.1385, + "step": 133 + }, + { + "epoch": 0.022721492157693937, + "grad_norm": 1.3510183744777187, + "learning_rate": 1.5141242937853109e-05, + "loss": 1.1401, + "step": 134 + }, + { + "epoch": 0.022891055532005086, + "grad_norm": 1.3443832259921455, + "learning_rate": 1.5254237288135594e-05, + "loss": 1.1301, + "step": 135 + }, + { + "epoch": 0.023060618906316235, + "grad_norm": 1.4729900026203113, + "learning_rate": 1.536723163841808e-05, + "loss": 1.1572, + "step": 136 + }, + { + "epoch": 0.023230182280627384, + "grad_norm": 1.5192378566670621, + "learning_rate": 1.5480225988700566e-05, + "loss": 1.1121, + "step": 137 + }, + { + "epoch": 0.023399745654938534, + "grad_norm": 1.3371938427826509, + "learning_rate": 1.5593220338983053e-05, + "loss": 1.1023, + "step": 138 + }, + { + "epoch": 0.023569309029249683, + "grad_norm": 1.3506912312259471, + "learning_rate": 1.5706214689265536e-05, + "loss": 1.1164, + "step": 139 + }, + { + "epoch": 0.02373887240356083, + "grad_norm": 1.4136938287610317, + "learning_rate": 1.5819209039548023e-05, + "loss": 1.097, + "step": 140 + }, + { + "epoch": 0.02390843577787198, + "grad_norm": 1.488625382024975, + "learning_rate": 1.593220338983051e-05, + "loss": 1.145, + "step": 141 + }, + { + "epoch": 0.02407799915218313, + "grad_norm": 1.3004109068927645, + "learning_rate": 1.6045197740112997e-05, + "loss": 1.1451, + "step": 142 + }, + { + "epoch": 0.024247562526494276, + "grad_norm": 1.576383048933645, + "learning_rate": 1.615819209039548e-05, + "loss": 1.1522, + "step": 143 + }, + { + "epoch": 0.024417125900805425, + "grad_norm": 1.5103087194461904, + "learning_rate": 1.6271186440677967e-05, + "loss": 1.0823, + "step": 144 + }, + { + "epoch": 0.024586689275116574, + "grad_norm": 1.3086214235359925, + "learning_rate": 1.6384180790960454e-05, + "loss": 1.0841, + "step": 145 + }, + { + "epoch": 0.024756252649427723, + "grad_norm": 1.5049556210751096, + "learning_rate": 1.6497175141242937e-05, + "loss": 1.1514, + "step": 146 + }, + { + "epoch": 0.024925816023738872, + "grad_norm": 1.5829729386310978, + "learning_rate": 1.6610169491525424e-05, + "loss": 1.1277, + "step": 147 + }, + { + "epoch": 0.02509537939805002, + "grad_norm": 1.308951750399878, + "learning_rate": 1.672316384180791e-05, + "loss": 1.0862, + "step": 148 + }, + { + "epoch": 0.02526494277236117, + "grad_norm": 1.431271306611669, + "learning_rate": 1.6836158192090398e-05, + "loss": 1.1225, + "step": 149 + }, + { + "epoch": 0.02543450614667232, + "grad_norm": 1.43200498985713, + "learning_rate": 1.694915254237288e-05, + "loss": 1.1142, + "step": 150 + }, + { + "epoch": 0.02560406952098347, + "grad_norm": 1.3977923715266523, + "learning_rate": 1.7062146892655368e-05, + "loss": 1.0992, + "step": 151 + }, + { + "epoch": 0.025773632895294617, + "grad_norm": 1.316473925950156, + "learning_rate": 1.7175141242937855e-05, + "loss": 1.1106, + "step": 152 + }, + { + "epoch": 0.025943196269605766, + "grad_norm": 1.2750052619424626, + "learning_rate": 1.728813559322034e-05, + "loss": 1.0955, + "step": 153 + }, + { + "epoch": 0.026112759643916916, + "grad_norm": 1.355255064360889, + "learning_rate": 1.7401129943502825e-05, + "loss": 1.0781, + "step": 154 + }, + { + "epoch": 0.02628232301822806, + "grad_norm": 1.5199288561924749, + "learning_rate": 1.7514124293785312e-05, + "loss": 1.1173, + "step": 155 + }, + { + "epoch": 0.02645188639253921, + "grad_norm": 1.5301032064436257, + "learning_rate": 1.76271186440678e-05, + "loss": 1.1632, + "step": 156 + }, + { + "epoch": 0.02662144976685036, + "grad_norm": 1.351533994824582, + "learning_rate": 1.7740112994350286e-05, + "loss": 1.1359, + "step": 157 + }, + { + "epoch": 0.02679101314116151, + "grad_norm": 1.334637742315722, + "learning_rate": 1.785310734463277e-05, + "loss": 1.1421, + "step": 158 + }, + { + "epoch": 0.026960576515472658, + "grad_norm": 1.5032188427159412, + "learning_rate": 1.7966101694915256e-05, + "loss": 1.0946, + "step": 159 + }, + { + "epoch": 0.027130139889783807, + "grad_norm": 1.4557110103401063, + "learning_rate": 1.8079096045197743e-05, + "loss": 1.1207, + "step": 160 + }, + { + "epoch": 0.027299703264094956, + "grad_norm": 1.3679392208202088, + "learning_rate": 1.8192090395480226e-05, + "loss": 1.0915, + "step": 161 + }, + { + "epoch": 0.027469266638406105, + "grad_norm": 1.3152214063562153, + "learning_rate": 1.8305084745762713e-05, + "loss": 1.1025, + "step": 162 + }, + { + "epoch": 0.027638830012717254, + "grad_norm": 1.4785040033145542, + "learning_rate": 1.84180790960452e-05, + "loss": 1.1513, + "step": 163 + }, + { + "epoch": 0.027808393387028403, + "grad_norm": 1.6818977355061948, + "learning_rate": 1.8531073446327686e-05, + "loss": 1.1426, + "step": 164 + }, + { + "epoch": 0.027977956761339552, + "grad_norm": 1.2546223970138515, + "learning_rate": 1.864406779661017e-05, + "loss": 1.0774, + "step": 165 + }, + { + "epoch": 0.028147520135650698, + "grad_norm": 1.3944074900148387, + "learning_rate": 1.8757062146892657e-05, + "loss": 1.1694, + "step": 166 + }, + { + "epoch": 0.028317083509961847, + "grad_norm": 1.3946877861404816, + "learning_rate": 1.8870056497175144e-05, + "loss": 1.1154, + "step": 167 + }, + { + "epoch": 0.028486646884272996, + "grad_norm": 1.4421720578692507, + "learning_rate": 1.898305084745763e-05, + "loss": 1.1094, + "step": 168 + }, + { + "epoch": 0.028656210258584145, + "grad_norm": 1.4552809730187664, + "learning_rate": 1.9096045197740114e-05, + "loss": 1.1056, + "step": 169 + }, + { + "epoch": 0.028825773632895294, + "grad_norm": 1.379880202176955, + "learning_rate": 1.92090395480226e-05, + "loss": 1.1584, + "step": 170 + }, + { + "epoch": 0.028995337007206443, + "grad_norm": 1.363145032946682, + "learning_rate": 1.9322033898305087e-05, + "loss": 1.1336, + "step": 171 + }, + { + "epoch": 0.029164900381517592, + "grad_norm": 1.6235858187409438, + "learning_rate": 1.9435028248587574e-05, + "loss": 1.1535, + "step": 172 + }, + { + "epoch": 0.02933446375582874, + "grad_norm": 1.3155078338219928, + "learning_rate": 1.9548022598870058e-05, + "loss": 1.1423, + "step": 173 + }, + { + "epoch": 0.02950402713013989, + "grad_norm": 1.296301744703282, + "learning_rate": 1.9661016949152545e-05, + "loss": 1.1682, + "step": 174 + }, + { + "epoch": 0.02967359050445104, + "grad_norm": 1.263899877948669, + "learning_rate": 1.977401129943503e-05, + "loss": 1.1183, + "step": 175 + }, + { + "epoch": 0.02984315387876219, + "grad_norm": 1.3585075935665827, + "learning_rate": 1.9887005649717518e-05, + "loss": 1.1356, + "step": 176 + }, + { + "epoch": 0.030012717253073338, + "grad_norm": 1.311067313289672, + "learning_rate": 2e-05, + "loss": 1.1398, + "step": 177 + }, + { + "epoch": 0.030182280627384483, + "grad_norm": 1.290727195525984, + "learning_rate": 1.9999998491734904e-05, + "loss": 1.0964, + "step": 178 + }, + { + "epoch": 0.030351844001695633, + "grad_norm": 1.3036669420154081, + "learning_rate": 1.9999993966940065e-05, + "loss": 1.121, + "step": 179 + }, + { + "epoch": 0.03052140737600678, + "grad_norm": 1.3479945804203648, + "learning_rate": 1.9999986425616854e-05, + "loss": 1.1593, + "step": 180 + }, + { + "epoch": 0.03069097075031793, + "grad_norm": 1.3390700969940623, + "learning_rate": 1.999997586776754e-05, + "loss": 1.1012, + "step": 181 + }, + { + "epoch": 0.03086053412462908, + "grad_norm": 1.3195809160200416, + "learning_rate": 1.9999962293395314e-05, + "loss": 1.1259, + "step": 182 + }, + { + "epoch": 0.03103009749894023, + "grad_norm": 1.4471383785189624, + "learning_rate": 1.9999945702504266e-05, + "loss": 1.1562, + "step": 183 + }, + { + "epoch": 0.031199660873251378, + "grad_norm": 1.2950706341674452, + "learning_rate": 1.99999260950994e-05, + "loss": 1.1021, + "step": 184 + }, + { + "epoch": 0.03136922424756253, + "grad_norm": 1.4357046922256813, + "learning_rate": 1.9999903471186634e-05, + "loss": 1.1355, + "step": 185 + }, + { + "epoch": 0.031538787621873676, + "grad_norm": 1.4081553660606614, + "learning_rate": 1.9999877830772793e-05, + "loss": 1.125, + "step": 186 + }, + { + "epoch": 0.031708350996184825, + "grad_norm": 1.4040886421642182, + "learning_rate": 1.9999849173865607e-05, + "loss": 1.1268, + "step": 187 + }, + { + "epoch": 0.031877914370495974, + "grad_norm": 1.5596826277587976, + "learning_rate": 1.9999817500473724e-05, + "loss": 1.1582, + "step": 188 + }, + { + "epoch": 0.032047477744807124, + "grad_norm": 1.4462905902458718, + "learning_rate": 1.9999782810606697e-05, + "loss": 1.1256, + "step": 189 + }, + { + "epoch": 0.03221704111911827, + "grad_norm": 1.3473764607427723, + "learning_rate": 1.9999745104274995e-05, + "loss": 1.1082, + "step": 190 + }, + { + "epoch": 0.03238660449342942, + "grad_norm": 1.3307164474361097, + "learning_rate": 1.9999704381489984e-05, + "loss": 1.0869, + "step": 191 + }, + { + "epoch": 0.03255616786774057, + "grad_norm": 1.45061392035038, + "learning_rate": 1.999966064226395e-05, + "loss": 1.1251, + "step": 192 + }, + { + "epoch": 0.03272573124205172, + "grad_norm": 1.3321117845551607, + "learning_rate": 1.9999613886610097e-05, + "loss": 1.117, + "step": 193 + }, + { + "epoch": 0.03289529461636287, + "grad_norm": 1.2742633797714435, + "learning_rate": 1.9999564114542516e-05, + "loss": 1.1095, + "step": 194 + }, + { + "epoch": 0.03306485799067401, + "grad_norm": 1.3759506593588158, + "learning_rate": 1.9999511326076227e-05, + "loss": 1.1472, + "step": 195 + }, + { + "epoch": 0.03323442136498516, + "grad_norm": 1.291439057363082, + "learning_rate": 1.9999455521227153e-05, + "loss": 1.0904, + "step": 196 + }, + { + "epoch": 0.03340398473929631, + "grad_norm": 1.3809527670799788, + "learning_rate": 1.9999396700012127e-05, + "loss": 1.1363, + "step": 197 + }, + { + "epoch": 0.03357354811360746, + "grad_norm": 1.2995526218260776, + "learning_rate": 1.9999334862448896e-05, + "loss": 1.1545, + "step": 198 + }, + { + "epoch": 0.03374311148791861, + "grad_norm": 1.2478798571680139, + "learning_rate": 1.9999270008556108e-05, + "loss": 1.1001, + "step": 199 + }, + { + "epoch": 0.03391267486222976, + "grad_norm": 1.2470480556423995, + "learning_rate": 1.999920213835333e-05, + "loss": 1.106, + "step": 200 + }, + { + "epoch": 0.034082238236540906, + "grad_norm": 1.2102661616659502, + "learning_rate": 1.9999131251861037e-05, + "loss": 1.1276, + "step": 201 + }, + { + "epoch": 0.034251801610852055, + "grad_norm": 1.2849980135270684, + "learning_rate": 1.9999057349100606e-05, + "loss": 1.087, + "step": 202 + }, + { + "epoch": 0.034421364985163204, + "grad_norm": 1.2262571763597132, + "learning_rate": 1.9998980430094333e-05, + "loss": 1.123, + "step": 203 + }, + { + "epoch": 0.03459092835947435, + "grad_norm": 1.3759934172355304, + "learning_rate": 1.9998900494865426e-05, + "loss": 1.1903, + "step": 204 + }, + { + "epoch": 0.0347604917337855, + "grad_norm": 1.2906012627920067, + "learning_rate": 1.999881754343799e-05, + "loss": 1.0835, + "step": 205 + }, + { + "epoch": 0.03493005510809665, + "grad_norm": 1.2864361841867833, + "learning_rate": 1.999873157583705e-05, + "loss": 1.1233, + "step": 206 + }, + { + "epoch": 0.0350996184824078, + "grad_norm": 1.422919504276044, + "learning_rate": 1.9998642592088543e-05, + "loss": 1.1504, + "step": 207 + }, + { + "epoch": 0.03526918185671895, + "grad_norm": 1.3010651375525106, + "learning_rate": 1.9998550592219303e-05, + "loss": 1.103, + "step": 208 + }, + { + "epoch": 0.0354387452310301, + "grad_norm": 1.2773819298360978, + "learning_rate": 1.999845557625709e-05, + "loss": 1.1236, + "step": 209 + }, + { + "epoch": 0.03560830860534125, + "grad_norm": 1.3252117783869803, + "learning_rate": 1.9998357544230558e-05, + "loss": 1.0974, + "step": 210 + }, + { + "epoch": 0.0357778719796524, + "grad_norm": 1.3167233220674777, + "learning_rate": 1.9998256496169282e-05, + "loss": 1.1693, + "step": 211 + }, + { + "epoch": 0.035947435353963546, + "grad_norm": 1.303191899641572, + "learning_rate": 1.999815243210375e-05, + "loss": 1.1376, + "step": 212 + }, + { + "epoch": 0.036116998728274695, + "grad_norm": 1.1944527257891109, + "learning_rate": 1.9998045352065342e-05, + "loss": 1.0969, + "step": 213 + }, + { + "epoch": 0.036286562102585844, + "grad_norm": 1.3268931468824336, + "learning_rate": 1.9997935256086367e-05, + "loss": 1.0888, + "step": 214 + }, + { + "epoch": 0.03645612547689699, + "grad_norm": 1.3839432423572338, + "learning_rate": 1.9997822144200035e-05, + "loss": 1.1582, + "step": 215 + }, + { + "epoch": 0.03662568885120814, + "grad_norm": 1.258912414155463, + "learning_rate": 1.9997706016440462e-05, + "loss": 1.118, + "step": 216 + }, + { + "epoch": 0.03679525222551929, + "grad_norm": 1.228358322020127, + "learning_rate": 1.9997586872842683e-05, + "loss": 1.1222, + "step": 217 + }, + { + "epoch": 0.03696481559983043, + "grad_norm": 1.25841793688745, + "learning_rate": 1.9997464713442632e-05, + "loss": 1.0823, + "step": 218 + }, + { + "epoch": 0.03713437897414158, + "grad_norm": 1.3041590051483232, + "learning_rate": 1.999733953827717e-05, + "loss": 1.0758, + "step": 219 + }, + { + "epoch": 0.03730394234845273, + "grad_norm": 1.2505716257886845, + "learning_rate": 1.9997211347384043e-05, + "loss": 1.0731, + "step": 220 + }, + { + "epoch": 0.03747350572276388, + "grad_norm": 1.3359828294697025, + "learning_rate": 1.9997080140801932e-05, + "loss": 1.1173, + "step": 221 + }, + { + "epoch": 0.03764306909707503, + "grad_norm": 1.3192749701037614, + "learning_rate": 1.9996945918570407e-05, + "loss": 1.0975, + "step": 222 + }, + { + "epoch": 0.03781263247138618, + "grad_norm": 1.2953997261408543, + "learning_rate": 1.999680868072996e-05, + "loss": 1.0848, + "step": 223 + }, + { + "epoch": 0.03798219584569733, + "grad_norm": 1.348437522113642, + "learning_rate": 1.999666842732199e-05, + "loss": 1.1355, + "step": 224 + }, + { + "epoch": 0.03815175922000848, + "grad_norm": 1.2739818156470117, + "learning_rate": 1.9996525158388804e-05, + "loss": 1.1777, + "step": 225 + }, + { + "epoch": 0.038321322594319626, + "grad_norm": 1.2926217899190098, + "learning_rate": 1.999637887397362e-05, + "loss": 1.0542, + "step": 226 + }, + { + "epoch": 0.038490885968630775, + "grad_norm": 1.2651121705688904, + "learning_rate": 1.9996229574120564e-05, + "loss": 1.1221, + "step": 227 + }, + { + "epoch": 0.038660449342941924, + "grad_norm": 1.3181402213368112, + "learning_rate": 1.9996077258874672e-05, + "loss": 1.139, + "step": 228 + }, + { + "epoch": 0.03883001271725307, + "grad_norm": 1.2378616508446028, + "learning_rate": 1.9995921928281893e-05, + "loss": 1.1037, + "step": 229 + }, + { + "epoch": 0.03899957609156422, + "grad_norm": 1.2611317412647842, + "learning_rate": 1.999576358238908e-05, + "loss": 1.0932, + "step": 230 + }, + { + "epoch": 0.03916913946587537, + "grad_norm": 1.193033326336458, + "learning_rate": 1.9995602221244007e-05, + "loss": 1.1087, + "step": 231 + }, + { + "epoch": 0.03933870284018652, + "grad_norm": 1.2303651899327985, + "learning_rate": 1.9995437844895337e-05, + "loss": 1.0697, + "step": 232 + }, + { + "epoch": 0.03950826621449767, + "grad_norm": 1.3653670662742932, + "learning_rate": 1.999527045339266e-05, + "loss": 1.0869, + "step": 233 + }, + { + "epoch": 0.03967782958880882, + "grad_norm": 1.32989695880418, + "learning_rate": 1.999510004678647e-05, + "loss": 1.1396, + "step": 234 + }, + { + "epoch": 0.03984739296311997, + "grad_norm": 1.3648126580032514, + "learning_rate": 1.999492662512817e-05, + "loss": 1.1402, + "step": 235 + }, + { + "epoch": 0.04001695633743112, + "grad_norm": 1.3643161893927116, + "learning_rate": 1.9994750188470076e-05, + "loss": 1.1381, + "step": 236 + }, + { + "epoch": 0.040186519711742266, + "grad_norm": 1.3115065822462593, + "learning_rate": 1.9994570736865407e-05, + "loss": 1.1496, + "step": 237 + }, + { + "epoch": 0.040356083086053415, + "grad_norm": 1.293568914755473, + "learning_rate": 1.99943882703683e-05, + "loss": 1.1124, + "step": 238 + }, + { + "epoch": 0.040525646460364564, + "grad_norm": 1.3105142284743228, + "learning_rate": 1.9994202789033787e-05, + "loss": 1.144, + "step": 239 + }, + { + "epoch": 0.040695209834675714, + "grad_norm": 1.3437109322210967, + "learning_rate": 1.999401429291783e-05, + "loss": 1.1265, + "step": 240 + }, + { + "epoch": 0.040864773208986856, + "grad_norm": 1.2110366779560244, + "learning_rate": 1.9993822782077282e-05, + "loss": 1.1233, + "step": 241 + }, + { + "epoch": 0.041034336583298005, + "grad_norm": 1.2498208989585575, + "learning_rate": 1.999362825656992e-05, + "loss": 1.1317, + "step": 242 + }, + { + "epoch": 0.041203899957609154, + "grad_norm": 1.2945037951769887, + "learning_rate": 1.9993430716454415e-05, + "loss": 1.074, + "step": 243 + }, + { + "epoch": 0.0413734633319203, + "grad_norm": 1.2917078153507724, + "learning_rate": 1.999323016179036e-05, + "loss": 1.1051, + "step": 244 + }, + { + "epoch": 0.04154302670623145, + "grad_norm": 1.3256353982242184, + "learning_rate": 1.999302659263825e-05, + "loss": 1.1196, + "step": 245 + }, + { + "epoch": 0.0417125900805426, + "grad_norm": 1.3042273541322487, + "learning_rate": 1.9992820009059496e-05, + "loss": 1.0862, + "step": 246 + }, + { + "epoch": 0.04188215345485375, + "grad_norm": 1.232714465260533, + "learning_rate": 1.9992610411116416e-05, + "loss": 1.1242, + "step": 247 + }, + { + "epoch": 0.0420517168291649, + "grad_norm": 1.274140585702574, + "learning_rate": 1.9992397798872233e-05, + "loss": 1.0831, + "step": 248 + }, + { + "epoch": 0.04222128020347605, + "grad_norm": 1.3210616986271977, + "learning_rate": 1.999218217239108e-05, + "loss": 1.1094, + "step": 249 + }, + { + "epoch": 0.0423908435777872, + "grad_norm": 1.318150501739465, + "learning_rate": 1.9991963531738e-05, + "loss": 1.1192, + "step": 250 + }, + { + "epoch": 0.04256040695209835, + "grad_norm": 1.3431518541447707, + "learning_rate": 1.9991741876978953e-05, + "loss": 1.1231, + "step": 251 + }, + { + "epoch": 0.042729970326409496, + "grad_norm": 1.2834723290563341, + "learning_rate": 1.99915172081808e-05, + "loss": 1.1467, + "step": 252 + }, + { + "epoch": 0.042899533700720645, + "grad_norm": 1.233197047980099, + "learning_rate": 1.9991289525411308e-05, + "loss": 1.0816, + "step": 253 + }, + { + "epoch": 0.043069097075031794, + "grad_norm": 1.4200243646148094, + "learning_rate": 1.9991058828739164e-05, + "loss": 1.1204, + "step": 254 + }, + { + "epoch": 0.04323866044934294, + "grad_norm": 1.3172495315828985, + "learning_rate": 1.9990825118233958e-05, + "loss": 1.0725, + "step": 255 + }, + { + "epoch": 0.04340822382365409, + "grad_norm": 1.3090225638033592, + "learning_rate": 1.999058839396619e-05, + "loss": 1.1622, + "step": 256 + }, + { + "epoch": 0.04357778719796524, + "grad_norm": 1.2377323447279194, + "learning_rate": 1.999034865600726e-05, + "loss": 1.075, + "step": 257 + }, + { + "epoch": 0.04374735057227639, + "grad_norm": 1.4027732073731827, + "learning_rate": 1.9990105904429496e-05, + "loss": 1.0977, + "step": 258 + }, + { + "epoch": 0.04391691394658754, + "grad_norm": 1.2538009108391588, + "learning_rate": 1.9989860139306122e-05, + "loss": 1.1074, + "step": 259 + }, + { + "epoch": 0.04408647732089869, + "grad_norm": 1.2678297263590095, + "learning_rate": 1.998961136071127e-05, + "loss": 1.1103, + "step": 260 + }, + { + "epoch": 0.04425604069520984, + "grad_norm": 1.33640403897779, + "learning_rate": 1.9989359568719988e-05, + "loss": 1.0847, + "step": 261 + }, + { + "epoch": 0.04442560406952099, + "grad_norm": 1.2378980641503972, + "learning_rate": 1.998910476340823e-05, + "loss": 1.1212, + "step": 262 + }, + { + "epoch": 0.04459516744383213, + "grad_norm": 1.3400055287563686, + "learning_rate": 1.998884694485286e-05, + "loss": 1.1497, + "step": 263 + }, + { + "epoch": 0.04476473081814328, + "grad_norm": 1.2067041306244417, + "learning_rate": 1.9988586113131644e-05, + "loss": 1.0468, + "step": 264 + }, + { + "epoch": 0.04493429419245443, + "grad_norm": 1.2324082792925706, + "learning_rate": 1.998832226832327e-05, + "loss": 1.1453, + "step": 265 + }, + { + "epoch": 0.045103857566765576, + "grad_norm": 1.4597384904677964, + "learning_rate": 1.9988055410507318e-05, + "loss": 0.9068, + "step": 266 + }, + { + "epoch": 0.045273420941076725, + "grad_norm": 1.2517477406537323, + "learning_rate": 1.99877855397643e-05, + "loss": 1.0359, + "step": 267 + }, + { + "epoch": 0.045442984315387874, + "grad_norm": 1.4215378641837717, + "learning_rate": 1.9987512656175612e-05, + "loss": 1.1356, + "step": 268 + }, + { + "epoch": 0.04561254768969902, + "grad_norm": 1.2567936902932118, + "learning_rate": 1.9987236759823573e-05, + "loss": 1.1055, + "step": 269 + }, + { + "epoch": 0.04578211106401017, + "grad_norm": 1.351280629298554, + "learning_rate": 1.998695785079141e-05, + "loss": 1.1201, + "step": 270 + }, + { + "epoch": 0.04595167443832132, + "grad_norm": 0.919807199887456, + "learning_rate": 1.998667592916326e-05, + "loss": 0.9956, + "step": 271 + }, + { + "epoch": 0.04612123781263247, + "grad_norm": 1.4444332019381603, + "learning_rate": 1.9986390995024157e-05, + "loss": 1.0809, + "step": 272 + }, + { + "epoch": 0.04629080118694362, + "grad_norm": 0.7227797429961299, + "learning_rate": 1.9986103048460056e-05, + "loss": 0.8795, + "step": 273 + }, + { + "epoch": 0.04646036456125477, + "grad_norm": 1.3839871147489478, + "learning_rate": 1.998581208955782e-05, + "loss": 1.0353, + "step": 274 + }, + { + "epoch": 0.04662992793556592, + "grad_norm": 1.3734487260979455, + "learning_rate": 1.998551811840521e-05, + "loss": 1.1141, + "step": 275 + }, + { + "epoch": 0.04679949130987707, + "grad_norm": 1.4209810351806924, + "learning_rate": 1.9985221135090917e-05, + "loss": 1.1568, + "step": 276 + }, + { + "epoch": 0.046969054684188216, + "grad_norm": 1.298446091150691, + "learning_rate": 1.998492113970451e-05, + "loss": 1.0929, + "step": 277 + }, + { + "epoch": 0.047138618058499365, + "grad_norm": 1.3256210754654363, + "learning_rate": 1.99846181323365e-05, + "loss": 1.0858, + "step": 278 + }, + { + "epoch": 0.047308181432810514, + "grad_norm": 0.8731203119877494, + "learning_rate": 1.9984312113078276e-05, + "loss": 0.8564, + "step": 279 + }, + { + "epoch": 0.04747774480712166, + "grad_norm": 1.5216913951133884, + "learning_rate": 1.9984003082022153e-05, + "loss": 1.1074, + "step": 280 + }, + { + "epoch": 0.04764730818143281, + "grad_norm": 1.4487908926953819, + "learning_rate": 1.9983691039261358e-05, + "loss": 1.1528, + "step": 281 + }, + { + "epoch": 0.04781687155574396, + "grad_norm": 1.3396040211574347, + "learning_rate": 1.9983375984890013e-05, + "loss": 1.0791, + "step": 282 + }, + { + "epoch": 0.04798643493005511, + "grad_norm": 1.3752837460679006, + "learning_rate": 1.9983057919003162e-05, + "loss": 1.0966, + "step": 283 + }, + { + "epoch": 0.04815599830436626, + "grad_norm": 1.4920780971869654, + "learning_rate": 1.998273684169674e-05, + "loss": 1.1034, + "step": 284 + }, + { + "epoch": 0.04832556167867741, + "grad_norm": 1.453554721673209, + "learning_rate": 1.998241275306761e-05, + "loss": 1.1418, + "step": 285 + }, + { + "epoch": 0.04849512505298855, + "grad_norm": 1.399795858996511, + "learning_rate": 1.9982085653213535e-05, + "loss": 1.0706, + "step": 286 + }, + { + "epoch": 0.0486646884272997, + "grad_norm": 1.2899224919414538, + "learning_rate": 1.9981755542233175e-05, + "loss": 1.0845, + "step": 287 + }, + { + "epoch": 0.04883425180161085, + "grad_norm": 1.4158557727233918, + "learning_rate": 1.9981422420226118e-05, + "loss": 1.1067, + "step": 288 + }, + { + "epoch": 0.049003815175922, + "grad_norm": 1.2799294452344585, + "learning_rate": 1.9981086287292853e-05, + "loss": 1.0695, + "step": 289 + }, + { + "epoch": 0.04917337855023315, + "grad_norm": 1.301157748670505, + "learning_rate": 1.998074714353477e-05, + "loss": 1.1089, + "step": 290 + }, + { + "epoch": 0.0493429419245443, + "grad_norm": 1.448619349235768, + "learning_rate": 1.9980404989054172e-05, + "loss": 1.1176, + "step": 291 + }, + { + "epoch": 0.049512505298855446, + "grad_norm": 1.1721849740370327, + "learning_rate": 1.9980059823954274e-05, + "loss": 1.1404, + "step": 292 + }, + { + "epoch": 0.049682068673166595, + "grad_norm": 1.298641860447234, + "learning_rate": 1.9979711648339192e-05, + "loss": 1.1077, + "step": 293 + }, + { + "epoch": 0.049851632047477744, + "grad_norm": 1.4437782843668467, + "learning_rate": 1.9979360462313965e-05, + "loss": 1.1086, + "step": 294 + }, + { + "epoch": 0.05002119542178889, + "grad_norm": 1.389790720831879, + "learning_rate": 1.9979006265984516e-05, + "loss": 1.107, + "step": 295 + }, + { + "epoch": 0.05019075879610004, + "grad_norm": 1.1748746125180785, + "learning_rate": 1.99786490594577e-05, + "loss": 1.0906, + "step": 296 + }, + { + "epoch": 0.05036032217041119, + "grad_norm": 1.2299969821770056, + "learning_rate": 1.997828884284126e-05, + "loss": 1.1117, + "step": 297 + }, + { + "epoch": 0.05052988554472234, + "grad_norm": 1.5034250884078735, + "learning_rate": 1.9977925616243865e-05, + "loss": 1.131, + "step": 298 + }, + { + "epoch": 0.05069944891903349, + "grad_norm": 1.2757184845012663, + "learning_rate": 1.9977559379775077e-05, + "loss": 1.1091, + "step": 299 + }, + { + "epoch": 0.05086901229334464, + "grad_norm": 1.1822834963848412, + "learning_rate": 1.9977190133545378e-05, + "loss": 1.0557, + "step": 300 + }, + { + "epoch": 0.05103857566765579, + "grad_norm": 1.208059478563468, + "learning_rate": 1.9976817877666143e-05, + "loss": 1.0831, + "step": 301 + }, + { + "epoch": 0.05120813904196694, + "grad_norm": 1.324640310424657, + "learning_rate": 1.9976442612249677e-05, + "loss": 1.0916, + "step": 302 + }, + { + "epoch": 0.051377702416278086, + "grad_norm": 1.308192313784638, + "learning_rate": 1.997606433740917e-05, + "loss": 1.1033, + "step": 303 + }, + { + "epoch": 0.051547265790589235, + "grad_norm": 1.2608508888907461, + "learning_rate": 1.9975683053258734e-05, + "loss": 1.1248, + "step": 304 + }, + { + "epoch": 0.051716829164900384, + "grad_norm": 1.2454374853169659, + "learning_rate": 1.9975298759913382e-05, + "loss": 1.158, + "step": 305 + }, + { + "epoch": 0.05188639253921153, + "grad_norm": 1.2495908625588392, + "learning_rate": 1.9974911457489038e-05, + "loss": 1.1099, + "step": 306 + }, + { + "epoch": 0.05205595591352268, + "grad_norm": 1.2045956229853965, + "learning_rate": 1.9974521146102535e-05, + "loss": 1.0886, + "step": 307 + }, + { + "epoch": 0.05222551928783383, + "grad_norm": 1.2316926551167495, + "learning_rate": 1.997412782587161e-05, + "loss": 1.1211, + "step": 308 + }, + { + "epoch": 0.05239508266214497, + "grad_norm": 1.239607517766241, + "learning_rate": 1.9973731496914914e-05, + "loss": 1.0752, + "step": 309 + }, + { + "epoch": 0.05256464603645612, + "grad_norm": 1.1464479852440976, + "learning_rate": 1.997333215935199e-05, + "loss": 1.0865, + "step": 310 + }, + { + "epoch": 0.05273420941076727, + "grad_norm": 1.3018269907301956, + "learning_rate": 1.997292981330331e-05, + "loss": 1.1136, + "step": 311 + }, + { + "epoch": 0.05290377278507842, + "grad_norm": 1.1685461379112567, + "learning_rate": 1.9972524458890238e-05, + "loss": 1.0699, + "step": 312 + }, + { + "epoch": 0.05307333615938957, + "grad_norm": 1.3125531381497202, + "learning_rate": 1.9972116096235047e-05, + "loss": 1.1312, + "step": 313 + }, + { + "epoch": 0.05324289953370072, + "grad_norm": 1.226818289966763, + "learning_rate": 1.997170472546093e-05, + "loss": 1.0853, + "step": 314 + }, + { + "epoch": 0.05341246290801187, + "grad_norm": 1.2230916127999136, + "learning_rate": 1.997129034669197e-05, + "loss": 1.1326, + "step": 315 + }, + { + "epoch": 0.05358202628232302, + "grad_norm": 1.3512152142512985, + "learning_rate": 1.997087296005317e-05, + "loss": 1.1461, + "step": 316 + }, + { + "epoch": 0.053751589656634166, + "grad_norm": 1.1440299956887134, + "learning_rate": 1.9970452565670432e-05, + "loss": 1.0678, + "step": 317 + }, + { + "epoch": 0.053921153030945315, + "grad_norm": 1.2169658630028495, + "learning_rate": 1.9970029163670573e-05, + "loss": 1.1072, + "step": 318 + }, + { + "epoch": 0.054090716405256464, + "grad_norm": 1.3080151660027735, + "learning_rate": 1.9969602754181316e-05, + "loss": 1.1072, + "step": 319 + }, + { + "epoch": 0.05426027977956761, + "grad_norm": 1.1625388269821455, + "learning_rate": 1.9969173337331283e-05, + "loss": 1.0435, + "step": 320 + }, + { + "epoch": 0.05442984315387876, + "grad_norm": 1.3244220672285003, + "learning_rate": 1.996874091325001e-05, + "loss": 1.1208, + "step": 321 + }, + { + "epoch": 0.05459940652818991, + "grad_norm": 1.3333021747321667, + "learning_rate": 1.996830548206794e-05, + "loss": 1.1382, + "step": 322 + }, + { + "epoch": 0.05476896990250106, + "grad_norm": 0.8435225189212632, + "learning_rate": 1.996786704391642e-05, + "loss": 0.7844, + "step": 323 + }, + { + "epoch": 0.05493853327681221, + "grad_norm": 1.393060292107635, + "learning_rate": 1.9967425598927713e-05, + "loss": 1.1377, + "step": 324 + }, + { + "epoch": 0.05510809665112336, + "grad_norm": 1.2256625255974176, + "learning_rate": 1.9966981147234975e-05, + "loss": 1.0511, + "step": 325 + }, + { + "epoch": 0.05527766002543451, + "grad_norm": 1.2184046483945477, + "learning_rate": 1.996653368897228e-05, + "loss": 1.1291, + "step": 326 + }, + { + "epoch": 0.05544722339974566, + "grad_norm": 1.2851047625914724, + "learning_rate": 1.99660832242746e-05, + "loss": 1.0623, + "step": 327 + }, + { + "epoch": 0.055616786774056806, + "grad_norm": 1.185571974883055, + "learning_rate": 1.9965629753277825e-05, + "loss": 1.1082, + "step": 328 + }, + { + "epoch": 0.055786350148367955, + "grad_norm": 1.1915535921078764, + "learning_rate": 1.9965173276118747e-05, + "loss": 1.1088, + "step": 329 + }, + { + "epoch": 0.055955913522679104, + "grad_norm": 1.269068710320991, + "learning_rate": 1.9964713792935055e-05, + "loss": 1.0913, + "step": 330 + }, + { + "epoch": 0.05612547689699025, + "grad_norm": 1.2993242144559132, + "learning_rate": 1.9964251303865362e-05, + "loss": 1.1576, + "step": 331 + }, + { + "epoch": 0.056295040271301396, + "grad_norm": 1.2078455062910056, + "learning_rate": 1.9963785809049177e-05, + "loss": 1.0755, + "step": 332 + }, + { + "epoch": 0.056464603645612545, + "grad_norm": 1.1571021624315911, + "learning_rate": 1.9963317308626916e-05, + "loss": 1.0831, + "step": 333 + }, + { + "epoch": 0.056634167019923694, + "grad_norm": 1.1504441341898868, + "learning_rate": 1.9962845802739905e-05, + "loss": 1.0871, + "step": 334 + }, + { + "epoch": 0.05680373039423484, + "grad_norm": 1.2051311917349259, + "learning_rate": 1.9962371291530375e-05, + "loss": 1.1117, + "step": 335 + }, + { + "epoch": 0.05697329376854599, + "grad_norm": 1.2509410873161557, + "learning_rate": 1.996189377514146e-05, + "loss": 1.0727, + "step": 336 + }, + { + "epoch": 0.05714285714285714, + "grad_norm": 1.2245819111691587, + "learning_rate": 1.9961413253717214e-05, + "loss": 1.1031, + "step": 337 + }, + { + "epoch": 0.05731242051716829, + "grad_norm": 1.184736822543079, + "learning_rate": 1.9960929727402578e-05, + "loss": 1.0962, + "step": 338 + }, + { + "epoch": 0.05748198389147944, + "grad_norm": 1.2594965296566927, + "learning_rate": 1.9960443196343413e-05, + "loss": 1.0875, + "step": 339 + }, + { + "epoch": 0.05765154726579059, + "grad_norm": 1.203417472683344, + "learning_rate": 1.995995366068648e-05, + "loss": 1.0666, + "step": 340 + }, + { + "epoch": 0.05782111064010174, + "grad_norm": 1.1526596254622905, + "learning_rate": 1.9959461120579457e-05, + "loss": 1.0517, + "step": 341 + }, + { + "epoch": 0.05799067401441289, + "grad_norm": 1.2189027654989488, + "learning_rate": 1.995896557617091e-05, + "loss": 1.1128, + "step": 342 + }, + { + "epoch": 0.058160237388724036, + "grad_norm": 1.2449802055381771, + "learning_rate": 1.9958467027610328e-05, + "loss": 1.09, + "step": 343 + }, + { + "epoch": 0.058329800763035185, + "grad_norm": 1.2372792210636403, + "learning_rate": 1.9957965475048097e-05, + "loss": 1.1435, + "step": 344 + }, + { + "epoch": 0.058499364137346334, + "grad_norm": 1.159603118192446, + "learning_rate": 1.9957460918635513e-05, + "loss": 1.0912, + "step": 345 + }, + { + "epoch": 0.05866892751165748, + "grad_norm": 1.1796547515743827, + "learning_rate": 1.9956953358524774e-05, + "loss": 1.1273, + "step": 346 + }, + { + "epoch": 0.05883849088596863, + "grad_norm": 1.1463535463566787, + "learning_rate": 1.995644279486899e-05, + "loss": 1.0726, + "step": 347 + }, + { + "epoch": 0.05900805426027978, + "grad_norm": 1.2773193191908023, + "learning_rate": 1.995592922782218e-05, + "loss": 1.0754, + "step": 348 + }, + { + "epoch": 0.05917761763459093, + "grad_norm": 1.1605677125546763, + "learning_rate": 1.995541265753925e-05, + "loss": 1.12, + "step": 349 + }, + { + "epoch": 0.05934718100890208, + "grad_norm": 1.2407734754453872, + "learning_rate": 1.9954893084176032e-05, + "loss": 1.1042, + "step": 350 + }, + { + "epoch": 0.05951674438321323, + "grad_norm": 1.1425448429760805, + "learning_rate": 1.9954370507889257e-05, + "loss": 1.0759, + "step": 351 + }, + { + "epoch": 0.05968630775752438, + "grad_norm": 1.2255570962994675, + "learning_rate": 1.9953844928836563e-05, + "loss": 1.12, + "step": 352 + }, + { + "epoch": 0.05985587113183553, + "grad_norm": 1.1143633416997376, + "learning_rate": 1.995331634717649e-05, + "loss": 1.0537, + "step": 353 + }, + { + "epoch": 0.060025434506146676, + "grad_norm": 1.1628787829315388, + "learning_rate": 1.995278476306848e-05, + "loss": 1.0933, + "step": 354 + }, + { + "epoch": 0.06019499788045782, + "grad_norm": 1.209405637329876, + "learning_rate": 1.9952250176672904e-05, + "loss": 1.0961, + "step": 355 + }, + { + "epoch": 0.06036456125476897, + "grad_norm": 1.2253620272384542, + "learning_rate": 1.9951712588151005e-05, + "loss": 1.0816, + "step": 356 + }, + { + "epoch": 0.060534124629080116, + "grad_norm": 1.2668748619197234, + "learning_rate": 1.995117199766496e-05, + "loss": 1.1292, + "step": 357 + }, + { + "epoch": 0.060703688003391265, + "grad_norm": 1.2279281497116579, + "learning_rate": 1.995062840537783e-05, + "loss": 1.1093, + "step": 358 + }, + { + "epoch": 0.060873251377702414, + "grad_norm": 1.1857563656955354, + "learning_rate": 1.9950081811453598e-05, + "loss": 1.113, + "step": 359 + }, + { + "epoch": 0.06104281475201356, + "grad_norm": 1.2608582753493773, + "learning_rate": 1.9949532216057143e-05, + "loss": 1.1107, + "step": 360 + }, + { + "epoch": 0.06121237812632471, + "grad_norm": 1.1941519844686572, + "learning_rate": 1.9948979619354253e-05, + "loss": 1.0875, + "step": 361 + }, + { + "epoch": 0.06138194150063586, + "grad_norm": 1.070663337506809, + "learning_rate": 1.9948424021511617e-05, + "loss": 0.8719, + "step": 362 + }, + { + "epoch": 0.06155150487494701, + "grad_norm": 1.329920784246183, + "learning_rate": 1.9947865422696837e-05, + "loss": 1.1196, + "step": 363 + }, + { + "epoch": 0.06172106824925816, + "grad_norm": 1.2011226625918563, + "learning_rate": 1.9947303823078418e-05, + "loss": 1.074, + "step": 364 + }, + { + "epoch": 0.06189063162356931, + "grad_norm": 1.1737289768791428, + "learning_rate": 1.994673922282576e-05, + "loss": 1.0572, + "step": 365 + }, + { + "epoch": 0.06206019499788046, + "grad_norm": 1.1692347436747448, + "learning_rate": 1.9946171622109182e-05, + "loss": 1.0882, + "step": 366 + }, + { + "epoch": 0.06222975837219161, + "grad_norm": 1.173485876145163, + "learning_rate": 1.9945601021099903e-05, + "loss": 1.0622, + "step": 367 + }, + { + "epoch": 0.062399321746502756, + "grad_norm": 1.0939343714424123, + "learning_rate": 1.9945027419970045e-05, + "loss": 1.0644, + "step": 368 + }, + { + "epoch": 0.0625688851208139, + "grad_norm": 1.1938438449064799, + "learning_rate": 1.994445081889264e-05, + "loss": 1.1187, + "step": 369 + }, + { + "epoch": 0.06273844849512505, + "grad_norm": 1.1134512299885773, + "learning_rate": 1.9943871218041617e-05, + "loss": 1.0974, + "step": 370 + }, + { + "epoch": 0.0629080118694362, + "grad_norm": 1.1283419443504543, + "learning_rate": 1.9943288617591813e-05, + "loss": 1.0761, + "step": 371 + }, + { + "epoch": 0.06307757524374735, + "grad_norm": 1.1469298461894093, + "learning_rate": 1.9942703017718977e-05, + "loss": 1.1146, + "step": 372 + }, + { + "epoch": 0.0632471386180585, + "grad_norm": 1.1976903772081748, + "learning_rate": 1.994211441859975e-05, + "loss": 1.1162, + "step": 373 + }, + { + "epoch": 0.06341670199236965, + "grad_norm": 1.259642806436094, + "learning_rate": 1.994152282041169e-05, + "loss": 1.1244, + "step": 374 + }, + { + "epoch": 0.0635862653666808, + "grad_norm": 1.217164619750138, + "learning_rate": 1.9940928223333254e-05, + "loss": 1.1002, + "step": 375 + }, + { + "epoch": 0.06375582874099195, + "grad_norm": 1.3102709114404878, + "learning_rate": 1.99403306275438e-05, + "loss": 1.1375, + "step": 376 + }, + { + "epoch": 0.0639253921153031, + "grad_norm": 1.252171499091104, + "learning_rate": 1.99397300332236e-05, + "loss": 1.1143, + "step": 377 + }, + { + "epoch": 0.06409495548961425, + "grad_norm": 1.1700395423595562, + "learning_rate": 1.9939126440553824e-05, + "loss": 1.119, + "step": 378 + }, + { + "epoch": 0.0642645188639254, + "grad_norm": 1.190503582506576, + "learning_rate": 1.9938519849716545e-05, + "loss": 1.0989, + "step": 379 + }, + { + "epoch": 0.06443408223823655, + "grad_norm": 1.159521427169997, + "learning_rate": 1.9937910260894742e-05, + "loss": 1.1476, + "step": 380 + }, + { + "epoch": 0.0646036456125477, + "grad_norm": 1.2243375821608091, + "learning_rate": 1.9937297674272302e-05, + "loss": 1.1052, + "step": 381 + }, + { + "epoch": 0.06477320898685884, + "grad_norm": 1.1052392237568562, + "learning_rate": 1.9936682090034014e-05, + "loss": 1.0732, + "step": 382 + }, + { + "epoch": 0.06494277236116999, + "grad_norm": 1.133507141141504, + "learning_rate": 1.993606350836557e-05, + "loss": 1.0952, + "step": 383 + }, + { + "epoch": 0.06511233573548114, + "grad_norm": 1.2441764920292675, + "learning_rate": 1.9935441929453564e-05, + "loss": 1.0923, + "step": 384 + }, + { + "epoch": 0.06528189910979229, + "grad_norm": 0.9421309435936914, + "learning_rate": 1.99348173534855e-05, + "loss": 0.8583, + "step": 385 + }, + { + "epoch": 0.06545146248410344, + "grad_norm": 1.139031396937891, + "learning_rate": 1.993418978064979e-05, + "loss": 1.0597, + "step": 386 + }, + { + "epoch": 0.06562102585841459, + "grad_norm": 1.1901071034133301, + "learning_rate": 1.993355921113573e-05, + "loss": 1.0511, + "step": 387 + }, + { + "epoch": 0.06579058923272574, + "grad_norm": 1.162965069394226, + "learning_rate": 1.9932925645133542e-05, + "loss": 1.0877, + "step": 388 + }, + { + "epoch": 0.06596015260703687, + "grad_norm": 1.2311993497742715, + "learning_rate": 1.9932289082834342e-05, + "loss": 1.1016, + "step": 389 + }, + { + "epoch": 0.06612971598134802, + "grad_norm": 1.2590527889026113, + "learning_rate": 1.9931649524430144e-05, + "loss": 1.1275, + "step": 390 + }, + { + "epoch": 0.06629927935565917, + "grad_norm": 1.1992032106678683, + "learning_rate": 1.9931006970113884e-05, + "loss": 1.0881, + "step": 391 + }, + { + "epoch": 0.06646884272997032, + "grad_norm": 1.1944888911389537, + "learning_rate": 1.9930361420079385e-05, + "loss": 1.0695, + "step": 392 + }, + { + "epoch": 0.06663840610428147, + "grad_norm": 1.2642092133103153, + "learning_rate": 1.9929712874521375e-05, + "loss": 1.104, + "step": 393 + }, + { + "epoch": 0.06680796947859262, + "grad_norm": 1.1480843498209758, + "learning_rate": 1.99290613336355e-05, + "loss": 1.0967, + "step": 394 + }, + { + "epoch": 0.06697753285290377, + "grad_norm": 1.3089210241642424, + "learning_rate": 1.9928406797618285e-05, + "loss": 1.1199, + "step": 395 + }, + { + "epoch": 0.06714709622721492, + "grad_norm": 1.1609342410390617, + "learning_rate": 1.9927749266667185e-05, + "loss": 1.0698, + "step": 396 + }, + { + "epoch": 0.06731665960152607, + "grad_norm": 1.1602040335488506, + "learning_rate": 1.992708874098054e-05, + "loss": 1.0808, + "step": 397 + }, + { + "epoch": 0.06748622297583722, + "grad_norm": 1.1668530858299222, + "learning_rate": 1.9926425220757607e-05, + "loss": 1.0477, + "step": 398 + }, + { + "epoch": 0.06765578635014836, + "grad_norm": 1.269660516529872, + "learning_rate": 1.9925758706198527e-05, + "loss": 1.1181, + "step": 399 + }, + { + "epoch": 0.06782534972445951, + "grad_norm": 1.2221527265069834, + "learning_rate": 1.9925089197504363e-05, + "loss": 1.0742, + "step": 400 + }, + { + "epoch": 0.06799491309877066, + "grad_norm": 1.2424829682065042, + "learning_rate": 1.9924416694877077e-05, + "loss": 1.1273, + "step": 401 + }, + { + "epoch": 0.06816447647308181, + "grad_norm": 1.1546686928366987, + "learning_rate": 1.9923741198519528e-05, + "loss": 1.0664, + "step": 402 + }, + { + "epoch": 0.06833403984739296, + "grad_norm": 1.2232646211054141, + "learning_rate": 1.992306270863548e-05, + "loss": 1.0714, + "step": 403 + }, + { + "epoch": 0.06850360322170411, + "grad_norm": 1.2449826857512076, + "learning_rate": 1.9922381225429605e-05, + "loss": 1.066, + "step": 404 + }, + { + "epoch": 0.06867316659601526, + "grad_norm": 0.9073812725135206, + "learning_rate": 1.992169674910747e-05, + "loss": 0.8867, + "step": 405 + }, + { + "epoch": 0.06884272997032641, + "grad_norm": 1.3337548424910155, + "learning_rate": 1.9921009279875555e-05, + "loss": 1.109, + "step": 406 + }, + { + "epoch": 0.06901229334463756, + "grad_norm": 0.7421763544150676, + "learning_rate": 1.9920318817941234e-05, + "loss": 0.8916, + "step": 407 + }, + { + "epoch": 0.0691818567189487, + "grad_norm": 1.3251686230747073, + "learning_rate": 1.9919625363512788e-05, + "loss": 1.1143, + "step": 408 + }, + { + "epoch": 0.06935142009325986, + "grad_norm": 1.3170148414271075, + "learning_rate": 1.9918928916799395e-05, + "loss": 1.0978, + "step": 409 + }, + { + "epoch": 0.069520983467571, + "grad_norm": 1.2335438282923488, + "learning_rate": 1.9918229478011146e-05, + "loss": 1.063, + "step": 410 + }, + { + "epoch": 0.06969054684188215, + "grad_norm": 1.333054504408273, + "learning_rate": 1.991752704735903e-05, + "loss": 1.1366, + "step": 411 + }, + { + "epoch": 0.0698601102161933, + "grad_norm": 1.1143623075087468, + "learning_rate": 1.9916821625054932e-05, + "loss": 1.0715, + "step": 412 + }, + { + "epoch": 0.07002967359050445, + "grad_norm": 1.2582105757851425, + "learning_rate": 1.9916113211311647e-05, + "loss": 1.1078, + "step": 413 + }, + { + "epoch": 0.0701992369648156, + "grad_norm": 1.2131117395052011, + "learning_rate": 1.991540180634287e-05, + "loss": 1.0947, + "step": 414 + }, + { + "epoch": 0.07036880033912675, + "grad_norm": 1.1018529883468215, + "learning_rate": 1.9914687410363196e-05, + "loss": 1.064, + "step": 415 + }, + { + "epoch": 0.0705383637134379, + "grad_norm": 1.2711756508896594, + "learning_rate": 1.991397002358813e-05, + "loss": 1.1139, + "step": 416 + }, + { + "epoch": 0.07070792708774905, + "grad_norm": 1.1915856433178176, + "learning_rate": 1.9913249646234072e-05, + "loss": 1.0784, + "step": 417 + }, + { + "epoch": 0.0708774904620602, + "grad_norm": 1.2336558096852979, + "learning_rate": 1.991252627851832e-05, + "loss": 1.0623, + "step": 418 + }, + { + "epoch": 0.07104705383637135, + "grad_norm": 1.1645400412963238, + "learning_rate": 1.9911799920659093e-05, + "loss": 1.0914, + "step": 419 + }, + { + "epoch": 0.0712166172106825, + "grad_norm": 1.1920752880656034, + "learning_rate": 1.991107057287549e-05, + "loss": 1.0821, + "step": 420 + }, + { + "epoch": 0.07138618058499364, + "grad_norm": 1.1733810711305679, + "learning_rate": 1.9910338235387517e-05, + "loss": 1.1181, + "step": 421 + }, + { + "epoch": 0.0715557439593048, + "grad_norm": 1.2048882987640317, + "learning_rate": 1.9909602908416096e-05, + "loss": 1.086, + "step": 422 + }, + { + "epoch": 0.07172530733361594, + "grad_norm": 1.226238560221633, + "learning_rate": 1.9908864592183034e-05, + "loss": 1.077, + "step": 423 + }, + { + "epoch": 0.07189487070792709, + "grad_norm": 1.087579504878477, + "learning_rate": 1.990812328691105e-05, + "loss": 1.0702, + "step": 424 + }, + { + "epoch": 0.07206443408223824, + "grad_norm": 1.1444768150288578, + "learning_rate": 1.9907378992823755e-05, + "loss": 1.1015, + "step": 425 + }, + { + "epoch": 0.07223399745654939, + "grad_norm": 1.1231516590945694, + "learning_rate": 1.9906631710145672e-05, + "loss": 1.0619, + "step": 426 + }, + { + "epoch": 0.07240356083086054, + "grad_norm": 1.1735386016953435, + "learning_rate": 1.9905881439102222e-05, + "loss": 1.0811, + "step": 427 + }, + { + "epoch": 0.07257312420517169, + "grad_norm": 1.2410768321150984, + "learning_rate": 1.9905128179919725e-05, + "loss": 1.1218, + "step": 428 + }, + { + "epoch": 0.07274268757948284, + "grad_norm": 1.1685294001209103, + "learning_rate": 1.9904371932825407e-05, + "loss": 1.0218, + "step": 429 + }, + { + "epoch": 0.07291225095379399, + "grad_norm": 1.1154083036700704, + "learning_rate": 1.9903612698047387e-05, + "loss": 1.0799, + "step": 430 + }, + { + "epoch": 0.07308181432810514, + "grad_norm": 1.0161682428773147, + "learning_rate": 1.9902850475814692e-05, + "loss": 0.927, + "step": 431 + }, + { + "epoch": 0.07325137770241628, + "grad_norm": 1.3231426104259807, + "learning_rate": 1.9902085266357248e-05, + "loss": 1.0861, + "step": 432 + }, + { + "epoch": 0.07342094107672743, + "grad_norm": 1.2021291574427906, + "learning_rate": 1.990131706990589e-05, + "loss": 1.0588, + "step": 433 + }, + { + "epoch": 0.07359050445103858, + "grad_norm": 1.2317625279376887, + "learning_rate": 1.9900545886692334e-05, + "loss": 1.0861, + "step": 434 + }, + { + "epoch": 0.07376006782534972, + "grad_norm": 1.2876226099619001, + "learning_rate": 1.9899771716949218e-05, + "loss": 1.0898, + "step": 435 + }, + { + "epoch": 0.07392963119966087, + "grad_norm": 1.2424658053018574, + "learning_rate": 1.9898994560910074e-05, + "loss": 1.0702, + "step": 436 + }, + { + "epoch": 0.07409919457397202, + "grad_norm": 1.1700524463892321, + "learning_rate": 1.989821441880933e-05, + "loss": 1.0348, + "step": 437 + }, + { + "epoch": 0.07426875794828317, + "grad_norm": 1.0849115703136105, + "learning_rate": 1.9897431290882317e-05, + "loss": 1.0095, + "step": 438 + }, + { + "epoch": 0.07443832132259431, + "grad_norm": 1.239475681510119, + "learning_rate": 1.989664517736527e-05, + "loss": 1.1206, + "step": 439 + }, + { + "epoch": 0.07460788469690546, + "grad_norm": 1.208914526371448, + "learning_rate": 1.9895856078495326e-05, + "loss": 1.0925, + "step": 440 + }, + { + "epoch": 0.07477744807121661, + "grad_norm": 1.2284241418375532, + "learning_rate": 1.9895063994510512e-05, + "loss": 1.0952, + "step": 441 + }, + { + "epoch": 0.07494701144552776, + "grad_norm": 1.083499924595316, + "learning_rate": 1.9894268925649768e-05, + "loss": 1.0705, + "step": 442 + }, + { + "epoch": 0.07511657481983891, + "grad_norm": 1.1242864575942213, + "learning_rate": 1.9893470872152925e-05, + "loss": 1.0628, + "step": 443 + }, + { + "epoch": 0.07528613819415006, + "grad_norm": 1.1679463150918064, + "learning_rate": 1.989266983426072e-05, + "loss": 1.0775, + "step": 444 + }, + { + "epoch": 0.07545570156846121, + "grad_norm": 1.1855986453181477, + "learning_rate": 1.9891865812214793e-05, + "loss": 1.1036, + "step": 445 + }, + { + "epoch": 0.07562526494277236, + "grad_norm": 1.1548333884726703, + "learning_rate": 1.989105880625767e-05, + "loss": 1.0782, + "step": 446 + }, + { + "epoch": 0.0757948283170835, + "grad_norm": 1.198269179444107, + "learning_rate": 1.9890248816632795e-05, + "loss": 1.0748, + "step": 447 + }, + { + "epoch": 0.07596439169139466, + "grad_norm": 1.1535236477204367, + "learning_rate": 1.9889435843584502e-05, + "loss": 1.0728, + "step": 448 + }, + { + "epoch": 0.0761339550657058, + "grad_norm": 1.1761477415752795, + "learning_rate": 1.9888619887358024e-05, + "loss": 1.0732, + "step": 449 + }, + { + "epoch": 0.07630351844001695, + "grad_norm": 1.183110526512685, + "learning_rate": 1.9887800948199496e-05, + "loss": 1.0943, + "step": 450 + }, + { + "epoch": 0.0764730818143281, + "grad_norm": 1.195434195691069, + "learning_rate": 1.988697902635596e-05, + "loss": 1.0591, + "step": 451 + }, + { + "epoch": 0.07664264518863925, + "grad_norm": 1.1334112331426272, + "learning_rate": 1.9886154122075344e-05, + "loss": 1.0464, + "step": 452 + }, + { + "epoch": 0.0768122085629504, + "grad_norm": 1.0755156010758495, + "learning_rate": 1.988532623560649e-05, + "loss": 1.0516, + "step": 453 + }, + { + "epoch": 0.07698177193726155, + "grad_norm": 1.2680417342605343, + "learning_rate": 1.9884495367199122e-05, + "loss": 1.0968, + "step": 454 + }, + { + "epoch": 0.0771513353115727, + "grad_norm": 1.1523661937514382, + "learning_rate": 1.9883661517103884e-05, + "loss": 1.0457, + "step": 455 + }, + { + "epoch": 0.07732089868588385, + "grad_norm": 1.1502546313529902, + "learning_rate": 1.9882824685572304e-05, + "loss": 1.0529, + "step": 456 + }, + { + "epoch": 0.077490462060195, + "grad_norm": 1.033935925511959, + "learning_rate": 1.988198487285682e-05, + "loss": 1.0348, + "step": 457 + }, + { + "epoch": 0.07766002543450615, + "grad_norm": 1.1462897653872424, + "learning_rate": 1.9881142079210757e-05, + "loss": 1.0954, + "step": 458 + }, + { + "epoch": 0.0778295888088173, + "grad_norm": 1.1248230302181292, + "learning_rate": 1.988029630488835e-05, + "loss": 1.0496, + "step": 459 + }, + { + "epoch": 0.07799915218312845, + "grad_norm": 1.1537385796926145, + "learning_rate": 1.987944755014473e-05, + "loss": 1.0485, + "step": 460 + }, + { + "epoch": 0.0781687155574396, + "grad_norm": 1.1541762723860756, + "learning_rate": 1.987859581523593e-05, + "loss": 1.1034, + "step": 461 + }, + { + "epoch": 0.07833827893175074, + "grad_norm": 1.1996045349054125, + "learning_rate": 1.987774110041887e-05, + "loss": 1.0856, + "step": 462 + }, + { + "epoch": 0.07850784230606189, + "grad_norm": 1.0105469779199656, + "learning_rate": 1.9876883405951378e-05, + "loss": 0.8399, + "step": 463 + }, + { + "epoch": 0.07867740568037304, + "grad_norm": 1.1632808203427976, + "learning_rate": 1.987602273209219e-05, + "loss": 1.0525, + "step": 464 + }, + { + "epoch": 0.07884696905468419, + "grad_norm": 1.2955480396720245, + "learning_rate": 1.9875159079100917e-05, + "loss": 1.0818, + "step": 465 + }, + { + "epoch": 0.07901653242899534, + "grad_norm": 1.233715775378925, + "learning_rate": 1.9874292447238094e-05, + "loss": 1.1598, + "step": 466 + }, + { + "epoch": 0.07918609580330649, + "grad_norm": 1.1292694532160477, + "learning_rate": 1.9873422836765138e-05, + "loss": 1.1032, + "step": 467 + }, + { + "epoch": 0.07935565917761764, + "grad_norm": 0.7733807873715602, + "learning_rate": 1.9872550247944372e-05, + "loss": 0.8492, + "step": 468 + }, + { + "epoch": 0.07952522255192879, + "grad_norm": 1.1583804795274268, + "learning_rate": 1.9871674681039013e-05, + "loss": 1.0997, + "step": 469 + }, + { + "epoch": 0.07969478592623994, + "grad_norm": 1.1945572185922984, + "learning_rate": 1.987079613631318e-05, + "loss": 1.0695, + "step": 470 + }, + { + "epoch": 0.07986434930055109, + "grad_norm": 1.1787525634654576, + "learning_rate": 1.9869914614031886e-05, + "loss": 1.0628, + "step": 471 + }, + { + "epoch": 0.08003391267486223, + "grad_norm": 1.1352104766373525, + "learning_rate": 1.9869030114461044e-05, + "loss": 1.0582, + "step": 472 + }, + { + "epoch": 0.08020347604917338, + "grad_norm": 1.1993409165710693, + "learning_rate": 1.9868142637867474e-05, + "loss": 1.119, + "step": 473 + }, + { + "epoch": 0.08037303942348453, + "grad_norm": 1.0885290693388283, + "learning_rate": 1.9867252184518878e-05, + "loss": 1.0691, + "step": 474 + }, + { + "epoch": 0.08054260279779568, + "grad_norm": 1.1511887537900336, + "learning_rate": 1.9866358754683864e-05, + "loss": 1.097, + "step": 475 + }, + { + "epoch": 0.08071216617210683, + "grad_norm": 1.1335862921683832, + "learning_rate": 1.9865462348631945e-05, + "loss": 1.0698, + "step": 476 + }, + { + "epoch": 0.08088172954641798, + "grad_norm": 1.1208500382557103, + "learning_rate": 1.9864562966633517e-05, + "loss": 1.0879, + "step": 477 + }, + { + "epoch": 0.08105129292072913, + "grad_norm": 1.1870491416374378, + "learning_rate": 1.9863660608959885e-05, + "loss": 1.1018, + "step": 478 + }, + { + "epoch": 0.08122085629504028, + "grad_norm": 1.1444174404671785, + "learning_rate": 1.9862755275883248e-05, + "loss": 1.0877, + "step": 479 + }, + { + "epoch": 0.08139041966935143, + "grad_norm": 1.1691301307951676, + "learning_rate": 1.98618469676767e-05, + "loss": 1.1, + "step": 480 + }, + { + "epoch": 0.08155998304366256, + "grad_norm": 1.114283352536375, + "learning_rate": 1.9860935684614235e-05, + "loss": 1.0765, + "step": 481 + }, + { + "epoch": 0.08172954641797371, + "grad_norm": 1.169320341834926, + "learning_rate": 1.986002142697075e-05, + "loss": 1.063, + "step": 482 + }, + { + "epoch": 0.08189910979228486, + "grad_norm": 1.1200625962873132, + "learning_rate": 1.9859104195022026e-05, + "loss": 1.0414, + "step": 483 + }, + { + "epoch": 0.08206867316659601, + "grad_norm": 1.103798371676376, + "learning_rate": 1.985818398904475e-05, + "loss": 1.0809, + "step": 484 + }, + { + "epoch": 0.08223823654090716, + "grad_norm": 1.1553070463371595, + "learning_rate": 1.985726080931651e-05, + "loss": 1.11, + "step": 485 + }, + { + "epoch": 0.08240779991521831, + "grad_norm": 1.1803341454465903, + "learning_rate": 1.9856334656115785e-05, + "loss": 1.0621, + "step": 486 + }, + { + "epoch": 0.08257736328952946, + "grad_norm": 1.058097820682474, + "learning_rate": 1.9855405529721944e-05, + "loss": 1.0407, + "step": 487 + }, + { + "epoch": 0.0827469266638406, + "grad_norm": 1.1599368042958134, + "learning_rate": 1.985447343041527e-05, + "loss": 1.0688, + "step": 488 + }, + { + "epoch": 0.08291649003815176, + "grad_norm": 1.1129369451006759, + "learning_rate": 1.9853538358476933e-05, + "loss": 1.0742, + "step": 489 + }, + { + "epoch": 0.0830860534124629, + "grad_norm": 1.2048572648077327, + "learning_rate": 1.9852600314188993e-05, + "loss": 1.1236, + "step": 490 + }, + { + "epoch": 0.08325561678677405, + "grad_norm": 1.142848586187954, + "learning_rate": 1.985165929783442e-05, + "loss": 1.0631, + "step": 491 + }, + { + "epoch": 0.0834251801610852, + "grad_norm": 1.2162601651798566, + "learning_rate": 1.9850715309697076e-05, + "loss": 1.1205, + "step": 492 + }, + { + "epoch": 0.08359474353539635, + "grad_norm": 1.096956853782461, + "learning_rate": 1.984976835006171e-05, + "loss": 1.0877, + "step": 493 + }, + { + "epoch": 0.0837643069097075, + "grad_norm": 1.1253873071080505, + "learning_rate": 1.984881841921398e-05, + "loss": 1.0549, + "step": 494 + }, + { + "epoch": 0.08393387028401865, + "grad_norm": 1.1658881473741232, + "learning_rate": 1.9847865517440438e-05, + "loss": 1.1008, + "step": 495 + }, + { + "epoch": 0.0841034336583298, + "grad_norm": 1.146384404240851, + "learning_rate": 1.9846909645028524e-05, + "loss": 1.0555, + "step": 496 + }, + { + "epoch": 0.08427299703264095, + "grad_norm": 1.095194900303575, + "learning_rate": 1.9845950802266584e-05, + "loss": 1.1031, + "step": 497 + }, + { + "epoch": 0.0844425604069521, + "grad_norm": 1.1769694437884834, + "learning_rate": 1.9844988989443856e-05, + "loss": 1.082, + "step": 498 + }, + { + "epoch": 0.08461212378126325, + "grad_norm": 1.086331279711683, + "learning_rate": 1.984402420685047e-05, + "loss": 1.0752, + "step": 499 + }, + { + "epoch": 0.0847816871555744, + "grad_norm": 1.1366979001490798, + "learning_rate": 1.9843056454777464e-05, + "loss": 1.1047, + "step": 500 + }, + { + "epoch": 0.08495125052988554, + "grad_norm": 1.131429576014002, + "learning_rate": 1.9842085733516753e-05, + "loss": 1.0558, + "step": 501 + }, + { + "epoch": 0.0851208139041967, + "grad_norm": 0.8769720882940868, + "learning_rate": 1.984111204336116e-05, + "loss": 0.9077, + "step": 502 + }, + { + "epoch": 0.08529037727850784, + "grad_norm": 1.1598437085571796, + "learning_rate": 1.984013538460441e-05, + "loss": 1.0865, + "step": 503 + }, + { + "epoch": 0.08545994065281899, + "grad_norm": 1.1129551006550698, + "learning_rate": 1.9839155757541106e-05, + "loss": 1.0239, + "step": 504 + }, + { + "epoch": 0.08562950402713014, + "grad_norm": 1.0784083630482575, + "learning_rate": 1.983817316246676e-05, + "loss": 1.0725, + "step": 505 + }, + { + "epoch": 0.08579906740144129, + "grad_norm": 1.0928281943237104, + "learning_rate": 1.983718759967777e-05, + "loss": 1.0575, + "step": 506 + }, + { + "epoch": 0.08596863077575244, + "grad_norm": 1.1205149449144562, + "learning_rate": 1.983619906947144e-05, + "loss": 1.0827, + "step": 507 + }, + { + "epoch": 0.08613819415006359, + "grad_norm": 1.100239093870298, + "learning_rate": 1.9835207572145957e-05, + "loss": 1.081, + "step": 508 + }, + { + "epoch": 0.08630775752437474, + "grad_norm": 1.1349372799313462, + "learning_rate": 1.9834213108000414e-05, + "loss": 1.0464, + "step": 509 + }, + { + "epoch": 0.08647732089868589, + "grad_norm": 1.091347043818639, + "learning_rate": 1.983321567733479e-05, + "loss": 1.0594, + "step": 510 + }, + { + "epoch": 0.08664688427299704, + "grad_norm": 1.1057906831239117, + "learning_rate": 1.983221528044997e-05, + "loss": 1.0766, + "step": 511 + }, + { + "epoch": 0.08681644764730818, + "grad_norm": 1.091013204329108, + "learning_rate": 1.9831211917647723e-05, + "loss": 1.0584, + "step": 512 + }, + { + "epoch": 0.08698601102161933, + "grad_norm": 1.1230243076329225, + "learning_rate": 1.9830205589230713e-05, + "loss": 1.0839, + "step": 513 + }, + { + "epoch": 0.08715557439593048, + "grad_norm": 1.1487894168527573, + "learning_rate": 1.9829196295502506e-05, + "loss": 1.1103, + "step": 514 + }, + { + "epoch": 0.08732513777024163, + "grad_norm": 1.1139963729673388, + "learning_rate": 1.9828184036767556e-05, + "loss": 1.1076, + "step": 515 + }, + { + "epoch": 0.08749470114455278, + "grad_norm": 1.1377141444715826, + "learning_rate": 1.9827168813331215e-05, + "loss": 1.0757, + "step": 516 + }, + { + "epoch": 0.08766426451886393, + "grad_norm": 1.0505504938040815, + "learning_rate": 1.9826150625499733e-05, + "loss": 1.0078, + "step": 517 + }, + { + "epoch": 0.08783382789317508, + "grad_norm": 1.194722335388699, + "learning_rate": 1.982512947358024e-05, + "loss": 1.0953, + "step": 518 + }, + { + "epoch": 0.08800339126748623, + "grad_norm": 1.0972133835062998, + "learning_rate": 1.9824105357880777e-05, + "loss": 1.1141, + "step": 519 + }, + { + "epoch": 0.08817295464179738, + "grad_norm": 1.0912509502482381, + "learning_rate": 1.9823078278710265e-05, + "loss": 1.0922, + "step": 520 + }, + { + "epoch": 0.08834251801610853, + "grad_norm": 1.184452496622127, + "learning_rate": 1.9822048236378536e-05, + "loss": 1.0567, + "step": 521 + }, + { + "epoch": 0.08851208139041968, + "grad_norm": 1.2015748250956637, + "learning_rate": 1.982101523119629e-05, + "loss": 1.1245, + "step": 522 + }, + { + "epoch": 0.08868164476473082, + "grad_norm": 1.0301949286097938, + "learning_rate": 1.9819979263475154e-05, + "loss": 1.0559, + "step": 523 + }, + { + "epoch": 0.08885120813904197, + "grad_norm": 1.1280944704408538, + "learning_rate": 1.9818940333527616e-05, + "loss": 1.0607, + "step": 524 + }, + { + "epoch": 0.08902077151335312, + "grad_norm": 1.1243101999950735, + "learning_rate": 1.9817898441667082e-05, + "loss": 1.1164, + "step": 525 + }, + { + "epoch": 0.08919033488766426, + "grad_norm": 1.104946151048045, + "learning_rate": 1.9816853588207835e-05, + "loss": 1.0333, + "step": 526 + }, + { + "epoch": 0.0893598982619754, + "grad_norm": 1.0842306029656459, + "learning_rate": 1.9815805773465064e-05, + "loss": 1.0593, + "step": 527 + }, + { + "epoch": 0.08952946163628656, + "grad_norm": 1.126482095627671, + "learning_rate": 1.981475499775484e-05, + "loss": 1.0655, + "step": 528 + }, + { + "epoch": 0.0896990250105977, + "grad_norm": 1.0729177558590441, + "learning_rate": 1.9813701261394136e-05, + "loss": 1.094, + "step": 529 + }, + { + "epoch": 0.08986858838490885, + "grad_norm": 1.1555215297008465, + "learning_rate": 1.9812644564700814e-05, + "loss": 1.0782, + "step": 530 + }, + { + "epoch": 0.09003815175922, + "grad_norm": 1.1445213131107626, + "learning_rate": 1.9811584907993632e-05, + "loss": 1.0973, + "step": 531 + }, + { + "epoch": 0.09020771513353115, + "grad_norm": 1.1395691634802012, + "learning_rate": 1.9810522291592234e-05, + "loss": 1.0558, + "step": 532 + }, + { + "epoch": 0.0903772785078423, + "grad_norm": 1.1494803768817405, + "learning_rate": 1.9809456715817163e-05, + "loss": 1.0352, + "step": 533 + }, + { + "epoch": 0.09054684188215345, + "grad_norm": 1.1211189975039118, + "learning_rate": 1.980838818098986e-05, + "loss": 1.0759, + "step": 534 + }, + { + "epoch": 0.0907164052564646, + "grad_norm": 1.1593061242144942, + "learning_rate": 1.9807316687432637e-05, + "loss": 1.0562, + "step": 535 + }, + { + "epoch": 0.09088596863077575, + "grad_norm": 1.0748556332850687, + "learning_rate": 1.980624223546873e-05, + "loss": 1.0293, + "step": 536 + }, + { + "epoch": 0.0910555320050869, + "grad_norm": 1.1919672014479494, + "learning_rate": 1.980516482542224e-05, + "loss": 1.0528, + "step": 537 + }, + { + "epoch": 0.09122509537939805, + "grad_norm": 1.1179549392680466, + "learning_rate": 1.980408445761817e-05, + "loss": 1.0065, + "step": 538 + }, + { + "epoch": 0.0913946587537092, + "grad_norm": 1.072876811601526, + "learning_rate": 1.9803001132382423e-05, + "loss": 1.076, + "step": 539 + }, + { + "epoch": 0.09156422212802034, + "grad_norm": 1.1689336105605155, + "learning_rate": 1.9801914850041787e-05, + "loss": 1.1354, + "step": 540 + }, + { + "epoch": 0.0917337855023315, + "grad_norm": 1.1462606137452056, + "learning_rate": 1.9800825610923937e-05, + "loss": 1.0495, + "step": 541 + }, + { + "epoch": 0.09190334887664264, + "grad_norm": 1.0869286432944305, + "learning_rate": 1.979973341535745e-05, + "loss": 1.0846, + "step": 542 + }, + { + "epoch": 0.09207291225095379, + "grad_norm": 1.160874012440214, + "learning_rate": 1.9798638263671783e-05, + "loss": 1.0993, + "step": 543 + }, + { + "epoch": 0.09224247562526494, + "grad_norm": 1.1069748287382373, + "learning_rate": 1.9797540156197298e-05, + "loss": 1.0531, + "step": 544 + }, + { + "epoch": 0.09241203899957609, + "grad_norm": 1.0619426126639204, + "learning_rate": 1.9796439093265245e-05, + "loss": 1.0443, + "step": 545 + }, + { + "epoch": 0.09258160237388724, + "grad_norm": 1.0900576991203947, + "learning_rate": 1.9795335075207756e-05, + "loss": 1.087, + "step": 546 + }, + { + "epoch": 0.09275116574819839, + "grad_norm": 1.109818697228551, + "learning_rate": 1.9794228102357868e-05, + "loss": 1.0757, + "step": 547 + }, + { + "epoch": 0.09292072912250954, + "grad_norm": 1.121649359705272, + "learning_rate": 1.9793118175049497e-05, + "loss": 1.0894, + "step": 548 + }, + { + "epoch": 0.09309029249682069, + "grad_norm": 1.1407704380093286, + "learning_rate": 1.979200529361746e-05, + "loss": 1.0918, + "step": 549 + }, + { + "epoch": 0.09325985587113184, + "grad_norm": 1.0968320814736103, + "learning_rate": 1.9790889458397457e-05, + "loss": 1.0652, + "step": 550 + }, + { + "epoch": 0.09342941924544299, + "grad_norm": 1.1569327839984382, + "learning_rate": 1.9789770669726088e-05, + "loss": 1.0806, + "step": 551 + }, + { + "epoch": 0.09359898261975413, + "grad_norm": 1.130038417534701, + "learning_rate": 1.9788648927940833e-05, + "loss": 1.064, + "step": 552 + }, + { + "epoch": 0.09376854599406528, + "grad_norm": 1.0870086002941353, + "learning_rate": 1.9787524233380076e-05, + "loss": 1.0832, + "step": 553 + }, + { + "epoch": 0.09393810936837643, + "grad_norm": 1.1104865662961882, + "learning_rate": 1.9786396586383078e-05, + "loss": 1.0809, + "step": 554 + }, + { + "epoch": 0.09410767274268758, + "grad_norm": 1.115877825942729, + "learning_rate": 1.978526598729e-05, + "loss": 1.0717, + "step": 555 + }, + { + "epoch": 0.09427723611699873, + "grad_norm": 1.2313468318597176, + "learning_rate": 1.9784132436441888e-05, + "loss": 1.0769, + "step": 556 + }, + { + "epoch": 0.09444679949130988, + "grad_norm": 1.1522440263094618, + "learning_rate": 1.9782995934180687e-05, + "loss": 1.1218, + "step": 557 + }, + { + "epoch": 0.09461636286562103, + "grad_norm": 1.1627190083653198, + "learning_rate": 1.978185648084922e-05, + "loss": 1.0797, + "step": 558 + }, + { + "epoch": 0.09478592623993218, + "grad_norm": 1.1180499876689296, + "learning_rate": 1.978071407679121e-05, + "loss": 1.0429, + "step": 559 + }, + { + "epoch": 0.09495548961424333, + "grad_norm": 1.0667542106344945, + "learning_rate": 1.977956872235127e-05, + "loss": 1.0663, + "step": 560 + }, + { + "epoch": 0.09512505298855448, + "grad_norm": 1.1288587719208103, + "learning_rate": 1.9778420417874894e-05, + "loss": 1.0587, + "step": 561 + }, + { + "epoch": 0.09529461636286563, + "grad_norm": 0.7695075463461137, + "learning_rate": 1.977726916370847e-05, + "loss": 0.9117, + "step": 562 + }, + { + "epoch": 0.09546417973717677, + "grad_norm": 1.1902697873927301, + "learning_rate": 1.9776114960199283e-05, + "loss": 1.0674, + "step": 563 + }, + { + "epoch": 0.09563374311148792, + "grad_norm": 1.1277515967187348, + "learning_rate": 1.97749578076955e-05, + "loss": 1.0948, + "step": 564 + }, + { + "epoch": 0.09580330648579907, + "grad_norm": 1.126005646590579, + "learning_rate": 1.9773797706546176e-05, + "loss": 1.0254, + "step": 565 + }, + { + "epoch": 0.09597286986011022, + "grad_norm": 1.1384127107672, + "learning_rate": 1.9772634657101263e-05, + "loss": 1.0632, + "step": 566 + }, + { + "epoch": 0.09614243323442137, + "grad_norm": 1.0931297728405247, + "learning_rate": 1.9771468659711595e-05, + "loss": 1.052, + "step": 567 + }, + { + "epoch": 0.09631199660873252, + "grad_norm": 1.1482837794662704, + "learning_rate": 1.9770299714728908e-05, + "loss": 1.0569, + "step": 568 + }, + { + "epoch": 0.09648155998304367, + "grad_norm": 1.2170875755793362, + "learning_rate": 1.9769127822505805e-05, + "loss": 1.0873, + "step": 569 + }, + { + "epoch": 0.09665112335735482, + "grad_norm": 1.098153826711913, + "learning_rate": 1.9767952983395795e-05, + "loss": 1.1112, + "step": 570 + }, + { + "epoch": 0.09682068673166597, + "grad_norm": 1.06714468236688, + "learning_rate": 1.9766775197753276e-05, + "loss": 1.0605, + "step": 571 + }, + { + "epoch": 0.0969902501059771, + "grad_norm": 1.1021341617313847, + "learning_rate": 1.976559446593353e-05, + "loss": 1.0405, + "step": 572 + }, + { + "epoch": 0.09715981348028825, + "grad_norm": 1.2657459951125931, + "learning_rate": 1.9764410788292724e-05, + "loss": 1.0752, + "step": 573 + }, + { + "epoch": 0.0973293768545994, + "grad_norm": 1.1382173754696283, + "learning_rate": 1.9763224165187918e-05, + "loss": 1.0391, + "step": 574 + }, + { + "epoch": 0.09749894022891055, + "grad_norm": 1.1115137363027052, + "learning_rate": 1.9762034596977066e-05, + "loss": 1.0602, + "step": 575 + }, + { + "epoch": 0.0976685036032217, + "grad_norm": 1.2638434665667782, + "learning_rate": 1.9760842084019e-05, + "loss": 1.1028, + "step": 576 + }, + { + "epoch": 0.09783806697753285, + "grad_norm": 1.0956643884164092, + "learning_rate": 1.9759646626673445e-05, + "loss": 1.0383, + "step": 577 + }, + { + "epoch": 0.098007630351844, + "grad_norm": 1.1951666900164801, + "learning_rate": 1.9758448225301018e-05, + "loss": 1.0522, + "step": 578 + }, + { + "epoch": 0.09817719372615515, + "grad_norm": 1.1582403573338345, + "learning_rate": 1.975724688026322e-05, + "loss": 1.0759, + "step": 579 + }, + { + "epoch": 0.0983467571004663, + "grad_norm": 1.105896806627842, + "learning_rate": 1.9756042591922436e-05, + "loss": 1.092, + "step": 580 + }, + { + "epoch": 0.09851632047477744, + "grad_norm": 1.0743625525603464, + "learning_rate": 1.9754835360641945e-05, + "loss": 1.0617, + "step": 581 + }, + { + "epoch": 0.0986858838490886, + "grad_norm": 1.1252124902043465, + "learning_rate": 1.9753625186785915e-05, + "loss": 1.0746, + "step": 582 + }, + { + "epoch": 0.09885544722339974, + "grad_norm": 1.1017954423823568, + "learning_rate": 1.9752412070719394e-05, + "loss": 1.072, + "step": 583 + }, + { + "epoch": 0.09902501059771089, + "grad_norm": 1.136591595976166, + "learning_rate": 1.9751196012808328e-05, + "loss": 1.0859, + "step": 584 + }, + { + "epoch": 0.09919457397202204, + "grad_norm": 1.1120914379350122, + "learning_rate": 1.9749977013419536e-05, + "loss": 1.0326, + "step": 585 + }, + { + "epoch": 0.09936413734633319, + "grad_norm": 1.0843116916903461, + "learning_rate": 1.974875507292074e-05, + "loss": 1.0523, + "step": 586 + }, + { + "epoch": 0.09953370072064434, + "grad_norm": 1.1190010103242294, + "learning_rate": 1.9747530191680543e-05, + "loss": 1.0742, + "step": 587 + }, + { + "epoch": 0.09970326409495549, + "grad_norm": 1.065987641581167, + "learning_rate": 1.974630237006843e-05, + "loss": 1.0643, + "step": 588 + }, + { + "epoch": 0.09987282746926664, + "grad_norm": 1.047751061453114, + "learning_rate": 1.9745071608454777e-05, + "loss": 1.0378, + "step": 589 + }, + { + "epoch": 0.10004239084357779, + "grad_norm": 1.1259994031260914, + "learning_rate": 1.9743837907210847e-05, + "loss": 1.0477, + "step": 590 + }, + { + "epoch": 0.10021195421788893, + "grad_norm": 1.1236119296177303, + "learning_rate": 1.9742601266708794e-05, + "loss": 1.1015, + "step": 591 + }, + { + "epoch": 0.10038151759220008, + "grad_norm": 1.129665111459205, + "learning_rate": 1.974136168732165e-05, + "loss": 1.1012, + "step": 592 + }, + { + "epoch": 0.10055108096651123, + "grad_norm": 1.1243761191755925, + "learning_rate": 1.9740119169423337e-05, + "loss": 1.0143, + "step": 593 + }, + { + "epoch": 0.10072064434082238, + "grad_norm": 1.1537337440303301, + "learning_rate": 1.973887371338867e-05, + "loss": 1.0616, + "step": 594 + }, + { + "epoch": 0.10089020771513353, + "grad_norm": 1.0906142964884253, + "learning_rate": 1.9737625319593338e-05, + "loss": 1.0391, + "step": 595 + }, + { + "epoch": 0.10105977108944468, + "grad_norm": 1.158231936547035, + "learning_rate": 1.9736373988413923e-05, + "loss": 1.0965, + "step": 596 + }, + { + "epoch": 0.10122933446375583, + "grad_norm": 1.15562873277257, + "learning_rate": 1.97351197202279e-05, + "loss": 1.0914, + "step": 597 + }, + { + "epoch": 0.10139889783806698, + "grad_norm": 1.1373645334454423, + "learning_rate": 1.9733862515413616e-05, + "loss": 1.0819, + "step": 598 + }, + { + "epoch": 0.10156846121237813, + "grad_norm": 1.1791280926929748, + "learning_rate": 1.9732602374350314e-05, + "loss": 1.1079, + "step": 599 + }, + { + "epoch": 0.10173802458668928, + "grad_norm": 1.1273615661016252, + "learning_rate": 1.973133929741812e-05, + "loss": 1.1228, + "step": 600 + }, + { + "epoch": 0.10190758796100043, + "grad_norm": 1.1612422740440762, + "learning_rate": 1.973007328499804e-05, + "loss": 1.0937, + "step": 601 + }, + { + "epoch": 0.10207715133531158, + "grad_norm": 1.143726584306414, + "learning_rate": 1.9728804337471974e-05, + "loss": 1.0559, + "step": 602 + }, + { + "epoch": 0.10224671470962272, + "grad_norm": 1.1862024806139606, + "learning_rate": 1.9727532455222707e-05, + "loss": 1.0453, + "step": 603 + }, + { + "epoch": 0.10241627808393387, + "grad_norm": 1.062593200351274, + "learning_rate": 1.9726257638633898e-05, + "loss": 1.0438, + "step": 604 + }, + { + "epoch": 0.10258584145824502, + "grad_norm": 1.1571042199635704, + "learning_rate": 1.972497988809011e-05, + "loss": 1.0528, + "step": 605 + }, + { + "epoch": 0.10275540483255617, + "grad_norm": 1.2193278886994043, + "learning_rate": 1.9723699203976768e-05, + "loss": 1.08, + "step": 606 + }, + { + "epoch": 0.10292496820686732, + "grad_norm": 1.081899882791445, + "learning_rate": 1.9722415586680204e-05, + "loss": 1.0869, + "step": 607 + }, + { + "epoch": 0.10309453158117847, + "grad_norm": 1.1578031935579174, + "learning_rate": 1.9721129036587618e-05, + "loss": 1.094, + "step": 608 + }, + { + "epoch": 0.10326409495548962, + "grad_norm": 1.146092718487579, + "learning_rate": 1.9719839554087108e-05, + "loss": 1.0539, + "step": 609 + }, + { + "epoch": 0.10343365832980077, + "grad_norm": 1.086646725053304, + "learning_rate": 1.9718547139567648e-05, + "loss": 1.0328, + "step": 610 + }, + { + "epoch": 0.10360322170411192, + "grad_norm": 1.1486718723623022, + "learning_rate": 1.9717251793419097e-05, + "loss": 1.0359, + "step": 611 + }, + { + "epoch": 0.10377278507842307, + "grad_norm": 1.125662041139612, + "learning_rate": 1.97159535160322e-05, + "loss": 1.0515, + "step": 612 + }, + { + "epoch": 0.10394234845273422, + "grad_norm": 1.1430135256236766, + "learning_rate": 1.971465230779859e-05, + "loss": 1.1003, + "step": 613 + }, + { + "epoch": 0.10411191182704536, + "grad_norm": 1.1642877319851186, + "learning_rate": 1.9713348169110776e-05, + "loss": 1.0929, + "step": 614 + }, + { + "epoch": 0.10428147520135651, + "grad_norm": 1.1068291985391228, + "learning_rate": 1.971204110036216e-05, + "loss": 1.0841, + "step": 615 + }, + { + "epoch": 0.10445103857566766, + "grad_norm": 1.1578356259788904, + "learning_rate": 1.971073110194702e-05, + "loss": 1.0424, + "step": 616 + }, + { + "epoch": 0.10462060194997881, + "grad_norm": 1.11993680561627, + "learning_rate": 1.9709418174260523e-05, + "loss": 1.0873, + "step": 617 + }, + { + "epoch": 0.10479016532428995, + "grad_norm": 1.1008254657288135, + "learning_rate": 1.970810231769871e-05, + "loss": 1.0719, + "step": 618 + }, + { + "epoch": 0.1049597286986011, + "grad_norm": 1.0926669021841884, + "learning_rate": 1.9706783532658528e-05, + "loss": 1.07, + "step": 619 + }, + { + "epoch": 0.10512929207291224, + "grad_norm": 1.1041001530651953, + "learning_rate": 1.9705461819537776e-05, + "loss": 1.0338, + "step": 620 + }, + { + "epoch": 0.1052988554472234, + "grad_norm": 1.101936914258299, + "learning_rate": 1.9704137178735164e-05, + "loss": 1.0456, + "step": 621 + }, + { + "epoch": 0.10546841882153454, + "grad_norm": 1.120099123347527, + "learning_rate": 1.9702809610650272e-05, + "loss": 1.0437, + "step": 622 + }, + { + "epoch": 0.10563798219584569, + "grad_norm": 1.1192875521384187, + "learning_rate": 1.9701479115683562e-05, + "loss": 1.1211, + "step": 623 + }, + { + "epoch": 0.10580754557015684, + "grad_norm": 1.0904903343796983, + "learning_rate": 1.9700145694236384e-05, + "loss": 1.0815, + "step": 624 + }, + { + "epoch": 0.10597710894446799, + "grad_norm": 1.086279720601418, + "learning_rate": 1.9698809346710965e-05, + "loss": 1.0601, + "step": 625 + }, + { + "epoch": 0.10614667231877914, + "grad_norm": 0.6752048722044736, + "learning_rate": 1.9697470073510425e-05, + "loss": 0.8137, + "step": 626 + }, + { + "epoch": 0.10631623569309029, + "grad_norm": 1.1434325266029355, + "learning_rate": 1.9696127875038753e-05, + "loss": 1.0607, + "step": 627 + }, + { + "epoch": 0.10648579906740144, + "grad_norm": 1.2106579149348138, + "learning_rate": 1.969478275170083e-05, + "loss": 1.0624, + "step": 628 + }, + { + "epoch": 0.10665536244171259, + "grad_norm": 1.0834147893709303, + "learning_rate": 1.9693434703902417e-05, + "loss": 1.0591, + "step": 629 + }, + { + "epoch": 0.10682492581602374, + "grad_norm": 1.1007392254472879, + "learning_rate": 1.9692083732050157e-05, + "loss": 1.0403, + "step": 630 + }, + { + "epoch": 0.10699448919033488, + "grad_norm": 1.079640446936186, + "learning_rate": 1.9690729836551576e-05, + "loss": 1.0558, + "step": 631 + }, + { + "epoch": 0.10716405256464603, + "grad_norm": 1.1995413113343978, + "learning_rate": 1.9689373017815076e-05, + "loss": 1.1018, + "step": 632 + }, + { + "epoch": 0.10733361593895718, + "grad_norm": 1.1201878440803128, + "learning_rate": 1.9688013276249947e-05, + "loss": 1.0226, + "step": 633 + }, + { + "epoch": 0.10750317931326833, + "grad_norm": 1.1020687785893686, + "learning_rate": 1.9686650612266364e-05, + "loss": 1.0367, + "step": 634 + }, + { + "epoch": 0.10767274268757948, + "grad_norm": 0.6677923041852281, + "learning_rate": 1.968528502627537e-05, + "loss": 0.8784, + "step": 635 + }, + { + "epoch": 0.10784230606189063, + "grad_norm": 1.146816649061468, + "learning_rate": 1.968391651868891e-05, + "loss": 1.1004, + "step": 636 + }, + { + "epoch": 0.10801186943620178, + "grad_norm": 1.159486212227261, + "learning_rate": 1.9682545089919784e-05, + "loss": 1.0884, + "step": 637 + }, + { + "epoch": 0.10818143281051293, + "grad_norm": 1.0978985243065382, + "learning_rate": 1.9681170740381703e-05, + "loss": 1.0587, + "step": 638 + }, + { + "epoch": 0.10835099618482408, + "grad_norm": 1.054387258641634, + "learning_rate": 1.967979347048923e-05, + "loss": 1.0739, + "step": 639 + }, + { + "epoch": 0.10852055955913523, + "grad_norm": 1.1656732373448082, + "learning_rate": 1.967841328065783e-05, + "loss": 1.0198, + "step": 640 + }, + { + "epoch": 0.10869012293344638, + "grad_norm": 1.0760367036003744, + "learning_rate": 1.9677030171303842e-05, + "loss": 1.0478, + "step": 641 + }, + { + "epoch": 0.10885968630775752, + "grad_norm": 1.1483066730433895, + "learning_rate": 1.9675644142844482e-05, + "loss": 1.0737, + "step": 642 + }, + { + "epoch": 0.10902924968206867, + "grad_norm": 1.1296193475236855, + "learning_rate": 1.9674255195697848e-05, + "loss": 1.0454, + "step": 643 + }, + { + "epoch": 0.10919881305637982, + "grad_norm": 1.1745512720919287, + "learning_rate": 1.967286333028293e-05, + "loss": 1.0621, + "step": 644 + }, + { + "epoch": 0.10936837643069097, + "grad_norm": 1.106699964385633, + "learning_rate": 1.9671468547019575e-05, + "loss": 1.1019, + "step": 645 + }, + { + "epoch": 0.10953793980500212, + "grad_norm": 1.1154322739929565, + "learning_rate": 1.9670070846328532e-05, + "loss": 1.0466, + "step": 646 + }, + { + "epoch": 0.10970750317931327, + "grad_norm": 1.1161325733683853, + "learning_rate": 1.9668670228631416e-05, + "loss": 1.02, + "step": 647 + }, + { + "epoch": 0.10987706655362442, + "grad_norm": 1.2100624469288324, + "learning_rate": 1.9667266694350733e-05, + "loss": 1.0725, + "step": 648 + }, + { + "epoch": 0.11004662992793557, + "grad_norm": 1.0588563151186825, + "learning_rate": 1.966586024390986e-05, + "loss": 1.0941, + "step": 649 + }, + { + "epoch": 0.11021619330224672, + "grad_norm": 1.0464532346569646, + "learning_rate": 1.9664450877733065e-05, + "loss": 1.0661, + "step": 650 + }, + { + "epoch": 0.11038575667655787, + "grad_norm": 1.110846254232341, + "learning_rate": 1.9663038596245477e-05, + "loss": 1.0509, + "step": 651 + }, + { + "epoch": 0.11055532005086902, + "grad_norm": 1.0478586310534652, + "learning_rate": 1.966162339987312e-05, + "loss": 1.0533, + "step": 652 + }, + { + "epoch": 0.11072488342518017, + "grad_norm": 1.1198595469899033, + "learning_rate": 1.9660205289042887e-05, + "loss": 1.1076, + "step": 653 + }, + { + "epoch": 0.11089444679949131, + "grad_norm": 1.03479090038442, + "learning_rate": 1.9658784264182565e-05, + "loss": 1.0756, + "step": 654 + }, + { + "epoch": 0.11106401017380246, + "grad_norm": 1.0648956277315327, + "learning_rate": 1.96573603257208e-05, + "loss": 1.0706, + "step": 655 + }, + { + "epoch": 0.11123357354811361, + "grad_norm": 1.0468737800536208, + "learning_rate": 1.9655933474087135e-05, + "loss": 1.0725, + "step": 656 + }, + { + "epoch": 0.11140313692242476, + "grad_norm": 1.0472912629096596, + "learning_rate": 1.9654503709711984e-05, + "loss": 1.0887, + "step": 657 + }, + { + "epoch": 0.11157270029673591, + "grad_norm": 1.154907312346853, + "learning_rate": 1.9653071033026635e-05, + "loss": 1.1054, + "step": 658 + }, + { + "epoch": 0.11174226367104706, + "grad_norm": 1.0523325931961225, + "learning_rate": 1.965163544446326e-05, + "loss": 1.0467, + "step": 659 + }, + { + "epoch": 0.11191182704535821, + "grad_norm": 1.134820358682994, + "learning_rate": 1.9650196944454912e-05, + "loss": 1.0879, + "step": 660 + }, + { + "epoch": 0.11208139041966936, + "grad_norm": 1.0694754851200907, + "learning_rate": 1.9648755533435517e-05, + "loss": 1.068, + "step": 661 + }, + { + "epoch": 0.1122509537939805, + "grad_norm": 1.0682206468984532, + "learning_rate": 1.9647311211839878e-05, + "loss": 1.0343, + "step": 662 + }, + { + "epoch": 0.11242051716829164, + "grad_norm": 1.1420187495252647, + "learning_rate": 1.9645863980103687e-05, + "loss": 1.0538, + "step": 663 + }, + { + "epoch": 0.11259008054260279, + "grad_norm": 0.671556464032117, + "learning_rate": 1.96444138386635e-05, + "loss": 0.8609, + "step": 664 + }, + { + "epoch": 0.11275964391691394, + "grad_norm": 1.129692988580815, + "learning_rate": 1.964296078795675e-05, + "loss": 1.0803, + "step": 665 + }, + { + "epoch": 0.11292920729122509, + "grad_norm": 1.0833946028329215, + "learning_rate": 1.9641504828421772e-05, + "loss": 1.0811, + "step": 666 + }, + { + "epoch": 0.11309877066553624, + "grad_norm": 1.1053197486821598, + "learning_rate": 1.9640045960497742e-05, + "loss": 1.0701, + "step": 667 + }, + { + "epoch": 0.11326833403984739, + "grad_norm": 1.0625837036341024, + "learning_rate": 1.9638584184624744e-05, + "loss": 1.0623, + "step": 668 + }, + { + "epoch": 0.11343789741415854, + "grad_norm": 1.123356491188546, + "learning_rate": 1.963711950124372e-05, + "loss": 1.0898, + "step": 669 + }, + { + "epoch": 0.11360746078846969, + "grad_norm": 1.019395746112488, + "learning_rate": 1.9635651910796505e-05, + "loss": 1.0126, + "step": 670 + }, + { + "epoch": 0.11377702416278083, + "grad_norm": 1.104573382764544, + "learning_rate": 1.963418141372579e-05, + "loss": 1.0903, + "step": 671 + }, + { + "epoch": 0.11394658753709198, + "grad_norm": 1.0517837405698718, + "learning_rate": 1.9632708010475166e-05, + "loss": 1.0681, + "step": 672 + }, + { + "epoch": 0.11411615091140313, + "grad_norm": 1.1181651011054694, + "learning_rate": 1.9631231701489083e-05, + "loss": 1.0921, + "step": 673 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 1.1365544432418553, + "learning_rate": 1.9629752487212875e-05, + "loss": 1.0869, + "step": 674 + }, + { + "epoch": 0.11445527766002543, + "grad_norm": 1.1386681004720671, + "learning_rate": 1.962827036809275e-05, + "loss": 1.0506, + "step": 675 + }, + { + "epoch": 0.11462484103433658, + "grad_norm": 0.6547819271104519, + "learning_rate": 1.96267853445758e-05, + "loss": 0.8654, + "step": 676 + }, + { + "epoch": 0.11479440440864773, + "grad_norm": 1.1168296079216966, + "learning_rate": 1.9625297417109982e-05, + "loss": 1.0856, + "step": 677 + }, + { + "epoch": 0.11496396778295888, + "grad_norm": 1.1578194105132944, + "learning_rate": 1.9623806586144133e-05, + "loss": 1.051, + "step": 678 + }, + { + "epoch": 0.11513353115727003, + "grad_norm": 1.1104768251752244, + "learning_rate": 1.962231285212797e-05, + "loss": 1.08, + "step": 679 + }, + { + "epoch": 0.11530309453158118, + "grad_norm": 1.122158328097251, + "learning_rate": 1.962081621551208e-05, + "loss": 1.0916, + "step": 680 + }, + { + "epoch": 0.11547265790589233, + "grad_norm": 1.107177574310001, + "learning_rate": 1.9619316676747928e-05, + "loss": 1.0516, + "step": 681 + }, + { + "epoch": 0.11564222128020347, + "grad_norm": 1.1810586326751367, + "learning_rate": 1.9617814236287856e-05, + "loss": 1.0952, + "step": 682 + }, + { + "epoch": 0.11581178465451462, + "grad_norm": 1.1585617155722383, + "learning_rate": 1.9616308894585078e-05, + "loss": 1.0629, + "step": 683 + }, + { + "epoch": 0.11598134802882577, + "grad_norm": 1.0441428322772266, + "learning_rate": 1.9614800652093685e-05, + "loss": 1.0452, + "step": 684 + }, + { + "epoch": 0.11615091140313692, + "grad_norm": 1.1703165109066946, + "learning_rate": 1.9613289509268647e-05, + "loss": 1.0712, + "step": 685 + }, + { + "epoch": 0.11632047477744807, + "grad_norm": 1.0760045428043057, + "learning_rate": 1.9611775466565797e-05, + "loss": 1.0481, + "step": 686 + }, + { + "epoch": 0.11649003815175922, + "grad_norm": 1.0544541664556326, + "learning_rate": 1.9610258524441855e-05, + "loss": 1.0559, + "step": 687 + }, + { + "epoch": 0.11665960152607037, + "grad_norm": 1.1774069881309783, + "learning_rate": 1.9608738683354413e-05, + "loss": 1.0504, + "step": 688 + }, + { + "epoch": 0.11682916490038152, + "grad_norm": 1.1322872719387318, + "learning_rate": 1.9607215943761933e-05, + "loss": 1.0403, + "step": 689 + }, + { + "epoch": 0.11699872827469267, + "grad_norm": 1.1226384251199562, + "learning_rate": 1.9605690306123755e-05, + "loss": 1.0594, + "step": 690 + }, + { + "epoch": 0.11716829164900382, + "grad_norm": 1.2064638830304784, + "learning_rate": 1.960416177090009e-05, + "loss": 1.0815, + "step": 691 + }, + { + "epoch": 0.11733785502331497, + "grad_norm": 1.1390336467265723, + "learning_rate": 1.960263033855203e-05, + "loss": 1.0897, + "step": 692 + }, + { + "epoch": 0.11750741839762611, + "grad_norm": 1.0758109815913908, + "learning_rate": 1.960109600954153e-05, + "loss": 1.0271, + "step": 693 + }, + { + "epoch": 0.11767698177193726, + "grad_norm": 1.0258838504650964, + "learning_rate": 1.959955878433143e-05, + "loss": 1.0671, + "step": 694 + }, + { + "epoch": 0.11784654514624841, + "grad_norm": 1.0518237057593784, + "learning_rate": 1.9598018663385437e-05, + "loss": 1.0289, + "step": 695 + }, + { + "epoch": 0.11801610852055956, + "grad_norm": 1.0357382821991248, + "learning_rate": 1.9596475647168133e-05, + "loss": 1.0139, + "step": 696 + }, + { + "epoch": 0.11818567189487071, + "grad_norm": 1.0819302054485556, + "learning_rate": 1.9594929736144978e-05, + "loss": 1.0851, + "step": 697 + }, + { + "epoch": 0.11835523526918186, + "grad_norm": 1.0772252705960152, + "learning_rate": 1.959338093078229e-05, + "loss": 1.0775, + "step": 698 + }, + { + "epoch": 0.11852479864349301, + "grad_norm": 1.108405657406484, + "learning_rate": 1.9591829231547278e-05, + "loss": 1.018, + "step": 699 + }, + { + "epoch": 0.11869436201780416, + "grad_norm": 1.0818438008956826, + "learning_rate": 1.9590274638908018e-05, + "loss": 1.0514, + "step": 700 + }, + { + "epoch": 0.11886392539211531, + "grad_norm": 1.1066598853050535, + "learning_rate": 1.9588717153333456e-05, + "loss": 1.0678, + "step": 701 + }, + { + "epoch": 0.11903348876642646, + "grad_norm": 0.67953328338625, + "learning_rate": 1.9587156775293408e-05, + "loss": 0.9174, + "step": 702 + }, + { + "epoch": 0.1192030521407376, + "grad_norm": 1.1181498525515325, + "learning_rate": 1.958559350525857e-05, + "loss": 1.0284, + "step": 703 + }, + { + "epoch": 0.11937261551504876, + "grad_norm": 1.040872193249211, + "learning_rate": 1.958402734370051e-05, + "loss": 0.9937, + "step": 704 + }, + { + "epoch": 0.1195421788893599, + "grad_norm": 1.1189194083937848, + "learning_rate": 1.9582458291091664e-05, + "loss": 1.0689, + "step": 705 + }, + { + "epoch": 0.11971174226367105, + "grad_norm": 1.11133344212292, + "learning_rate": 1.9580886347905338e-05, + "loss": 1.0595, + "step": 706 + }, + { + "epoch": 0.1198813056379822, + "grad_norm": 1.1233961546509057, + "learning_rate": 1.957931151461572e-05, + "loss": 1.0562, + "step": 707 + }, + { + "epoch": 0.12005086901229335, + "grad_norm": 1.1131976361459097, + "learning_rate": 1.957773379169785e-05, + "loss": 1.0864, + "step": 708 + }, + { + "epoch": 0.12022043238660449, + "grad_norm": 1.0394011428209982, + "learning_rate": 1.957615317962767e-05, + "loss": 1.065, + "step": 709 + }, + { + "epoch": 0.12038999576091564, + "grad_norm": 1.0447190779516728, + "learning_rate": 1.9574569678881965e-05, + "loss": 1.0048, + "step": 710 + }, + { + "epoch": 0.12055955913522678, + "grad_norm": 1.0483807763582313, + "learning_rate": 1.9572983289938406e-05, + "loss": 1.0364, + "step": 711 + }, + { + "epoch": 0.12072912250953793, + "grad_norm": 1.0515683963858184, + "learning_rate": 1.9571394013275534e-05, + "loss": 1.085, + "step": 712 + }, + { + "epoch": 0.12089868588384908, + "grad_norm": 1.0619953761084722, + "learning_rate": 1.9569801849372757e-05, + "loss": 1.0674, + "step": 713 + }, + { + "epoch": 0.12106824925816023, + "grad_norm": 1.0105711943887983, + "learning_rate": 1.9568206798710354e-05, + "loss": 1.0215, + "step": 714 + }, + { + "epoch": 0.12123781263247138, + "grad_norm": 1.0981892715403259, + "learning_rate": 1.956660886176948e-05, + "loss": 1.0672, + "step": 715 + }, + { + "epoch": 0.12140737600678253, + "grad_norm": 1.1217723541913862, + "learning_rate": 1.9565008039032158e-05, + "loss": 1.0582, + "step": 716 + }, + { + "epoch": 0.12157693938109368, + "grad_norm": 1.1402128480032945, + "learning_rate": 1.9563404330981276e-05, + "loss": 1.0469, + "step": 717 + }, + { + "epoch": 0.12174650275540483, + "grad_norm": 1.0498357097710411, + "learning_rate": 1.9561797738100602e-05, + "loss": 1.0508, + "step": 718 + }, + { + "epoch": 0.12191606612971598, + "grad_norm": 1.0806630544206426, + "learning_rate": 1.956018826087477e-05, + "loss": 1.0161, + "step": 719 + }, + { + "epoch": 0.12208562950402713, + "grad_norm": 1.0926475072234518, + "learning_rate": 1.9558575899789284e-05, + "loss": 1.03, + "step": 720 + }, + { + "epoch": 0.12225519287833828, + "grad_norm": 1.0522616284189565, + "learning_rate": 1.9556960655330512e-05, + "loss": 1.0778, + "step": 721 + }, + { + "epoch": 0.12242475625264942, + "grad_norm": 1.1103142898686582, + "learning_rate": 1.9555342527985703e-05, + "loss": 1.0836, + "step": 722 + }, + { + "epoch": 0.12259431962696057, + "grad_norm": 1.0670638635131737, + "learning_rate": 1.955372151824297e-05, + "loss": 1.0628, + "step": 723 + }, + { + "epoch": 0.12276388300127172, + "grad_norm": 1.0692612826387502, + "learning_rate": 1.955209762659129e-05, + "loss": 1.0743, + "step": 724 + }, + { + "epoch": 0.12293344637558287, + "grad_norm": 1.1134326946024073, + "learning_rate": 1.955047085352052e-05, + "loss": 1.0914, + "step": 725 + }, + { + "epoch": 0.12310300974989402, + "grad_norm": 1.066456579328852, + "learning_rate": 1.954884119952138e-05, + "loss": 1.0896, + "step": 726 + }, + { + "epoch": 0.12327257312420517, + "grad_norm": 1.0516259676360957, + "learning_rate": 1.954720866508546e-05, + "loss": 1.0571, + "step": 727 + }, + { + "epoch": 0.12344213649851632, + "grad_norm": 1.1091564482378367, + "learning_rate": 1.9545573250705216e-05, + "loss": 1.0591, + "step": 728 + }, + { + "epoch": 0.12361169987282747, + "grad_norm": 1.0736827541771194, + "learning_rate": 1.954393495687398e-05, + "loss": 1.0592, + "step": 729 + }, + { + "epoch": 0.12378126324713862, + "grad_norm": 1.0676995150976312, + "learning_rate": 1.9542293784085943e-05, + "loss": 1.0344, + "step": 730 + }, + { + "epoch": 0.12395082662144977, + "grad_norm": 1.109094421697646, + "learning_rate": 1.9540649732836177e-05, + "loss": 1.036, + "step": 731 + }, + { + "epoch": 0.12412038999576092, + "grad_norm": 1.0992163288953916, + "learning_rate": 1.953900280362061e-05, + "loss": 1.052, + "step": 732 + }, + { + "epoch": 0.12428995337007206, + "grad_norm": 1.1143313658995975, + "learning_rate": 1.9537352996936046e-05, + "loss": 1.0596, + "step": 733 + }, + { + "epoch": 0.12445951674438321, + "grad_norm": 1.1568450359004148, + "learning_rate": 1.953570031328015e-05, + "loss": 1.0661, + "step": 734 + }, + { + "epoch": 0.12462908011869436, + "grad_norm": 1.0609061004604936, + "learning_rate": 1.953404475315146e-05, + "loss": 1.058, + "step": 735 + }, + { + "epoch": 0.12479864349300551, + "grad_norm": 1.0871121305779712, + "learning_rate": 1.9532386317049387e-05, + "loss": 1.0575, + "step": 736 + }, + { + "epoch": 0.12496820686731666, + "grad_norm": 1.1036971008330496, + "learning_rate": 1.9530725005474195e-05, + "loss": 1.0722, + "step": 737 + }, + { + "epoch": 0.1251377702416278, + "grad_norm": 1.0546375145746436, + "learning_rate": 1.9529060818927032e-05, + "loss": 1.0666, + "step": 738 + }, + { + "epoch": 0.12530733361593896, + "grad_norm": 1.155859502035293, + "learning_rate": 1.9527393757909895e-05, + "loss": 1.042, + "step": 739 + }, + { + "epoch": 0.1254768969902501, + "grad_norm": 1.1256224150237286, + "learning_rate": 1.9525723822925662e-05, + "loss": 1.0458, + "step": 740 + }, + { + "epoch": 0.12564646036456126, + "grad_norm": 1.1030769129505762, + "learning_rate": 1.9524051014478078e-05, + "loss": 1.0713, + "step": 741 + }, + { + "epoch": 0.1258160237388724, + "grad_norm": 1.1048862735764426, + "learning_rate": 1.952237533307175e-05, + "loss": 1.029, + "step": 742 + }, + { + "epoch": 0.12598558711318356, + "grad_norm": 1.1749086704818206, + "learning_rate": 1.9520696779212144e-05, + "loss": 1.0751, + "step": 743 + }, + { + "epoch": 0.1261551504874947, + "grad_norm": 1.181040183652613, + "learning_rate": 1.951901535340561e-05, + "loss": 1.0259, + "step": 744 + }, + { + "epoch": 0.12632471386180585, + "grad_norm": 1.0953267398854833, + "learning_rate": 1.9517331056159353e-05, + "loss": 1.0613, + "step": 745 + }, + { + "epoch": 0.126494277236117, + "grad_norm": 1.1008872632118907, + "learning_rate": 1.9515643887981445e-05, + "loss": 1.0352, + "step": 746 + }, + { + "epoch": 0.12666384061042815, + "grad_norm": 1.0745130679535873, + "learning_rate": 1.9513953849380826e-05, + "loss": 1.0484, + "step": 747 + }, + { + "epoch": 0.1268334039847393, + "grad_norm": 1.057377233277173, + "learning_rate": 1.9512260940867298e-05, + "loss": 1.0782, + "step": 748 + }, + { + "epoch": 0.12700296735905045, + "grad_norm": 1.0371938329011086, + "learning_rate": 1.9510565162951538e-05, + "loss": 1.0262, + "step": 749 + }, + { + "epoch": 0.1271725307333616, + "grad_norm": 1.1109505303344525, + "learning_rate": 1.950886651614508e-05, + "loss": 1.0357, + "step": 750 + }, + { + "epoch": 0.12734209410767275, + "grad_norm": 1.0608914607950886, + "learning_rate": 1.950716500096032e-05, + "loss": 1.0517, + "step": 751 + }, + { + "epoch": 0.1275116574819839, + "grad_norm": 1.185057809555846, + "learning_rate": 1.9505460617910537e-05, + "loss": 1.0526, + "step": 752 + }, + { + "epoch": 0.12768122085629505, + "grad_norm": 1.1714793370562924, + "learning_rate": 1.9503753367509855e-05, + "loss": 1.1017, + "step": 753 + }, + { + "epoch": 0.1278507842306062, + "grad_norm": 1.0712385935965227, + "learning_rate": 1.9502043250273274e-05, + "loss": 1.0395, + "step": 754 + }, + { + "epoch": 0.12802034760491734, + "grad_norm": 1.1018897785658721, + "learning_rate": 1.950033026671665e-05, + "loss": 1.0623, + "step": 755 + }, + { + "epoch": 0.1281899109792285, + "grad_norm": 1.0541590259684965, + "learning_rate": 1.949861441735672e-05, + "loss": 1.0639, + "step": 756 + }, + { + "epoch": 0.12835947435353964, + "grad_norm": 1.103876190414786, + "learning_rate": 1.949689570271107e-05, + "loss": 1.053, + "step": 757 + }, + { + "epoch": 0.1285290377278508, + "grad_norm": 1.149601209818205, + "learning_rate": 1.9495174123298156e-05, + "loss": 1.074, + "step": 758 + }, + { + "epoch": 0.12869860110216194, + "grad_norm": 1.1288929168116069, + "learning_rate": 1.949344967963729e-05, + "loss": 1.0311, + "step": 759 + }, + { + "epoch": 0.1288681644764731, + "grad_norm": 1.1542075140935122, + "learning_rate": 1.949172237224867e-05, + "loss": 1.0535, + "step": 760 + }, + { + "epoch": 0.12903772785078424, + "grad_norm": 1.1788709496912684, + "learning_rate": 1.9489992201653337e-05, + "loss": 1.0459, + "step": 761 + }, + { + "epoch": 0.1292072912250954, + "grad_norm": 1.183206255652252, + "learning_rate": 1.9488259168373198e-05, + "loss": 1.0918, + "step": 762 + }, + { + "epoch": 0.12937685459940654, + "grad_norm": 1.0696582001300654, + "learning_rate": 1.948652327293103e-05, + "loss": 1.0519, + "step": 763 + }, + { + "epoch": 0.1295464179737177, + "grad_norm": 1.156945304413662, + "learning_rate": 1.9484784515850474e-05, + "loss": 1.0493, + "step": 764 + }, + { + "epoch": 0.12971598134802884, + "grad_norm": 1.2245080523618486, + "learning_rate": 1.9483042897656032e-05, + "loss": 1.0585, + "step": 765 + }, + { + "epoch": 0.12988554472233999, + "grad_norm": 1.057012084398467, + "learning_rate": 1.9481298418873063e-05, + "loss": 1.0238, + "step": 766 + }, + { + "epoch": 0.13005510809665113, + "grad_norm": 1.050657871575477, + "learning_rate": 1.94795510800278e-05, + "loss": 1.0341, + "step": 767 + }, + { + "epoch": 0.13022467147096228, + "grad_norm": 1.047017410590055, + "learning_rate": 1.9477800881647327e-05, + "loss": 1.0539, + "step": 768 + }, + { + "epoch": 0.13039423484527343, + "grad_norm": 1.065130739659555, + "learning_rate": 1.9476047824259602e-05, + "loss": 1.009, + "step": 769 + }, + { + "epoch": 0.13056379821958458, + "grad_norm": 1.0906992629658234, + "learning_rate": 1.9474291908393437e-05, + "loss": 1.0465, + "step": 770 + }, + { + "epoch": 0.13073336159389573, + "grad_norm": 1.0428024448618274, + "learning_rate": 1.947253313457851e-05, + "loss": 1.0535, + "step": 771 + }, + { + "epoch": 0.13090292496820688, + "grad_norm": 1.0792668868718736, + "learning_rate": 1.947077150334536e-05, + "loss": 1.0771, + "step": 772 + }, + { + "epoch": 0.13107248834251803, + "grad_norm": 1.0482438377755623, + "learning_rate": 1.946900701522539e-05, + "loss": 1.0274, + "step": 773 + }, + { + "epoch": 0.13124205171682918, + "grad_norm": 1.0694689387805867, + "learning_rate": 1.946723967075086e-05, + "loss": 1.0347, + "step": 774 + }, + { + "epoch": 0.13141161509114033, + "grad_norm": 0.708629489669686, + "learning_rate": 1.94654694704549e-05, + "loss": 0.9248, + "step": 775 + }, + { + "epoch": 0.13158117846545148, + "grad_norm": 1.1345356573802166, + "learning_rate": 1.9463696414871493e-05, + "loss": 1.0578, + "step": 776 + }, + { + "epoch": 0.13175074183976263, + "grad_norm": 1.1449463455835753, + "learning_rate": 1.946192050453549e-05, + "loss": 1.0535, + "step": 777 + }, + { + "epoch": 0.13192030521407375, + "grad_norm": 1.164004341203831, + "learning_rate": 1.946014173998259e-05, + "loss": 1.1073, + "step": 778 + }, + { + "epoch": 0.1320898685883849, + "grad_norm": 1.0227968562490022, + "learning_rate": 1.9458360121749372e-05, + "loss": 1.0577, + "step": 779 + }, + { + "epoch": 0.13225943196269604, + "grad_norm": 1.066166458355352, + "learning_rate": 1.9456575650373267e-05, + "loss": 1.0674, + "step": 780 + }, + { + "epoch": 0.1324289953370072, + "grad_norm": 1.099183425357359, + "learning_rate": 1.9454788326392558e-05, + "loss": 1.0927, + "step": 781 + }, + { + "epoch": 0.13259855871131834, + "grad_norm": 1.1327547948403307, + "learning_rate": 1.9452998150346403e-05, + "loss": 0.9931, + "step": 782 + }, + { + "epoch": 0.1327681220856295, + "grad_norm": 1.0818730994846633, + "learning_rate": 1.9451205122774815e-05, + "loss": 1.0927, + "step": 783 + }, + { + "epoch": 0.13293768545994064, + "grad_norm": 1.1058974106504893, + "learning_rate": 1.9449409244218662e-05, + "loss": 1.1077, + "step": 784 + }, + { + "epoch": 0.1331072488342518, + "grad_norm": 1.0556971309698218, + "learning_rate": 1.944761051521968e-05, + "loss": 1.0346, + "step": 785 + }, + { + "epoch": 0.13327681220856294, + "grad_norm": 1.1563290924721255, + "learning_rate": 1.9445808936320457e-05, + "loss": 1.1217, + "step": 786 + }, + { + "epoch": 0.1334463755828741, + "grad_norm": 1.0528888161777412, + "learning_rate": 1.9444004508064446e-05, + "loss": 1.0575, + "step": 787 + }, + { + "epoch": 0.13361593895718524, + "grad_norm": 0.9965187860897641, + "learning_rate": 1.944219723099596e-05, + "loss": 1.012, + "step": 788 + }, + { + "epoch": 0.1337855023314964, + "grad_norm": 1.0152070539684257, + "learning_rate": 1.944038710566017e-05, + "loss": 1.0396, + "step": 789 + }, + { + "epoch": 0.13395506570580754, + "grad_norm": 1.0311329132119162, + "learning_rate": 1.9438574132603106e-05, + "loss": 1.0219, + "step": 790 + }, + { + "epoch": 0.13412462908011868, + "grad_norm": 1.0310376029257697, + "learning_rate": 1.943675831237165e-05, + "loss": 1.0241, + "step": 791 + }, + { + "epoch": 0.13429419245442983, + "grad_norm": 1.02721440556268, + "learning_rate": 1.9434939645513556e-05, + "loss": 1.0143, + "step": 792 + }, + { + "epoch": 0.13446375582874098, + "grad_norm": 1.0330958900978466, + "learning_rate": 1.9433118132577432e-05, + "loss": 1.0595, + "step": 793 + }, + { + "epoch": 0.13463331920305213, + "grad_norm": 1.0765605631263437, + "learning_rate": 1.9431293774112737e-05, + "loss": 1.0576, + "step": 794 + }, + { + "epoch": 0.13480288257736328, + "grad_norm": 1.0281136330649452, + "learning_rate": 1.94294665706698e-05, + "loss": 1.0162, + "step": 795 + }, + { + "epoch": 0.13497244595167443, + "grad_norm": 1.0726772948352115, + "learning_rate": 1.94276365227998e-05, + "loss": 1.0614, + "step": 796 + }, + { + "epoch": 0.13514200932598558, + "grad_norm": 1.0814789889013592, + "learning_rate": 1.9425803631054773e-05, + "loss": 1.0749, + "step": 797 + }, + { + "epoch": 0.13531157270029673, + "grad_norm": 1.1647803406374613, + "learning_rate": 1.9423967895987625e-05, + "loss": 1.0062, + "step": 798 + }, + { + "epoch": 0.13548113607460788, + "grad_norm": 1.078971477580237, + "learning_rate": 1.94221293181521e-05, + "loss": 1.038, + "step": 799 + }, + { + "epoch": 0.13565069944891903, + "grad_norm": 1.0528174425184071, + "learning_rate": 1.942028789810282e-05, + "loss": 1.068, + "step": 800 + }, + { + "epoch": 0.13582026282323018, + "grad_norm": 1.1118827397760795, + "learning_rate": 1.941844363639525e-05, + "loss": 1.0472, + "step": 801 + }, + { + "epoch": 0.13598982619754132, + "grad_norm": 1.0636213590805914, + "learning_rate": 1.9416596533585717e-05, + "loss": 1.0542, + "step": 802 + }, + { + "epoch": 0.13615938957185247, + "grad_norm": 1.1154868381971752, + "learning_rate": 1.9414746590231407e-05, + "loss": 1.074, + "step": 803 + }, + { + "epoch": 0.13632895294616362, + "grad_norm": 1.0648594969353748, + "learning_rate": 1.9412893806890358e-05, + "loss": 1.0796, + "step": 804 + }, + { + "epoch": 0.13649851632047477, + "grad_norm": 1.004976249971096, + "learning_rate": 1.941103818412147e-05, + "loss": 1.0544, + "step": 805 + }, + { + "epoch": 0.13666807969478592, + "grad_norm": 1.0367836077040782, + "learning_rate": 1.94091797224845e-05, + "loss": 1.0323, + "step": 806 + }, + { + "epoch": 0.13683764306909707, + "grad_norm": 1.0262546052925723, + "learning_rate": 1.9407318422540057e-05, + "loss": 0.9813, + "step": 807 + }, + { + "epoch": 0.13700720644340822, + "grad_norm": 1.0475035535802888, + "learning_rate": 1.9405454284849604e-05, + "loss": 1.0546, + "step": 808 + }, + { + "epoch": 0.13717676981771937, + "grad_norm": 1.0903173604692298, + "learning_rate": 1.9403587309975467e-05, + "loss": 1.0455, + "step": 809 + }, + { + "epoch": 0.13734633319203052, + "grad_norm": 1.0426227028523483, + "learning_rate": 1.9401717498480825e-05, + "loss": 1.0694, + "step": 810 + }, + { + "epoch": 0.13751589656634167, + "grad_norm": 1.0346758417438304, + "learning_rate": 1.939984485092971e-05, + "loss": 1.0504, + "step": 811 + }, + { + "epoch": 0.13768545994065282, + "grad_norm": 1.1103498381770676, + "learning_rate": 1.9397969367887014e-05, + "loss": 1.0392, + "step": 812 + }, + { + "epoch": 0.13785502331496396, + "grad_norm": 1.0873796773509765, + "learning_rate": 1.9396091049918478e-05, + "loss": 1.0285, + "step": 813 + }, + { + "epoch": 0.1380245866892751, + "grad_norm": 1.0733102934679524, + "learning_rate": 1.9394209897590707e-05, + "loss": 1.0791, + "step": 814 + }, + { + "epoch": 0.13819415006358626, + "grad_norm": 1.124458655592655, + "learning_rate": 1.9392325911471154e-05, + "loss": 1.0732, + "step": 815 + }, + { + "epoch": 0.1383637134378974, + "grad_norm": 1.1732003630116532, + "learning_rate": 1.939043909212813e-05, + "loss": 1.0881, + "step": 816 + }, + { + "epoch": 0.13853327681220856, + "grad_norm": 1.1864935574802329, + "learning_rate": 1.93885494401308e-05, + "loss": 1.0796, + "step": 817 + }, + { + "epoch": 0.1387028401865197, + "grad_norm": 1.08967866372316, + "learning_rate": 1.9386656956049182e-05, + "loss": 1.0818, + "step": 818 + }, + { + "epoch": 0.13887240356083086, + "grad_norm": 1.1246588499786248, + "learning_rate": 1.9384761640454152e-05, + "loss": 1.0862, + "step": 819 + }, + { + "epoch": 0.139041966935142, + "grad_norm": 1.084218258164674, + "learning_rate": 1.9382863493917433e-05, + "loss": 1.0448, + "step": 820 + }, + { + "epoch": 0.13921153030945316, + "grad_norm": 1.104604455415271, + "learning_rate": 1.9380962517011614e-05, + "loss": 1.0687, + "step": 821 + }, + { + "epoch": 0.1393810936837643, + "grad_norm": 1.028278427220313, + "learning_rate": 1.9379058710310124e-05, + "loss": 1.016, + "step": 822 + }, + { + "epoch": 0.13955065705807546, + "grad_norm": 1.077884981811719, + "learning_rate": 1.9377152074387253e-05, + "loss": 1.0515, + "step": 823 + }, + { + "epoch": 0.1397202204323866, + "grad_norm": 1.185172547417553, + "learning_rate": 1.9375242609818144e-05, + "loss": 1.0673, + "step": 824 + }, + { + "epoch": 0.13988978380669775, + "grad_norm": 1.082380869634408, + "learning_rate": 1.9373330317178797e-05, + "loss": 1.0594, + "step": 825 + }, + { + "epoch": 0.1400593471810089, + "grad_norm": 1.1200443818381345, + "learning_rate": 1.9371415197046054e-05, + "loss": 1.0488, + "step": 826 + }, + { + "epoch": 0.14022891055532005, + "grad_norm": 1.1395052604783387, + "learning_rate": 1.936949724999762e-05, + "loss": 1.0788, + "step": 827 + }, + { + "epoch": 0.1403984739296312, + "grad_norm": 1.1674257978885036, + "learning_rate": 1.9367576476612048e-05, + "loss": 1.0628, + "step": 828 + }, + { + "epoch": 0.14056803730394235, + "grad_norm": 1.0619372371369435, + "learning_rate": 1.9365652877468747e-05, + "loss": 1.0531, + "step": 829 + }, + { + "epoch": 0.1407376006782535, + "grad_norm": 1.0434936336092933, + "learning_rate": 1.936372645314798e-05, + "loss": 1.0164, + "step": 830 + }, + { + "epoch": 0.14090716405256465, + "grad_norm": 1.1004725737648793, + "learning_rate": 1.936179720423085e-05, + "loss": 1.0851, + "step": 831 + }, + { + "epoch": 0.1410767274268758, + "grad_norm": 1.0589604061310893, + "learning_rate": 1.9359865131299328e-05, + "loss": 1.0321, + "step": 832 + }, + { + "epoch": 0.14124629080118695, + "grad_norm": 1.0475759575945085, + "learning_rate": 1.9357930234936228e-05, + "loss": 1.0517, + "step": 833 + }, + { + "epoch": 0.1414158541754981, + "grad_norm": 1.045474612726252, + "learning_rate": 1.9355992515725216e-05, + "loss": 1.0448, + "step": 834 + }, + { + "epoch": 0.14158541754980924, + "grad_norm": 0.7745132914901004, + "learning_rate": 1.935405197425081e-05, + "loss": 0.8728, + "step": 835 + }, + { + "epoch": 0.1417549809241204, + "grad_norm": 1.093659455283772, + "learning_rate": 1.935210861109838e-05, + "loss": 1.0387, + "step": 836 + }, + { + "epoch": 0.14192454429843154, + "grad_norm": 1.1527962599465735, + "learning_rate": 1.9350162426854152e-05, + "loss": 1.0857, + "step": 837 + }, + { + "epoch": 0.1420941076727427, + "grad_norm": 1.0862456354506587, + "learning_rate": 1.9348213422105192e-05, + "loss": 1.0166, + "step": 838 + }, + { + "epoch": 0.14226367104705384, + "grad_norm": 1.128346863922026, + "learning_rate": 1.9346261597439427e-05, + "loss": 1.0729, + "step": 839 + }, + { + "epoch": 0.142433234421365, + "grad_norm": 1.1093450255237425, + "learning_rate": 1.9344306953445632e-05, + "loss": 1.0602, + "step": 840 + }, + { + "epoch": 0.14260279779567614, + "grad_norm": 0.7005186504674276, + "learning_rate": 1.9342349490713427e-05, + "loss": 0.8628, + "step": 841 + }, + { + "epoch": 0.1427723611699873, + "grad_norm": 1.048667347671691, + "learning_rate": 1.934038920983329e-05, + "loss": 1.0228, + "step": 842 + }, + { + "epoch": 0.14294192454429844, + "grad_norm": 1.1304776145369564, + "learning_rate": 1.9338426111396548e-05, + "loss": 1.0898, + "step": 843 + }, + { + "epoch": 0.1431114879186096, + "grad_norm": 1.1004750119851474, + "learning_rate": 1.9336460195995368e-05, + "loss": 1.0672, + "step": 844 + }, + { + "epoch": 0.14328105129292074, + "grad_norm": 1.1168790606557641, + "learning_rate": 1.933449146422278e-05, + "loss": 1.0568, + "step": 845 + }, + { + "epoch": 0.14345061466723188, + "grad_norm": 1.0614059811728171, + "learning_rate": 1.9332519916672656e-05, + "loss": 0.9996, + "step": 846 + }, + { + "epoch": 0.14362017804154303, + "grad_norm": 1.0596385562077324, + "learning_rate": 1.933054555393972e-05, + "loss": 1.0376, + "step": 847 + }, + { + "epoch": 0.14378974141585418, + "grad_norm": 1.0533999496487665, + "learning_rate": 1.932856837661954e-05, + "loss": 1.0727, + "step": 848 + }, + { + "epoch": 0.14395930479016533, + "grad_norm": 1.0231359277525693, + "learning_rate": 1.932658838530855e-05, + "loss": 1.0531, + "step": 849 + }, + { + "epoch": 0.14412886816447648, + "grad_norm": 1.0567186211411237, + "learning_rate": 1.9324605580604007e-05, + "loss": 1.0606, + "step": 850 + }, + { + "epoch": 0.14429843153878763, + "grad_norm": 1.0960097103405915, + "learning_rate": 1.9322619963104036e-05, + "loss": 1.0726, + "step": 851 + }, + { + "epoch": 0.14446799491309878, + "grad_norm": 0.9766166614205893, + "learning_rate": 1.93206315334076e-05, + "loss": 1.0158, + "step": 852 + }, + { + "epoch": 0.14463755828740993, + "grad_norm": 1.0086404438131777, + "learning_rate": 1.9318640292114526e-05, + "loss": 1.0015, + "step": 853 + }, + { + "epoch": 0.14480712166172108, + "grad_norm": 1.074452694617385, + "learning_rate": 1.9316646239825466e-05, + "loss": 1.0602, + "step": 854 + }, + { + "epoch": 0.14497668503603223, + "grad_norm": 1.0618087044378768, + "learning_rate": 1.9314649377141935e-05, + "loss": 1.0495, + "step": 855 + }, + { + "epoch": 0.14514624841034338, + "grad_norm": 0.7219827404997976, + "learning_rate": 1.9312649704666295e-05, + "loss": 0.8791, + "step": 856 + }, + { + "epoch": 0.14531581178465452, + "grad_norm": 1.1230848382726437, + "learning_rate": 1.9310647223001752e-05, + "loss": 1.053, + "step": 857 + }, + { + "epoch": 0.14548537515896567, + "grad_norm": 1.1494633063944097, + "learning_rate": 1.9308641932752362e-05, + "loss": 1.0129, + "step": 858 + }, + { + "epoch": 0.14565493853327682, + "grad_norm": 1.0218116847276328, + "learning_rate": 1.9306633834523022e-05, + "loss": 1.0269, + "step": 859 + }, + { + "epoch": 0.14582450190758797, + "grad_norm": 1.0961351371659203, + "learning_rate": 1.9304622928919486e-05, + "loss": 1.0127, + "step": 860 + }, + { + "epoch": 0.14599406528189912, + "grad_norm": 1.141579446852389, + "learning_rate": 1.9302609216548352e-05, + "loss": 1.0285, + "step": 861 + }, + { + "epoch": 0.14616362865621027, + "grad_norm": 1.0793211648155234, + "learning_rate": 1.9300592698017054e-05, + "loss": 1.0495, + "step": 862 + }, + { + "epoch": 0.14633319203052142, + "grad_norm": 1.1421417009184929, + "learning_rate": 1.929857337393389e-05, + "loss": 1.0903, + "step": 863 + }, + { + "epoch": 0.14650275540483257, + "grad_norm": 1.110017135756902, + "learning_rate": 1.9296551244907986e-05, + "loss": 1.0898, + "step": 864 + }, + { + "epoch": 0.14667231877914372, + "grad_norm": 1.0600519730432707, + "learning_rate": 1.929452631154933e-05, + "loss": 1.0365, + "step": 865 + }, + { + "epoch": 0.14684188215345487, + "grad_norm": 1.1028595800699648, + "learning_rate": 1.929249857446875e-05, + "loss": 1.0861, + "step": 866 + }, + { + "epoch": 0.14701144552776602, + "grad_norm": 1.0574596805822303, + "learning_rate": 1.9290468034277912e-05, + "loss": 1.0133, + "step": 867 + }, + { + "epoch": 0.14718100890207717, + "grad_norm": 1.054189464636296, + "learning_rate": 1.9288434691589343e-05, + "loss": 1.0333, + "step": 868 + }, + { + "epoch": 0.1473505722763883, + "grad_norm": 1.0141671044330676, + "learning_rate": 1.9286398547016398e-05, + "loss": 1.0779, + "step": 869 + }, + { + "epoch": 0.14752013565069944, + "grad_norm": 1.064214895313415, + "learning_rate": 1.9284359601173295e-05, + "loss": 1.0211, + "step": 870 + }, + { + "epoch": 0.14768969902501058, + "grad_norm": 0.7309553171238877, + "learning_rate": 1.928231785467508e-05, + "loss": 0.7908, + "step": 871 + }, + { + "epoch": 0.14785926239932173, + "grad_norm": 1.09157235299895, + "learning_rate": 1.9280273308137662e-05, + "loss": 1.0596, + "step": 872 + }, + { + "epoch": 0.14802882577363288, + "grad_norm": 1.0905388089422527, + "learning_rate": 1.9278225962177776e-05, + "loss": 1.079, + "step": 873 + }, + { + "epoch": 0.14819838914794403, + "grad_norm": 1.0773539631614544, + "learning_rate": 1.9276175817413013e-05, + "loss": 1.0536, + "step": 874 + }, + { + "epoch": 0.14836795252225518, + "grad_norm": 1.0712206356208391, + "learning_rate": 1.9274122874461808e-05, + "loss": 1.0556, + "step": 875 + }, + { + "epoch": 0.14853751589656633, + "grad_norm": 1.0417419947535282, + "learning_rate": 1.9272067133943432e-05, + "loss": 1.0004, + "step": 876 + }, + { + "epoch": 0.14870707927087748, + "grad_norm": 1.0486476963464526, + "learning_rate": 1.9270008596478008e-05, + "loss": 1.0165, + "step": 877 + }, + { + "epoch": 0.14887664264518863, + "grad_norm": 0.6863337643252526, + "learning_rate": 1.92679472626865e-05, + "loss": 0.8939, + "step": 878 + }, + { + "epoch": 0.14904620601949978, + "grad_norm": 1.0583938505102617, + "learning_rate": 1.9265883133190715e-05, + "loss": 1.0526, + "step": 879 + }, + { + "epoch": 0.14921576939381093, + "grad_norm": 1.1128101186298782, + "learning_rate": 1.9263816208613306e-05, + "loss": 1.0686, + "step": 880 + }, + { + "epoch": 0.14938533276812208, + "grad_norm": 1.0404999748912096, + "learning_rate": 1.9261746489577767e-05, + "loss": 1.0311, + "step": 881 + }, + { + "epoch": 0.14955489614243322, + "grad_norm": 1.125837888018767, + "learning_rate": 1.925967397670843e-05, + "loss": 1.095, + "step": 882 + }, + { + "epoch": 0.14972445951674437, + "grad_norm": 1.0011525930771923, + "learning_rate": 1.9257598670630484e-05, + "loss": 1.038, + "step": 883 + }, + { + "epoch": 0.14989402289105552, + "grad_norm": 1.0791155860988666, + "learning_rate": 1.925552057196994e-05, + "loss": 1.0327, + "step": 884 + }, + { + "epoch": 0.15006358626536667, + "grad_norm": 1.0920643778551258, + "learning_rate": 1.9253439681353673e-05, + "loss": 1.0643, + "step": 885 + }, + { + "epoch": 0.15023314963967782, + "grad_norm": 1.0842220737705153, + "learning_rate": 1.925135599940938e-05, + "loss": 1.0686, + "step": 886 + }, + { + "epoch": 0.15040271301398897, + "grad_norm": 1.0443544799260835, + "learning_rate": 1.924926952676562e-05, + "loss": 1.1122, + "step": 887 + }, + { + "epoch": 0.15057227638830012, + "grad_norm": 1.067319759526223, + "learning_rate": 1.9247180264051777e-05, + "loss": 1.0375, + "step": 888 + }, + { + "epoch": 0.15074183976261127, + "grad_norm": 0.6462489530664455, + "learning_rate": 1.9245088211898086e-05, + "loss": 0.8483, + "step": 889 + }, + { + "epoch": 0.15091140313692242, + "grad_norm": 1.1231491563255163, + "learning_rate": 1.9242993370935622e-05, + "loss": 1.0804, + "step": 890 + }, + { + "epoch": 0.15108096651123357, + "grad_norm": 1.0525430337391257, + "learning_rate": 1.9240895741796297e-05, + "loss": 1.0425, + "step": 891 + }, + { + "epoch": 0.15125052988554472, + "grad_norm": 1.0477757204520055, + "learning_rate": 1.9238795325112867e-05, + "loss": 1.0448, + "step": 892 + }, + { + "epoch": 0.15142009325985586, + "grad_norm": 1.0916191614000863, + "learning_rate": 1.9236692121518934e-05, + "loss": 1.0289, + "step": 893 + }, + { + "epoch": 0.151589656634167, + "grad_norm": 1.1102801951240548, + "learning_rate": 1.9234586131648933e-05, + "loss": 1.0631, + "step": 894 + }, + { + "epoch": 0.15175922000847816, + "grad_norm": 1.1220606572188465, + "learning_rate": 1.923247735613814e-05, + "loss": 1.0573, + "step": 895 + }, + { + "epoch": 0.1519287833827893, + "grad_norm": 0.9998438621890287, + "learning_rate": 1.9230365795622675e-05, + "loss": 1.0113, + "step": 896 + }, + { + "epoch": 0.15209834675710046, + "grad_norm": 1.0752354115172118, + "learning_rate": 1.9228251450739495e-05, + "loss": 1.0174, + "step": 897 + }, + { + "epoch": 0.1522679101314116, + "grad_norm": 1.1136183818569536, + "learning_rate": 1.92261343221264e-05, + "loss": 1.0307, + "step": 898 + }, + { + "epoch": 0.15243747350572276, + "grad_norm": 1.041094664900339, + "learning_rate": 1.922401441042203e-05, + "loss": 1.0793, + "step": 899 + }, + { + "epoch": 0.1526070368800339, + "grad_norm": 1.0273988853337395, + "learning_rate": 1.9221891716265865e-05, + "loss": 1.0478, + "step": 900 + }, + { + "epoch": 0.15277660025434506, + "grad_norm": 1.0150885591380276, + "learning_rate": 1.921976624029821e-05, + "loss": 1.0542, + "step": 901 + }, + { + "epoch": 0.1529461636286562, + "grad_norm": 1.1072883536606641, + "learning_rate": 1.9217637983160234e-05, + "loss": 1.0541, + "step": 902 + }, + { + "epoch": 0.15311572700296736, + "grad_norm": 1.1024232287688263, + "learning_rate": 1.9215506945493933e-05, + "loss": 1.0279, + "step": 903 + }, + { + "epoch": 0.1532852903772785, + "grad_norm": 1.1046565029368205, + "learning_rate": 1.921337312794213e-05, + "loss": 1.0603, + "step": 904 + }, + { + "epoch": 0.15345485375158965, + "grad_norm": 1.1020144538617223, + "learning_rate": 1.92112365311485e-05, + "loss": 1.0668, + "step": 905 + }, + { + "epoch": 0.1536244171259008, + "grad_norm": 0.6306196845086575, + "learning_rate": 1.9209097155757562e-05, + "loss": 0.8127, + "step": 906 + }, + { + "epoch": 0.15379398050021195, + "grad_norm": 1.1222438078603456, + "learning_rate": 1.9206955002414662e-05, + "loss": 1.0866, + "step": 907 + }, + { + "epoch": 0.1539635438745231, + "grad_norm": 1.080412892285181, + "learning_rate": 1.920481007176598e-05, + "loss": 1.0516, + "step": 908 + }, + { + "epoch": 0.15413310724883425, + "grad_norm": 1.1080426198720974, + "learning_rate": 1.920266236445855e-05, + "loss": 1.0851, + "step": 909 + }, + { + "epoch": 0.1543026706231454, + "grad_norm": 1.0715007809411334, + "learning_rate": 1.920051188114023e-05, + "loss": 1.0426, + "step": 910 + }, + { + "epoch": 0.15447223399745655, + "grad_norm": 1.0474262655109656, + "learning_rate": 1.919835862245972e-05, + "loss": 1.0341, + "step": 911 + }, + { + "epoch": 0.1546417973717677, + "grad_norm": 1.0756568049177015, + "learning_rate": 1.9196202589066556e-05, + "loss": 1.0475, + "step": 912 + }, + { + "epoch": 0.15481136074607885, + "grad_norm": 1.043877036715147, + "learning_rate": 1.919404378161111e-05, + "loss": 1.034, + "step": 913 + }, + { + "epoch": 0.15498092412039, + "grad_norm": 1.0545499129547016, + "learning_rate": 1.9191882200744602e-05, + "loss": 1.0603, + "step": 914 + }, + { + "epoch": 0.15515048749470114, + "grad_norm": 1.1419323750225032, + "learning_rate": 1.918971784711907e-05, + "loss": 1.0524, + "step": 915 + }, + { + "epoch": 0.1553200508690123, + "grad_norm": 1.0848607341272305, + "learning_rate": 1.91875507213874e-05, + "loss": 1.0129, + "step": 916 + }, + { + "epoch": 0.15548961424332344, + "grad_norm": 1.0304906798587208, + "learning_rate": 1.918538082420332e-05, + "loss": 1.0605, + "step": 917 + }, + { + "epoch": 0.1556591776176346, + "grad_norm": 1.0863766933224028, + "learning_rate": 1.918320815622137e-05, + "loss": 1.0481, + "step": 918 + }, + { + "epoch": 0.15582874099194574, + "grad_norm": 1.0710321794041036, + "learning_rate": 1.9181032718096957e-05, + "loss": 1.0496, + "step": 919 + }, + { + "epoch": 0.1559983043662569, + "grad_norm": 1.0297742342897147, + "learning_rate": 1.9178854510486298e-05, + "loss": 0.9914, + "step": 920 + }, + { + "epoch": 0.15616786774056804, + "grad_norm": 1.0732974568090625, + "learning_rate": 1.9176673534046465e-05, + "loss": 1.013, + "step": 921 + }, + { + "epoch": 0.1563374311148792, + "grad_norm": 1.088103948993506, + "learning_rate": 1.9174489789435348e-05, + "loss": 1.0226, + "step": 922 + }, + { + "epoch": 0.15650699448919034, + "grad_norm": 1.1119656602506007, + "learning_rate": 1.9172303277311686e-05, + "loss": 1.0285, + "step": 923 + }, + { + "epoch": 0.1566765578635015, + "grad_norm": 1.1030374236288156, + "learning_rate": 1.917011399833504e-05, + "loss": 1.0452, + "step": 924 + }, + { + "epoch": 0.15684612123781264, + "grad_norm": 0.6482444069676975, + "learning_rate": 1.9167921953165827e-05, + "loss": 0.8249, + "step": 925 + }, + { + "epoch": 0.15701568461212378, + "grad_norm": 1.1033478237135148, + "learning_rate": 1.9165727142465266e-05, + "loss": 1.0043, + "step": 926 + }, + { + "epoch": 0.15718524798643493, + "grad_norm": 1.1299739623715552, + "learning_rate": 1.916352956689544e-05, + "loss": 1.027, + "step": 927 + }, + { + "epoch": 0.15735481136074608, + "grad_norm": 1.0584926185750587, + "learning_rate": 1.916132922711925e-05, + "loss": 1.022, + "step": 928 + }, + { + "epoch": 0.15752437473505723, + "grad_norm": 1.056992187046396, + "learning_rate": 1.9159126123800437e-05, + "loss": 1.0276, + "step": 929 + }, + { + "epoch": 0.15769393810936838, + "grad_norm": 1.0411380281761882, + "learning_rate": 1.915692025760357e-05, + "loss": 1.0546, + "step": 930 + }, + { + "epoch": 0.15786350148367953, + "grad_norm": 1.0682731745814429, + "learning_rate": 1.9154711629194062e-05, + "loss": 1.054, + "step": 931 + }, + { + "epoch": 0.15803306485799068, + "grad_norm": 1.1561727435104403, + "learning_rate": 1.9152500239238144e-05, + "loss": 1.0455, + "step": 932 + }, + { + "epoch": 0.15820262823230183, + "grad_norm": 1.1312827819686595, + "learning_rate": 1.9150286088402898e-05, + "loss": 1.104, + "step": 933 + }, + { + "epoch": 0.15837219160661298, + "grad_norm": 1.0695744152908222, + "learning_rate": 1.9148069177356223e-05, + "loss": 1.0678, + "step": 934 + }, + { + "epoch": 0.15854175498092413, + "grad_norm": 1.0471665509140067, + "learning_rate": 1.9145849506766856e-05, + "loss": 1.018, + "step": 935 + }, + { + "epoch": 0.15871131835523528, + "grad_norm": 1.09813822242061, + "learning_rate": 1.914362707730437e-05, + "loss": 1.0761, + "step": 936 + }, + { + "epoch": 0.15888088172954642, + "grad_norm": 1.0772744799025893, + "learning_rate": 1.9141401889639167e-05, + "loss": 1.0782, + "step": 937 + }, + { + "epoch": 0.15905044510385757, + "grad_norm": 1.0742866315924127, + "learning_rate": 1.9139173944442482e-05, + "loss": 1.0427, + "step": 938 + }, + { + "epoch": 0.15922000847816872, + "grad_norm": 1.1626263938467596, + "learning_rate": 1.913694324238638e-05, + "loss": 1.0464, + "step": 939 + }, + { + "epoch": 0.15938957185247987, + "grad_norm": 0.6635534612233993, + "learning_rate": 1.9134709784143763e-05, + "loss": 0.8521, + "step": 940 + }, + { + "epoch": 0.15955913522679102, + "grad_norm": 1.0890073638226203, + "learning_rate": 1.9132473570388354e-05, + "loss": 1.0568, + "step": 941 + }, + { + "epoch": 0.15972869860110217, + "grad_norm": 1.0754807621640843, + "learning_rate": 1.913023460179472e-05, + "loss": 1.041, + "step": 942 + }, + { + "epoch": 0.15989826197541332, + "grad_norm": 1.1413046230610473, + "learning_rate": 1.9127992879038245e-05, + "loss": 1.0915, + "step": 943 + }, + { + "epoch": 0.16006782534972447, + "grad_norm": 1.0200204711647267, + "learning_rate": 1.912574840279516e-05, + "loss": 1.0221, + "step": 944 + }, + { + "epoch": 0.16023738872403562, + "grad_norm": 1.0199121463513054, + "learning_rate": 1.9123501173742514e-05, + "loss": 1.0164, + "step": 945 + }, + { + "epoch": 0.16040695209834677, + "grad_norm": 1.0552427764536851, + "learning_rate": 1.912125119255819e-05, + "loss": 1.0639, + "step": 946 + }, + { + "epoch": 0.16057651547265792, + "grad_norm": 1.0924354596458121, + "learning_rate": 1.91189984599209e-05, + "loss": 1.0964, + "step": 947 + }, + { + "epoch": 0.16074607884696906, + "grad_norm": 1.1082698011187466, + "learning_rate": 1.9116742976510195e-05, + "loss": 1.061, + "step": 948 + }, + { + "epoch": 0.16091564222128021, + "grad_norm": 1.087075860841995, + "learning_rate": 1.911448474300644e-05, + "loss": 1.0875, + "step": 949 + }, + { + "epoch": 0.16108520559559136, + "grad_norm": 1.1090689673687966, + "learning_rate": 1.911222376009084e-05, + "loss": 1.0446, + "step": 950 + }, + { + "epoch": 0.1612547689699025, + "grad_norm": 1.116789238696624, + "learning_rate": 1.910996002844543e-05, + "loss": 1.0076, + "step": 951 + }, + { + "epoch": 0.16142433234421366, + "grad_norm": 1.0527060430720205, + "learning_rate": 1.910769354875307e-05, + "loss": 1.0154, + "step": 952 + }, + { + "epoch": 0.1615938957185248, + "grad_norm": 1.0856744510099625, + "learning_rate": 1.910542432169745e-05, + "loss": 1.0301, + "step": 953 + }, + { + "epoch": 0.16176345909283596, + "grad_norm": 1.0418508889820652, + "learning_rate": 1.910315234796309e-05, + "loss": 1.0508, + "step": 954 + }, + { + "epoch": 0.1619330224671471, + "grad_norm": 1.2147413236375053, + "learning_rate": 1.9100877628235337e-05, + "loss": 1.0464, + "step": 955 + }, + { + "epoch": 0.16210258584145826, + "grad_norm": 1.033459857420962, + "learning_rate": 1.9098600163200366e-05, + "loss": 1.0407, + "step": 956 + }, + { + "epoch": 0.1622721492157694, + "grad_norm": 1.1262256829727249, + "learning_rate": 1.9096319953545186e-05, + "loss": 1.0507, + "step": 957 + }, + { + "epoch": 0.16244171259008056, + "grad_norm": 1.0898486768419438, + "learning_rate": 1.9094036999957623e-05, + "loss": 1.0211, + "step": 958 + }, + { + "epoch": 0.1626112759643917, + "grad_norm": 1.1217666553293602, + "learning_rate": 1.909175130312634e-05, + "loss": 1.0563, + "step": 959 + }, + { + "epoch": 0.16278083933870285, + "grad_norm": 1.021613343893737, + "learning_rate": 1.9089462863740825e-05, + "loss": 1.0407, + "step": 960 + }, + { + "epoch": 0.16295040271301398, + "grad_norm": 1.0492735332740109, + "learning_rate": 1.908717168249139e-05, + "loss": 1.0045, + "step": 961 + }, + { + "epoch": 0.16311996608732512, + "grad_norm": 1.0919994505596375, + "learning_rate": 1.908487776006918e-05, + "loss": 1.0238, + "step": 962 + }, + { + "epoch": 0.16328952946163627, + "grad_norm": 1.0604412964637524, + "learning_rate": 1.908258109716616e-05, + "loss": 1.069, + "step": 963 + }, + { + "epoch": 0.16345909283594742, + "grad_norm": 0.7881503841452441, + "learning_rate": 1.908028169447513e-05, + "loss": 0.8649, + "step": 964 + }, + { + "epoch": 0.16362865621025857, + "grad_norm": 1.0749029440794484, + "learning_rate": 1.9077979552689708e-05, + "loss": 1.069, + "step": 965 + }, + { + "epoch": 0.16379821958456972, + "grad_norm": 1.1538941115995935, + "learning_rate": 1.907567467250434e-05, + "loss": 1.0753, + "step": 966 + }, + { + "epoch": 0.16396778295888087, + "grad_norm": 1.1153404163764684, + "learning_rate": 1.907336705461431e-05, + "loss": 1.0278, + "step": 967 + }, + { + "epoch": 0.16413734633319202, + "grad_norm": 1.0097503219960056, + "learning_rate": 1.9071056699715704e-05, + "loss": 1.0315, + "step": 968 + }, + { + "epoch": 0.16430690970750317, + "grad_norm": 1.0534508324509975, + "learning_rate": 1.9068743608505454e-05, + "loss": 1.0693, + "step": 969 + }, + { + "epoch": 0.16447647308181432, + "grad_norm": 1.061456522681583, + "learning_rate": 1.9066427781681314e-05, + "loss": 0.9942, + "step": 970 + }, + { + "epoch": 0.16464603645612547, + "grad_norm": 1.0270933639792335, + "learning_rate": 1.9064109219941863e-05, + "loss": 1.0459, + "step": 971 + }, + { + "epoch": 0.16481559983043662, + "grad_norm": 1.0180798327303335, + "learning_rate": 1.906178792398649e-05, + "loss": 0.9937, + "step": 972 + }, + { + "epoch": 0.16498516320474776, + "grad_norm": 1.0375579307790832, + "learning_rate": 1.9059463894515427e-05, + "loss": 1.0506, + "step": 973 + }, + { + "epoch": 0.1651547265790589, + "grad_norm": 1.0909691437736226, + "learning_rate": 1.905713713222973e-05, + "loss": 1.0945, + "step": 974 + }, + { + "epoch": 0.16532428995337006, + "grad_norm": 1.0991256665699363, + "learning_rate": 1.9054807637831268e-05, + "loss": 1.035, + "step": 975 + }, + { + "epoch": 0.1654938533276812, + "grad_norm": 1.0670928903741004, + "learning_rate": 1.905247541202274e-05, + "loss": 1.0237, + "step": 976 + }, + { + "epoch": 0.16566341670199236, + "grad_norm": 1.112108157502983, + "learning_rate": 1.905014045550767e-05, + "loss": 1.0695, + "step": 977 + }, + { + "epoch": 0.1658329800763035, + "grad_norm": 1.0901247921547923, + "learning_rate": 1.9047802768990404e-05, + "loss": 1.0802, + "step": 978 + }, + { + "epoch": 0.16600254345061466, + "grad_norm": 1.0314235423645806, + "learning_rate": 1.9045462353176115e-05, + "loss": 1.0258, + "step": 979 + }, + { + "epoch": 0.1661721068249258, + "grad_norm": 0.9948835587073573, + "learning_rate": 1.904311920877079e-05, + "loss": 1.0121, + "step": 980 + }, + { + "epoch": 0.16634167019923696, + "grad_norm": 1.080837986453593, + "learning_rate": 1.904077333648126e-05, + "loss": 1.0016, + "step": 981 + }, + { + "epoch": 0.1665112335735481, + "grad_norm": 1.005237192138, + "learning_rate": 1.9038424737015144e-05, + "loss": 0.9961, + "step": 982 + }, + { + "epoch": 0.16668079694785926, + "grad_norm": 1.0453587105029483, + "learning_rate": 1.9036073411080917e-05, + "loss": 1.0385, + "step": 983 + }, + { + "epoch": 0.1668503603221704, + "grad_norm": 1.018615226677672, + "learning_rate": 1.9033719359387866e-05, + "loss": 1.0045, + "step": 984 + }, + { + "epoch": 0.16701992369648155, + "grad_norm": 1.0897893243816406, + "learning_rate": 1.903136258264609e-05, + "loss": 1.1031, + "step": 985 + }, + { + "epoch": 0.1671894870707927, + "grad_norm": 1.0516457017084275, + "learning_rate": 1.9029003081566517e-05, + "loss": 1.0653, + "step": 986 + }, + { + "epoch": 0.16735905044510385, + "grad_norm": 1.0443845006885772, + "learning_rate": 1.9026640856860906e-05, + "loss": 1.0211, + "step": 987 + }, + { + "epoch": 0.167528613819415, + "grad_norm": 1.023114656589325, + "learning_rate": 1.9024275909241824e-05, + "loss": 1.0318, + "step": 988 + }, + { + "epoch": 0.16769817719372615, + "grad_norm": 1.069577021077146, + "learning_rate": 1.9021908239422665e-05, + "loss": 1.0172, + "step": 989 + }, + { + "epoch": 0.1678677405680373, + "grad_norm": 1.0441193429697493, + "learning_rate": 1.9019537848117645e-05, + "loss": 1.0408, + "step": 990 + }, + { + "epoch": 0.16803730394234845, + "grad_norm": 1.0704201916746707, + "learning_rate": 1.9017164736041795e-05, + "loss": 1.0382, + "step": 991 + }, + { + "epoch": 0.1682068673166596, + "grad_norm": 1.0859432836965746, + "learning_rate": 1.901478890391098e-05, + "loss": 1.0616, + "step": 992 + }, + { + "epoch": 0.16837643069097075, + "grad_norm": 1.0865061444583897, + "learning_rate": 1.9012410352441866e-05, + "loss": 1.056, + "step": 993 + }, + { + "epoch": 0.1685459940652819, + "grad_norm": 1.0777844748349725, + "learning_rate": 1.901002908235196e-05, + "loss": 1.0626, + "step": 994 + }, + { + "epoch": 0.16871555743959304, + "grad_norm": 1.0316132529230884, + "learning_rate": 1.9007645094359576e-05, + "loss": 1.0703, + "step": 995 + }, + { + "epoch": 0.1688851208139042, + "grad_norm": 0.976761393545048, + "learning_rate": 1.900525838918385e-05, + "loss": 0.987, + "step": 996 + }, + { + "epoch": 0.16905468418821534, + "grad_norm": 1.1449160277983745, + "learning_rate": 1.9002868967544743e-05, + "loss": 1.0615, + "step": 997 + }, + { + "epoch": 0.1692242475625265, + "grad_norm": 1.0210079276096267, + "learning_rate": 1.9000476830163022e-05, + "loss": 1.0399, + "step": 998 + }, + { + "epoch": 0.16939381093683764, + "grad_norm": 1.0355190009838098, + "learning_rate": 1.899808197776029e-05, + "loss": 1.0554, + "step": 999 + }, + { + "epoch": 0.1695633743111488, + "grad_norm": 1.0298578209665756, + "learning_rate": 1.8995684411058965e-05, + "loss": 1.0219, + "step": 1000 + }, + { + "epoch": 0.16973293768545994, + "grad_norm": 1.0822818753800265, + "learning_rate": 1.899328413078227e-05, + "loss": 1.0505, + "step": 1001 + }, + { + "epoch": 0.1699025010597711, + "grad_norm": 1.0399272477537211, + "learning_rate": 1.899088113765426e-05, + "loss": 1.0762, + "step": 1002 + }, + { + "epoch": 0.17007206443408224, + "grad_norm": 1.054593804719732, + "learning_rate": 1.898847543239981e-05, + "loss": 1.0463, + "step": 1003 + }, + { + "epoch": 0.1702416278083934, + "grad_norm": 1.0591632040708472, + "learning_rate": 1.8986067015744605e-05, + "loss": 1.0752, + "step": 1004 + }, + { + "epoch": 0.17041119118270454, + "grad_norm": 1.1428832463362848, + "learning_rate": 1.898365588841515e-05, + "loss": 1.0756, + "step": 1005 + }, + { + "epoch": 0.17058075455701568, + "grad_norm": 0.9936584279273152, + "learning_rate": 1.8981242051138773e-05, + "loss": 0.9864, + "step": 1006 + }, + { + "epoch": 0.17075031793132683, + "grad_norm": 0.9941664303322485, + "learning_rate": 1.897882550464361e-05, + "loss": 0.9816, + "step": 1007 + }, + { + "epoch": 0.17091988130563798, + "grad_norm": 1.0566276172028572, + "learning_rate": 1.8976406249658624e-05, + "loss": 1.0585, + "step": 1008 + }, + { + "epoch": 0.17108944467994913, + "grad_norm": 1.0305138529894537, + "learning_rate": 1.8973984286913584e-05, + "loss": 0.9912, + "step": 1009 + }, + { + "epoch": 0.17125900805426028, + "grad_norm": 1.0661358721764511, + "learning_rate": 1.8971559617139092e-05, + "loss": 1.0728, + "step": 1010 + }, + { + "epoch": 0.17142857142857143, + "grad_norm": 1.0713830701065703, + "learning_rate": 1.896913224106655e-05, + "loss": 1.0394, + "step": 1011 + }, + { + "epoch": 0.17159813480288258, + "grad_norm": 1.0417729740812105, + "learning_rate": 1.8966702159428187e-05, + "loss": 1.0433, + "step": 1012 + }, + { + "epoch": 0.17176769817719373, + "grad_norm": 1.0135984016810344, + "learning_rate": 1.896426937295704e-05, + "loss": 1.0389, + "step": 1013 + }, + { + "epoch": 0.17193726155150488, + "grad_norm": 1.0386784383548087, + "learning_rate": 1.896183388238697e-05, + "loss": 1.0536, + "step": 1014 + }, + { + "epoch": 0.17210682492581603, + "grad_norm": 1.0308367201193958, + "learning_rate": 1.8959395688452648e-05, + "loss": 1.0798, + "step": 1015 + }, + { + "epoch": 0.17227638830012718, + "grad_norm": 1.093693969708606, + "learning_rate": 1.8956954791889567e-05, + "loss": 1.0702, + "step": 1016 + }, + { + "epoch": 0.17244595167443832, + "grad_norm": 1.206461847523564, + "learning_rate": 1.8954511193434024e-05, + "loss": 1.0782, + "step": 1017 + }, + { + "epoch": 0.17261551504874947, + "grad_norm": 1.0313391042663693, + "learning_rate": 1.8952064893823145e-05, + "loss": 1.0279, + "step": 1018 + }, + { + "epoch": 0.17278507842306062, + "grad_norm": 1.0372817079439989, + "learning_rate": 1.8949615893794858e-05, + "loss": 1.0128, + "step": 1019 + }, + { + "epoch": 0.17295464179737177, + "grad_norm": 1.0665050651669081, + "learning_rate": 1.8947164194087912e-05, + "loss": 1.0347, + "step": 1020 + }, + { + "epoch": 0.17312420517168292, + "grad_norm": 1.0907174010029659, + "learning_rate": 1.8944709795441874e-05, + "loss": 1.0463, + "step": 1021 + }, + { + "epoch": 0.17329376854599407, + "grad_norm": 1.0184693548855424, + "learning_rate": 1.8942252698597113e-05, + "loss": 0.9921, + "step": 1022 + }, + { + "epoch": 0.17346333192030522, + "grad_norm": 1.062863685731636, + "learning_rate": 1.893979290429483e-05, + "loss": 1.0486, + "step": 1023 + }, + { + "epoch": 0.17363289529461637, + "grad_norm": 1.074012583014513, + "learning_rate": 1.893733041327702e-05, + "loss": 1.0892, + "step": 1024 + }, + { + "epoch": 0.17380245866892752, + "grad_norm": 1.0518174563438558, + "learning_rate": 1.8934865226286507e-05, + "loss": 1.0453, + "step": 1025 + }, + { + "epoch": 0.17397202204323867, + "grad_norm": 0.9881949718168048, + "learning_rate": 1.8932397344066918e-05, + "loss": 1.0071, + "step": 1026 + }, + { + "epoch": 0.17414158541754982, + "grad_norm": 1.0736471442587825, + "learning_rate": 1.89299267673627e-05, + "loss": 1.011, + "step": 1027 + }, + { + "epoch": 0.17431114879186096, + "grad_norm": 1.1076791922893015, + "learning_rate": 1.8927453496919108e-05, + "loss": 1.0443, + "step": 1028 + }, + { + "epoch": 0.1744807121661721, + "grad_norm": 1.1113268860131447, + "learning_rate": 1.892497753348221e-05, + "loss": 1.0891, + "step": 1029 + }, + { + "epoch": 0.17465027554048326, + "grad_norm": 0.9725135451900143, + "learning_rate": 1.8922498877798893e-05, + "loss": 1.0452, + "step": 1030 + }, + { + "epoch": 0.1748198389147944, + "grad_norm": 1.0903289333819945, + "learning_rate": 1.892001753061685e-05, + "loss": 1.0636, + "step": 1031 + }, + { + "epoch": 0.17498940228910556, + "grad_norm": 1.0477583452992338, + "learning_rate": 1.8917533492684584e-05, + "loss": 1.029, + "step": 1032 + }, + { + "epoch": 0.1751589656634167, + "grad_norm": 1.0542077779157495, + "learning_rate": 1.891504676475141e-05, + "loss": 1.0339, + "step": 1033 + }, + { + "epoch": 0.17532852903772786, + "grad_norm": 0.975375541174198, + "learning_rate": 1.8912557347567462e-05, + "loss": 0.9654, + "step": 1034 + }, + { + "epoch": 0.175498092412039, + "grad_norm": 1.0729001926880484, + "learning_rate": 1.891006524188368e-05, + "loss": 1.0467, + "step": 1035 + }, + { + "epoch": 0.17566765578635016, + "grad_norm": 0.7245216690535998, + "learning_rate": 1.8907570448451812e-05, + "loss": 0.833, + "step": 1036 + }, + { + "epoch": 0.1758372191606613, + "grad_norm": 1.046610098949525, + "learning_rate": 1.8905072968024424e-05, + "loss": 1.0397, + "step": 1037 + }, + { + "epoch": 0.17600678253497246, + "grad_norm": 1.0013872705892883, + "learning_rate": 1.8902572801354887e-05, + "loss": 1.0511, + "step": 1038 + }, + { + "epoch": 0.1761763459092836, + "grad_norm": 1.1029554467376799, + "learning_rate": 1.890006994919738e-05, + "loss": 1.0522, + "step": 1039 + }, + { + "epoch": 0.17634590928359475, + "grad_norm": 1.0508430758660994, + "learning_rate": 1.8897564412306902e-05, + "loss": 1.066, + "step": 1040 + }, + { + "epoch": 0.1765154726579059, + "grad_norm": 1.0647486984009291, + "learning_rate": 1.8895056191439252e-05, + "loss": 1.0652, + "step": 1041 + }, + { + "epoch": 0.17668503603221705, + "grad_norm": 0.9954204664408755, + "learning_rate": 1.8892545287351044e-05, + "loss": 1.0116, + "step": 1042 + }, + { + "epoch": 0.1768545994065282, + "grad_norm": 1.09366622888092, + "learning_rate": 1.8890031700799697e-05, + "loss": 1.042, + "step": 1043 + }, + { + "epoch": 0.17702416278083935, + "grad_norm": 1.1551720096157085, + "learning_rate": 1.8887515432543445e-05, + "loss": 1.0525, + "step": 1044 + }, + { + "epoch": 0.1771937261551505, + "grad_norm": 1.016310177082837, + "learning_rate": 1.888499648334133e-05, + "loss": 0.9738, + "step": 1045 + }, + { + "epoch": 0.17736328952946165, + "grad_norm": 1.0174982197476534, + "learning_rate": 1.8882474853953193e-05, + "loss": 1.0278, + "step": 1046 + }, + { + "epoch": 0.1775328529037728, + "grad_norm": 1.1073615617411334, + "learning_rate": 1.8879950545139697e-05, + "loss": 1.0492, + "step": 1047 + }, + { + "epoch": 0.17770241627808395, + "grad_norm": 1.0461526038661344, + "learning_rate": 1.8877423557662307e-05, + "loss": 1.0333, + "step": 1048 + }, + { + "epoch": 0.1778719796523951, + "grad_norm": 1.068946921015294, + "learning_rate": 1.8874893892283296e-05, + "loss": 1.0265, + "step": 1049 + }, + { + "epoch": 0.17804154302670624, + "grad_norm": 1.0792162961638039, + "learning_rate": 1.887236154976574e-05, + "loss": 1.0697, + "step": 1050 + }, + { + "epoch": 0.1782111064010174, + "grad_norm": 1.0507105850359646, + "learning_rate": 1.8869826530873537e-05, + "loss": 1.0469, + "step": 1051 + }, + { + "epoch": 0.17838066977532852, + "grad_norm": 0.9680010430639013, + "learning_rate": 1.886728883637138e-05, + "loss": 1.007, + "step": 1052 + }, + { + "epoch": 0.17855023314963966, + "grad_norm": 1.0725567627421173, + "learning_rate": 1.886474846702477e-05, + "loss": 1.07, + "step": 1053 + }, + { + "epoch": 0.1787197965239508, + "grad_norm": 1.0459122476351028, + "learning_rate": 1.8862205423600016e-05, + "loss": 1.0941, + "step": 1054 + }, + { + "epoch": 0.17888935989826196, + "grad_norm": 1.0835932332012264, + "learning_rate": 1.8859659706864234e-05, + "loss": 1.0786, + "step": 1055 + }, + { + "epoch": 0.1790589232725731, + "grad_norm": 0.7963938772057977, + "learning_rate": 1.8857111317585354e-05, + "loss": 0.8576, + "step": 1056 + }, + { + "epoch": 0.17922848664688426, + "grad_norm": 1.099042760662731, + "learning_rate": 1.8854560256532098e-05, + "loss": 1.0444, + "step": 1057 + }, + { + "epoch": 0.1793980500211954, + "grad_norm": 1.088615438395254, + "learning_rate": 1.885200652447401e-05, + "loss": 1.0957, + "step": 1058 + }, + { + "epoch": 0.17956761339550656, + "grad_norm": 1.0640060539192748, + "learning_rate": 1.8849450122181422e-05, + "loss": 1.0287, + "step": 1059 + }, + { + "epoch": 0.1797371767698177, + "grad_norm": 1.0842958887706413, + "learning_rate": 1.8846891050425484e-05, + "loss": 1.0449, + "step": 1060 + }, + { + "epoch": 0.17990674014412886, + "grad_norm": 1.102621760318561, + "learning_rate": 1.8844329309978146e-05, + "loss": 1.056, + "step": 1061 + }, + { + "epoch": 0.18007630351844, + "grad_norm": 0.6793547404122261, + "learning_rate": 1.8841764901612167e-05, + "loss": 0.875, + "step": 1062 + }, + { + "epoch": 0.18024586689275116, + "grad_norm": 1.0777371505263507, + "learning_rate": 1.883919782610111e-05, + "loss": 1.0174, + "step": 1063 + }, + { + "epoch": 0.1804154302670623, + "grad_norm": 1.0628359911400063, + "learning_rate": 1.8836628084219332e-05, + "loss": 0.9978, + "step": 1064 + }, + { + "epoch": 0.18058499364137345, + "grad_norm": 1.069653533052602, + "learning_rate": 1.8834055676742018e-05, + "loss": 1.0749, + "step": 1065 + }, + { + "epoch": 0.1807545570156846, + "grad_norm": 1.1587147736029462, + "learning_rate": 1.883148060444513e-05, + "loss": 1.0402, + "step": 1066 + }, + { + "epoch": 0.18092412038999575, + "grad_norm": 1.025117978747166, + "learning_rate": 1.8828902868105454e-05, + "loss": 1.0425, + "step": 1067 + }, + { + "epoch": 0.1810936837643069, + "grad_norm": 1.091496977715169, + "learning_rate": 1.8826322468500567e-05, + "loss": 1.0577, + "step": 1068 + }, + { + "epoch": 0.18126324713861805, + "grad_norm": 0.674673181083146, + "learning_rate": 1.8823739406408855e-05, + "loss": 0.8771, + "step": 1069 + }, + { + "epoch": 0.1814328105129292, + "grad_norm": 1.0763887923881363, + "learning_rate": 1.8821153682609514e-05, + "loss": 1.0412, + "step": 1070 + }, + { + "epoch": 0.18160237388724035, + "grad_norm": 1.0254809783165464, + "learning_rate": 1.8818565297882525e-05, + "loss": 1.0369, + "step": 1071 + }, + { + "epoch": 0.1817719372615515, + "grad_norm": 1.0590738106819182, + "learning_rate": 1.8815974253008687e-05, + "loss": 1.0188, + "step": 1072 + }, + { + "epoch": 0.18194150063586265, + "grad_norm": 0.6373899011466477, + "learning_rate": 1.8813380548769594e-05, + "loss": 0.8633, + "step": 1073 + }, + { + "epoch": 0.1821110640101738, + "grad_norm": 1.0621148660447535, + "learning_rate": 1.8810784185947648e-05, + "loss": 1.0301, + "step": 1074 + }, + { + "epoch": 0.18228062738448494, + "grad_norm": 0.9560958091072056, + "learning_rate": 1.880818516532605e-05, + "loss": 1.0172, + "step": 1075 + }, + { + "epoch": 0.1824501907587961, + "grad_norm": 1.2316105785351477, + "learning_rate": 1.8805583487688796e-05, + "loss": 0.9909, + "step": 1076 + }, + { + "epoch": 0.18261975413310724, + "grad_norm": 1.00080139391916, + "learning_rate": 1.88029791538207e-05, + "loss": 1.0079, + "step": 1077 + }, + { + "epoch": 0.1827893175074184, + "grad_norm": 1.0148533263873454, + "learning_rate": 1.8800372164507358e-05, + "loss": 1.0385, + "step": 1078 + }, + { + "epoch": 0.18295888088172954, + "grad_norm": 1.0659653437505787, + "learning_rate": 1.8797762520535178e-05, + "loss": 1.0733, + "step": 1079 + }, + { + "epoch": 0.1831284442560407, + "grad_norm": 1.0006065098846189, + "learning_rate": 1.8795150222691375e-05, + "loss": 1.0305, + "step": 1080 + }, + { + "epoch": 0.18329800763035184, + "grad_norm": 1.1032054735983126, + "learning_rate": 1.8792535271763944e-05, + "loss": 1.0438, + "step": 1081 + }, + { + "epoch": 0.183467571004663, + "grad_norm": 1.0595412598719354, + "learning_rate": 1.8789917668541707e-05, + "loss": 1.0337, + "step": 1082 + }, + { + "epoch": 0.18363713437897414, + "grad_norm": 1.023972415180868, + "learning_rate": 1.8787297413814257e-05, + "loss": 1.0161, + "step": 1083 + }, + { + "epoch": 0.1838066977532853, + "grad_norm": 1.0596608377381984, + "learning_rate": 1.8784674508372013e-05, + "loss": 1.077, + "step": 1084 + }, + { + "epoch": 0.18397626112759644, + "grad_norm": 1.0901509199604031, + "learning_rate": 1.8782048953006176e-05, + "loss": 1.0071, + "step": 1085 + }, + { + "epoch": 0.18414582450190758, + "grad_norm": 1.1185269003403535, + "learning_rate": 1.8779420748508758e-05, + "loss": 1.0717, + "step": 1086 + }, + { + "epoch": 0.18431538787621873, + "grad_norm": 0.9773051664619716, + "learning_rate": 1.8776789895672557e-05, + "loss": 1.0229, + "step": 1087 + }, + { + "epoch": 0.18448495125052988, + "grad_norm": 1.0180955003574748, + "learning_rate": 1.8774156395291188e-05, + "loss": 1.0412, + "step": 1088 + }, + { + "epoch": 0.18465451462484103, + "grad_norm": 1.0316399436215293, + "learning_rate": 1.8771520248159044e-05, + "loss": 1.0466, + "step": 1089 + }, + { + "epoch": 0.18482407799915218, + "grad_norm": 1.0467293399796018, + "learning_rate": 1.876888145507133e-05, + "loss": 1.0314, + "step": 1090 + }, + { + "epoch": 0.18499364137346333, + "grad_norm": 1.0652202251188496, + "learning_rate": 1.8766240016824056e-05, + "loss": 1.0354, + "step": 1091 + }, + { + "epoch": 0.18516320474777448, + "grad_norm": 1.03473017798002, + "learning_rate": 1.8763595934214004e-05, + "loss": 0.981, + "step": 1092 + }, + { + "epoch": 0.18533276812208563, + "grad_norm": 1.0548904395935859, + "learning_rate": 1.8760949208038782e-05, + "loss": 1.009, + "step": 1093 + }, + { + "epoch": 0.18550233149639678, + "grad_norm": 0.9952437756882081, + "learning_rate": 1.8758299839096774e-05, + "loss": 1.0013, + "step": 1094 + }, + { + "epoch": 0.18567189487070793, + "grad_norm": 1.0098652834135369, + "learning_rate": 1.8755647828187175e-05, + "loss": 0.9756, + "step": 1095 + }, + { + "epoch": 0.18584145824501908, + "grad_norm": 1.1036055794465418, + "learning_rate": 1.875299317610997e-05, + "loss": 1.0424, + "step": 1096 + }, + { + "epoch": 0.18601102161933022, + "grad_norm": 1.032174354868484, + "learning_rate": 1.8750335883665948e-05, + "loss": 1.019, + "step": 1097 + }, + { + "epoch": 0.18618058499364137, + "grad_norm": 1.025468560937419, + "learning_rate": 1.874767595165668e-05, + "loss": 1.0441, + "step": 1098 + }, + { + "epoch": 0.18635014836795252, + "grad_norm": 0.654211416884946, + "learning_rate": 1.874501338088455e-05, + "loss": 0.8489, + "step": 1099 + }, + { + "epoch": 0.18651971174226367, + "grad_norm": 1.079235417889397, + "learning_rate": 1.8742348172152728e-05, + "loss": 1.0267, + "step": 1100 + }, + { + "epoch": 0.18668927511657482, + "grad_norm": 1.107932391473777, + "learning_rate": 1.873968032626518e-05, + "loss": 1.0253, + "step": 1101 + }, + { + "epoch": 0.18685883849088597, + "grad_norm": 1.1030316316228774, + "learning_rate": 1.8737009844026673e-05, + "loss": 1.05, + "step": 1102 + }, + { + "epoch": 0.18702840186519712, + "grad_norm": 1.0119577413469758, + "learning_rate": 1.873433672624277e-05, + "loss": 1.0125, + "step": 1103 + }, + { + "epoch": 0.18719796523950827, + "grad_norm": 1.0783814751395921, + "learning_rate": 1.8731660973719816e-05, + "loss": 1.0085, + "step": 1104 + }, + { + "epoch": 0.18736752861381942, + "grad_norm": 1.1321130047944756, + "learning_rate": 1.8728982587264965e-05, + "loss": 1.0279, + "step": 1105 + }, + { + "epoch": 0.18753709198813057, + "grad_norm": 1.1810796160515018, + "learning_rate": 1.872630156768616e-05, + "loss": 1.0778, + "step": 1106 + }, + { + "epoch": 0.18770665536244172, + "grad_norm": 1.0441024098014227, + "learning_rate": 1.8723617915792136e-05, + "loss": 1.0331, + "step": 1107 + }, + { + "epoch": 0.18787621873675286, + "grad_norm": 1.0817205595077535, + "learning_rate": 1.8720931632392427e-05, + "loss": 1.0468, + "step": 1108 + }, + { + "epoch": 0.188045782111064, + "grad_norm": 1.060619446098569, + "learning_rate": 1.8718242718297358e-05, + "loss": 1.04, + "step": 1109 + }, + { + "epoch": 0.18821534548537516, + "grad_norm": 1.073263648619468, + "learning_rate": 1.8715551174318053e-05, + "loss": 0.9915, + "step": 1110 + }, + { + "epoch": 0.1883849088596863, + "grad_norm": 1.1004794351266534, + "learning_rate": 1.8712857001266417e-05, + "loss": 1.0475, + "step": 1111 + }, + { + "epoch": 0.18855447223399746, + "grad_norm": 1.1189192261883878, + "learning_rate": 1.8710160199955158e-05, + "loss": 1.0336, + "step": 1112 + }, + { + "epoch": 0.1887240356083086, + "grad_norm": 1.018498383470065, + "learning_rate": 1.8707460771197773e-05, + "loss": 0.9956, + "step": 1113 + }, + { + "epoch": 0.18889359898261976, + "grad_norm": 1.0432703900417342, + "learning_rate": 1.8704758715808556e-05, + "loss": 1.0097, + "step": 1114 + }, + { + "epoch": 0.1890631623569309, + "grad_norm": 1.1080203566710376, + "learning_rate": 1.870205403460259e-05, + "loss": 1.0365, + "step": 1115 + }, + { + "epoch": 0.18923272573124206, + "grad_norm": 1.019195808140228, + "learning_rate": 1.8699346728395746e-05, + "loss": 1.0455, + "step": 1116 + }, + { + "epoch": 0.1894022891055532, + "grad_norm": 1.021689392882448, + "learning_rate": 1.8696636798004693e-05, + "loss": 1.0131, + "step": 1117 + }, + { + "epoch": 0.18957185247986436, + "grad_norm": 1.0747533494398365, + "learning_rate": 1.869392424424689e-05, + "loss": 1.0618, + "step": 1118 + }, + { + "epoch": 0.1897414158541755, + "grad_norm": 1.0306890105088173, + "learning_rate": 1.869120906794059e-05, + "loss": 1.0052, + "step": 1119 + }, + { + "epoch": 0.18991097922848665, + "grad_norm": 1.060286016012655, + "learning_rate": 1.868849126990483e-05, + "loss": 1.0246, + "step": 1120 + }, + { + "epoch": 0.1900805426027978, + "grad_norm": 1.0652619450500056, + "learning_rate": 1.8685770850959444e-05, + "loss": 1.0336, + "step": 1121 + }, + { + "epoch": 0.19025010597710895, + "grad_norm": 1.0581830961944134, + "learning_rate": 1.8683047811925057e-05, + "loss": 1.0413, + "step": 1122 + }, + { + "epoch": 0.1904196693514201, + "grad_norm": 1.0537043039203717, + "learning_rate": 1.8680322153623077e-05, + "loss": 1.0176, + "step": 1123 + }, + { + "epoch": 0.19058923272573125, + "grad_norm": 1.0646885508930264, + "learning_rate": 1.8677593876875707e-05, + "loss": 1.0005, + "step": 1124 + }, + { + "epoch": 0.1907587961000424, + "grad_norm": 1.0701149877370713, + "learning_rate": 1.8674862982505946e-05, + "loss": 1.0479, + "step": 1125 + }, + { + "epoch": 0.19092835947435355, + "grad_norm": 1.075439626383217, + "learning_rate": 1.8672129471337568e-05, + "loss": 1.0603, + "step": 1126 + }, + { + "epoch": 0.1910979228486647, + "grad_norm": 1.0337487197458968, + "learning_rate": 1.8669393344195154e-05, + "loss": 1.068, + "step": 1127 + }, + { + "epoch": 0.19126748622297585, + "grad_norm": 1.024203162155067, + "learning_rate": 1.8666654601904058e-05, + "loss": 0.9965, + "step": 1128 + }, + { + "epoch": 0.191437049597287, + "grad_norm": 0.981286089341937, + "learning_rate": 1.8663913245290433e-05, + "loss": 1.0307, + "step": 1129 + }, + { + "epoch": 0.19160661297159814, + "grad_norm": 1.0577088808437018, + "learning_rate": 1.8661169275181217e-05, + "loss": 1.0364, + "step": 1130 + }, + { + "epoch": 0.1917761763459093, + "grad_norm": 1.0227399863124151, + "learning_rate": 1.8658422692404136e-05, + "loss": 1.0129, + "step": 1131 + }, + { + "epoch": 0.19194573972022044, + "grad_norm": 1.012120396741532, + "learning_rate": 1.8655673497787708e-05, + "loss": 1.0184, + "step": 1132 + }, + { + "epoch": 0.1921153030945316, + "grad_norm": 0.9947006175914488, + "learning_rate": 1.8652921692161233e-05, + "loss": 0.9979, + "step": 1133 + }, + { + "epoch": 0.19228486646884274, + "grad_norm": 1.070070395782131, + "learning_rate": 1.8650167276354802e-05, + "loss": 1.025, + "step": 1134 + }, + { + "epoch": 0.1924544298431539, + "grad_norm": 1.0541155125979846, + "learning_rate": 1.864741025119929e-05, + "loss": 1.0103, + "step": 1135 + }, + { + "epoch": 0.19262399321746504, + "grad_norm": 1.0532929465591445, + "learning_rate": 1.8644650617526366e-05, + "loss": 1.014, + "step": 1136 + }, + { + "epoch": 0.1927935565917762, + "grad_norm": 1.0071293447802863, + "learning_rate": 1.8641888376168483e-05, + "loss": 0.99, + "step": 1137 + }, + { + "epoch": 0.19296311996608734, + "grad_norm": 1.1374644443189281, + "learning_rate": 1.8639123527958877e-05, + "loss": 1.0784, + "step": 1138 + }, + { + "epoch": 0.1931326833403985, + "grad_norm": 1.0120430243294007, + "learning_rate": 1.863635607373157e-05, + "loss": 1.0229, + "step": 1139 + }, + { + "epoch": 0.19330224671470964, + "grad_norm": 1.007238215765678, + "learning_rate": 1.8633586014321378e-05, + "loss": 1.0051, + "step": 1140 + }, + { + "epoch": 0.19347181008902078, + "grad_norm": 1.0760687459740548, + "learning_rate": 1.8630813350563898e-05, + "loss": 1.0396, + "step": 1141 + }, + { + "epoch": 0.19364137346333193, + "grad_norm": 0.9648749524421074, + "learning_rate": 1.8628038083295508e-05, + "loss": 1.0246, + "step": 1142 + }, + { + "epoch": 0.19381093683764306, + "grad_norm": 1.0100154097262342, + "learning_rate": 1.862526021335338e-05, + "loss": 1.0294, + "step": 1143 + }, + { + "epoch": 0.1939805002119542, + "grad_norm": 1.0030581935913696, + "learning_rate": 1.862247974157546e-05, + "loss": 1.0629, + "step": 1144 + }, + { + "epoch": 0.19415006358626535, + "grad_norm": 0.6574111899337789, + "learning_rate": 1.8619696668800494e-05, + "loss": 0.8062, + "step": 1145 + }, + { + "epoch": 0.1943196269605765, + "grad_norm": 0.9868517241345818, + "learning_rate": 1.8616910995868e-05, + "loss": 1.0551, + "step": 1146 + }, + { + "epoch": 0.19448919033488765, + "grad_norm": 1.0024926362794524, + "learning_rate": 1.8614122723618284e-05, + "loss": 0.9869, + "step": 1147 + }, + { + "epoch": 0.1946587537091988, + "grad_norm": 0.9585164855704453, + "learning_rate": 1.861133185289244e-05, + "loss": 1.0078, + "step": 1148 + }, + { + "epoch": 0.19482831708350995, + "grad_norm": 0.6478518330323907, + "learning_rate": 1.860853838453234e-05, + "loss": 0.8569, + "step": 1149 + }, + { + "epoch": 0.1949978804578211, + "grad_norm": 1.0507163935200776, + "learning_rate": 1.8605742319380643e-05, + "loss": 1.0295, + "step": 1150 + }, + { + "epoch": 0.19516744383213225, + "grad_norm": 1.0342546738444225, + "learning_rate": 1.860294365828079e-05, + "loss": 1.0313, + "step": 1151 + }, + { + "epoch": 0.1953370072064434, + "grad_norm": 1.0106613728921512, + "learning_rate": 1.8600142402077006e-05, + "loss": 1.0463, + "step": 1152 + }, + { + "epoch": 0.19550657058075455, + "grad_norm": 1.0431592337964368, + "learning_rate": 1.85973385516143e-05, + "loss": 1.0307, + "step": 1153 + }, + { + "epoch": 0.1956761339550657, + "grad_norm": 0.9890348975975835, + "learning_rate": 1.8594532107738458e-05, + "loss": 1.0299, + "step": 1154 + }, + { + "epoch": 0.19584569732937684, + "grad_norm": 1.0526313650412848, + "learning_rate": 1.8591723071296054e-05, + "loss": 1.0456, + "step": 1155 + }, + { + "epoch": 0.196015260703688, + "grad_norm": 1.0224443765890652, + "learning_rate": 1.858891144313445e-05, + "loss": 1.0192, + "step": 1156 + }, + { + "epoch": 0.19618482407799914, + "grad_norm": 1.059582021809335, + "learning_rate": 1.8586097224101767e-05, + "loss": 1.0665, + "step": 1157 + }, + { + "epoch": 0.1963543874523103, + "grad_norm": 1.0487068051230162, + "learning_rate": 1.858328041504693e-05, + "loss": 0.9736, + "step": 1158 + }, + { + "epoch": 0.19652395082662144, + "grad_norm": 1.0800321343509853, + "learning_rate": 1.858046101681964e-05, + "loss": 1.0453, + "step": 1159 + }, + { + "epoch": 0.1966935142009326, + "grad_norm": 1.0425064305226417, + "learning_rate": 1.857763903027038e-05, + "loss": 1.0246, + "step": 1160 + }, + { + "epoch": 0.19686307757524374, + "grad_norm": 1.1265201763594452, + "learning_rate": 1.8574814456250406e-05, + "loss": 1.0463, + "step": 1161 + }, + { + "epoch": 0.1970326409495549, + "grad_norm": 1.034409127391492, + "learning_rate": 1.8571987295611756e-05, + "loss": 1.0062, + "step": 1162 + }, + { + "epoch": 0.19720220432386604, + "grad_norm": 1.0628138767441144, + "learning_rate": 1.8569157549207256e-05, + "loss": 1.0689, + "step": 1163 + }, + { + "epoch": 0.1973717676981772, + "grad_norm": 1.0982976324276519, + "learning_rate": 1.8566325217890505e-05, + "loss": 1.0464, + "step": 1164 + }, + { + "epoch": 0.19754133107248834, + "grad_norm": 1.07130372592087, + "learning_rate": 1.856349030251589e-05, + "loss": 1.0516, + "step": 1165 + }, + { + "epoch": 0.19771089444679948, + "grad_norm": 1.0642842358486948, + "learning_rate": 1.856065280393857e-05, + "loss": 1.0305, + "step": 1166 + }, + { + "epoch": 0.19788045782111063, + "grad_norm": 1.0634295951779795, + "learning_rate": 1.8557812723014476e-05, + "loss": 1.0268, + "step": 1167 + }, + { + "epoch": 0.19805002119542178, + "grad_norm": 1.0184802102487613, + "learning_rate": 1.8554970060600338e-05, + "loss": 1.0295, + "step": 1168 + }, + { + "epoch": 0.19821958456973293, + "grad_norm": 1.0338580087698006, + "learning_rate": 1.855212481755365e-05, + "loss": 1.0309, + "step": 1169 + }, + { + "epoch": 0.19838914794404408, + "grad_norm": 1.059172867948832, + "learning_rate": 1.8549276994732684e-05, + "loss": 1.0384, + "step": 1170 + }, + { + "epoch": 0.19855871131835523, + "grad_norm": 1.096866449967696, + "learning_rate": 1.85464265929965e-05, + "loss": 1.0312, + "step": 1171 + }, + { + "epoch": 0.19872827469266638, + "grad_norm": 1.0209748916916948, + "learning_rate": 1.854357361320493e-05, + "loss": 1.0169, + "step": 1172 + }, + { + "epoch": 0.19889783806697753, + "grad_norm": 1.0691953369004308, + "learning_rate": 1.854071805621858e-05, + "loss": 1.0577, + "step": 1173 + }, + { + "epoch": 0.19906740144128868, + "grad_norm": 1.0031412086315579, + "learning_rate": 1.853785992289884e-05, + "loss": 0.979, + "step": 1174 + }, + { + "epoch": 0.19923696481559983, + "grad_norm": 1.0179558637017458, + "learning_rate": 1.8534999214107878e-05, + "loss": 1.0762, + "step": 1175 + }, + { + "epoch": 0.19940652818991098, + "grad_norm": 1.0656437355325812, + "learning_rate": 1.8532135930708626e-05, + "loss": 1.053, + "step": 1176 + }, + { + "epoch": 0.19957609156422212, + "grad_norm": 1.0261713645735637, + "learning_rate": 1.852927007356481e-05, + "loss": 1.0473, + "step": 1177 + }, + { + "epoch": 0.19974565493853327, + "grad_norm": 1.0831025933827432, + "learning_rate": 1.8526401643540924e-05, + "loss": 1.0403, + "step": 1178 + }, + { + "epoch": 0.19991521831284442, + "grad_norm": 1.0519500648356486, + "learning_rate": 1.8523530641502234e-05, + "loss": 1.0368, + "step": 1179 + }, + { + "epoch": 0.20008478168715557, + "grad_norm": 1.0773293076388273, + "learning_rate": 1.8520657068314792e-05, + "loss": 1.0363, + "step": 1180 + }, + { + "epoch": 0.20025434506146672, + "grad_norm": 1.0021809780492006, + "learning_rate": 1.8517780924845415e-05, + "loss": 1.0338, + "step": 1181 + }, + { + "epoch": 0.20042390843577787, + "grad_norm": 1.0935346068038074, + "learning_rate": 1.8514902211961704e-05, + "loss": 1.0922, + "step": 1182 + }, + { + "epoch": 0.20059347181008902, + "grad_norm": 1.050889657682733, + "learning_rate": 1.851202093053203e-05, + "loss": 1.0257, + "step": 1183 + }, + { + "epoch": 0.20076303518440017, + "grad_norm": 1.0556391086169303, + "learning_rate": 1.8509137081425538e-05, + "loss": 1.0168, + "step": 1184 + }, + { + "epoch": 0.20093259855871132, + "grad_norm": 1.035397993089505, + "learning_rate": 1.8506250665512156e-05, + "loss": 1.0164, + "step": 1185 + }, + { + "epoch": 0.20110216193302247, + "grad_norm": 1.0290741634325278, + "learning_rate": 1.8503361683662575e-05, + "loss": 1.0237, + "step": 1186 + }, + { + "epoch": 0.20127172530733362, + "grad_norm": 1.0748063179541278, + "learning_rate": 1.8500470136748267e-05, + "loss": 1.0373, + "step": 1187 + }, + { + "epoch": 0.20144128868164476, + "grad_norm": 1.0662121318492281, + "learning_rate": 1.849757602564147e-05, + "loss": 1.0714, + "step": 1188 + }, + { + "epoch": 0.2016108520559559, + "grad_norm": 1.0292944512112419, + "learning_rate": 1.8494679351215212e-05, + "loss": 1.0551, + "step": 1189 + }, + { + "epoch": 0.20178041543026706, + "grad_norm": 0.9895423228539304, + "learning_rate": 1.8491780114343275e-05, + "loss": 0.9593, + "step": 1190 + }, + { + "epoch": 0.2019499788045782, + "grad_norm": 1.1010847125002454, + "learning_rate": 1.8488878315900228e-05, + "loss": 1.0348, + "step": 1191 + }, + { + "epoch": 0.20211954217888936, + "grad_norm": 1.0697923081803158, + "learning_rate": 1.8485973956761402e-05, + "loss": 1.0334, + "step": 1192 + }, + { + "epoch": 0.2022891055532005, + "grad_norm": 1.0641982135941415, + "learning_rate": 1.848306703780291e-05, + "loss": 1.037, + "step": 1193 + }, + { + "epoch": 0.20245866892751166, + "grad_norm": 1.02711613493122, + "learning_rate": 1.8480157559901635e-05, + "loss": 0.999, + "step": 1194 + }, + { + "epoch": 0.2026282323018228, + "grad_norm": 1.0890737466188205, + "learning_rate": 1.847724552393522e-05, + "loss": 1.0396, + "step": 1195 + }, + { + "epoch": 0.20279779567613396, + "grad_norm": 1.049366214873702, + "learning_rate": 1.8474330930782102e-05, + "loss": 1.0303, + "step": 1196 + }, + { + "epoch": 0.2029673590504451, + "grad_norm": 1.0088642954511193, + "learning_rate": 1.847141378132147e-05, + "loss": 1.0, + "step": 1197 + }, + { + "epoch": 0.20313692242475626, + "grad_norm": 1.0144124285540683, + "learning_rate": 1.8468494076433287e-05, + "loss": 1.0519, + "step": 1198 + }, + { + "epoch": 0.2033064857990674, + "grad_norm": 0.9873543663400495, + "learning_rate": 1.8465571816998296e-05, + "loss": 1.017, + "step": 1199 + }, + { + "epoch": 0.20347604917337855, + "grad_norm": 1.0409487868350844, + "learning_rate": 1.8462647003898005e-05, + "loss": 1.0184, + "step": 1200 + }, + { + "epoch": 0.2036456125476897, + "grad_norm": 1.042680287184133, + "learning_rate": 1.8459719638014693e-05, + "loss": 1.0722, + "step": 1201 + }, + { + "epoch": 0.20381517592200085, + "grad_norm": 1.0413534357481726, + "learning_rate": 1.845678972023141e-05, + "loss": 1.0861, + "step": 1202 + }, + { + "epoch": 0.203984739296312, + "grad_norm": 1.014182308410673, + "learning_rate": 1.845385725143197e-05, + "loss": 1.0182, + "step": 1203 + }, + { + "epoch": 0.20415430267062315, + "grad_norm": 1.0535576454817985, + "learning_rate": 1.8450922232500966e-05, + "loss": 1.0379, + "step": 1204 + }, + { + "epoch": 0.2043238660449343, + "grad_norm": 1.001785214578283, + "learning_rate": 1.844798466432375e-05, + "loss": 1.0408, + "step": 1205 + }, + { + "epoch": 0.20449342941924545, + "grad_norm": 1.018852245090246, + "learning_rate": 1.8445044547786453e-05, + "loss": 1.048, + "step": 1206 + }, + { + "epoch": 0.2046629927935566, + "grad_norm": 1.0586772410639393, + "learning_rate": 1.844210188377597e-05, + "loss": 1.0208, + "step": 1207 + }, + { + "epoch": 0.20483255616786775, + "grad_norm": 1.017886865946593, + "learning_rate": 1.843915667317996e-05, + "loss": 1.0257, + "step": 1208 + }, + { + "epoch": 0.2050021195421789, + "grad_norm": 0.9896355726950156, + "learning_rate": 1.843620891688686e-05, + "loss": 0.9956, + "step": 1209 + }, + { + "epoch": 0.20517168291649004, + "grad_norm": 0.6889570278504524, + "learning_rate": 1.8433258615785865e-05, + "loss": 0.8352, + "step": 1210 + }, + { + "epoch": 0.2053412462908012, + "grad_norm": 0.5971048848971885, + "learning_rate": 1.8430305770766947e-05, + "loss": 0.8228, + "step": 1211 + }, + { + "epoch": 0.20551080966511234, + "grad_norm": 1.0528300584534787, + "learning_rate": 1.842735038272084e-05, + "loss": 1.0676, + "step": 1212 + }, + { + "epoch": 0.2056803730394235, + "grad_norm": 1.0392176737674814, + "learning_rate": 1.842439245253904e-05, + "loss": 1.0844, + "step": 1213 + }, + { + "epoch": 0.20584993641373464, + "grad_norm": 1.0205284637979066, + "learning_rate": 1.842143198111382e-05, + "loss": 1.0115, + "step": 1214 + }, + { + "epoch": 0.2060194997880458, + "grad_norm": 1.0679557115598792, + "learning_rate": 1.841846896933821e-05, + "loss": 1.0161, + "step": 1215 + }, + { + "epoch": 0.20618906316235694, + "grad_norm": 1.032645681620988, + "learning_rate": 1.841550341810602e-05, + "loss": 1.0346, + "step": 1216 + }, + { + "epoch": 0.2063586265366681, + "grad_norm": 1.0465575650502466, + "learning_rate": 1.8412535328311813e-05, + "loss": 1.041, + "step": 1217 + }, + { + "epoch": 0.20652818991097924, + "grad_norm": 1.0808510629114338, + "learning_rate": 1.8409564700850923e-05, + "loss": 1.007, + "step": 1218 + }, + { + "epoch": 0.2066977532852904, + "grad_norm": 1.0515010233070772, + "learning_rate": 1.8406591536619448e-05, + "loss": 0.9982, + "step": 1219 + }, + { + "epoch": 0.20686731665960154, + "grad_norm": 1.0389335588330406, + "learning_rate": 1.8403615836514254e-05, + "loss": 1.053, + "step": 1220 + }, + { + "epoch": 0.20703688003391268, + "grad_norm": 1.0255261956531991, + "learning_rate": 1.840063760143297e-05, + "loss": 1.0294, + "step": 1221 + }, + { + "epoch": 0.20720644340822383, + "grad_norm": 1.0171415382541182, + "learning_rate": 1.8397656832273982e-05, + "loss": 1.0229, + "step": 1222 + }, + { + "epoch": 0.20737600678253498, + "grad_norm": 1.079866293032394, + "learning_rate": 1.8394673529936454e-05, + "loss": 1.0569, + "step": 1223 + }, + { + "epoch": 0.20754557015684613, + "grad_norm": 1.0158664268331172, + "learning_rate": 1.839168769532031e-05, + "loss": 1.0367, + "step": 1224 + }, + { + "epoch": 0.20771513353115728, + "grad_norm": 1.0335426450900382, + "learning_rate": 1.8388699329326237e-05, + "loss": 1.0273, + "step": 1225 + }, + { + "epoch": 0.20788469690546843, + "grad_norm": 1.0208568707443753, + "learning_rate": 1.838570843285568e-05, + "loss": 1.0213, + "step": 1226 + }, + { + "epoch": 0.20805426027977958, + "grad_norm": 1.0448858748525611, + "learning_rate": 1.8382715006810853e-05, + "loss": 1.05, + "step": 1227 + }, + { + "epoch": 0.20822382365409073, + "grad_norm": 1.0329047358240666, + "learning_rate": 1.837971905209473e-05, + "loss": 1.0184, + "step": 1228 + }, + { + "epoch": 0.20839338702840188, + "grad_norm": 1.0066746878062711, + "learning_rate": 1.8376720569611057e-05, + "loss": 0.996, + "step": 1229 + }, + { + "epoch": 0.20856295040271303, + "grad_norm": 1.035072265823961, + "learning_rate": 1.837371956026433e-05, + "loss": 1.0638, + "step": 1230 + }, + { + "epoch": 0.20873251377702418, + "grad_norm": 1.0480752588150497, + "learning_rate": 1.8370716024959812e-05, + "loss": 1.0547, + "step": 1231 + }, + { + "epoch": 0.20890207715133532, + "grad_norm": 1.006328704350034, + "learning_rate": 1.8367709964603528e-05, + "loss": 0.9756, + "step": 1232 + }, + { + "epoch": 0.20907164052564647, + "grad_norm": 1.0200811040945212, + "learning_rate": 1.8364701380102267e-05, + "loss": 1.0433, + "step": 1233 + }, + { + "epoch": 0.20924120389995762, + "grad_norm": 0.9891289209402269, + "learning_rate": 1.8361690272363583e-05, + "loss": 1.0438, + "step": 1234 + }, + { + "epoch": 0.20941076727426874, + "grad_norm": 1.02658165754576, + "learning_rate": 1.8358676642295775e-05, + "loss": 1.0768, + "step": 1235 + }, + { + "epoch": 0.2095803306485799, + "grad_norm": 1.030556044820343, + "learning_rate": 1.835566049080792e-05, + "loss": 1.0383, + "step": 1236 + }, + { + "epoch": 0.20974989402289104, + "grad_norm": 1.0630707848002783, + "learning_rate": 1.835264181880985e-05, + "loss": 1.0503, + "step": 1237 + }, + { + "epoch": 0.2099194573972022, + "grad_norm": 1.063608633053865, + "learning_rate": 1.834962062721215e-05, + "loss": 1.0648, + "step": 1238 + }, + { + "epoch": 0.21008902077151334, + "grad_norm": 1.0575464474083969, + "learning_rate": 1.8346596916926183e-05, + "loss": 1.0503, + "step": 1239 + }, + { + "epoch": 0.2102585841458245, + "grad_norm": 0.9850359702459629, + "learning_rate": 1.834357068886405e-05, + "loss": 1.0193, + "step": 1240 + }, + { + "epoch": 0.21042814752013564, + "grad_norm": 1.1044158198046359, + "learning_rate": 1.8340541943938623e-05, + "loss": 1.0213, + "step": 1241 + }, + { + "epoch": 0.2105977108944468, + "grad_norm": 1.0777778064264738, + "learning_rate": 1.8337510683063536e-05, + "loss": 1.0324, + "step": 1242 + }, + { + "epoch": 0.21076727426875794, + "grad_norm": 1.008427484039492, + "learning_rate": 1.8334476907153177e-05, + "loss": 1.0214, + "step": 1243 + }, + { + "epoch": 0.2109368376430691, + "grad_norm": 1.0786831769768992, + "learning_rate": 1.8331440617122694e-05, + "loss": 1.0185, + "step": 1244 + }, + { + "epoch": 0.21110640101738024, + "grad_norm": 1.0500271329430848, + "learning_rate": 1.8328401813887994e-05, + "loss": 1.0481, + "step": 1245 + }, + { + "epoch": 0.21127596439169138, + "grad_norm": 1.040386126783691, + "learning_rate": 1.8325360498365736e-05, + "loss": 0.9958, + "step": 1246 + }, + { + "epoch": 0.21144552776600253, + "grad_norm": 1.0613500534183766, + "learning_rate": 1.8322316671473344e-05, + "loss": 1.0321, + "step": 1247 + }, + { + "epoch": 0.21161509114031368, + "grad_norm": 1.032326639314487, + "learning_rate": 1.8319270334129e-05, + "loss": 1.0736, + "step": 1248 + }, + { + "epoch": 0.21178465451462483, + "grad_norm": 1.0136031135159627, + "learning_rate": 1.831622148725164e-05, + "loss": 1.02, + "step": 1249 + }, + { + "epoch": 0.21195421788893598, + "grad_norm": 1.002767274048018, + "learning_rate": 1.8313170131760956e-05, + "loss": 0.9849, + "step": 1250 + }, + { + "epoch": 0.21212378126324713, + "grad_norm": 1.0736693800179298, + "learning_rate": 1.8310116268577403e-05, + "loss": 1.0423, + "step": 1251 + }, + { + "epoch": 0.21229334463755828, + "grad_norm": 1.074661518154584, + "learning_rate": 1.8307059898622184e-05, + "loss": 1.0314, + "step": 1252 + }, + { + "epoch": 0.21246290801186943, + "grad_norm": 0.9913767210688825, + "learning_rate": 1.8304001022817267e-05, + "loss": 0.9776, + "step": 1253 + }, + { + "epoch": 0.21263247138618058, + "grad_norm": 1.2394338711259665, + "learning_rate": 1.8300939642085363e-05, + "loss": 1.0383, + "step": 1254 + }, + { + "epoch": 0.21280203476049173, + "grad_norm": 1.0684339650916157, + "learning_rate": 1.829787575734995e-05, + "loss": 1.0058, + "step": 1255 + }, + { + "epoch": 0.21297159813480288, + "grad_norm": 1.013255762764285, + "learning_rate": 1.8294809369535265e-05, + "loss": 1.0278, + "step": 1256 + }, + { + "epoch": 0.21314116150911402, + "grad_norm": 1.0789297314983195, + "learning_rate": 1.8291740479566286e-05, + "loss": 1.0317, + "step": 1257 + }, + { + "epoch": 0.21331072488342517, + "grad_norm": 1.0657885097766648, + "learning_rate": 1.8288669088368754e-05, + "loss": 1.0063, + "step": 1258 + }, + { + "epoch": 0.21348028825773632, + "grad_norm": 1.0475686820281338, + "learning_rate": 1.8285595196869162e-05, + "loss": 1.0165, + "step": 1259 + }, + { + "epoch": 0.21364985163204747, + "grad_norm": 1.0400619700944447, + "learning_rate": 1.828251880599476e-05, + "loss": 1.064, + "step": 1260 + }, + { + "epoch": 0.21381941500635862, + "grad_norm": 1.0364421381237077, + "learning_rate": 1.8279439916673553e-05, + "loss": 1.0253, + "step": 1261 + }, + { + "epoch": 0.21398897838066977, + "grad_norm": 0.9511034456224944, + "learning_rate": 1.8276358529834296e-05, + "loss": 0.8758, + "step": 1262 + }, + { + "epoch": 0.21415854175498092, + "grad_norm": 1.0576833409232267, + "learning_rate": 1.8273274646406494e-05, + "loss": 1.037, + "step": 1263 + }, + { + "epoch": 0.21432810512929207, + "grad_norm": 1.0640317479434482, + "learning_rate": 1.827018826732042e-05, + "loss": 1.0196, + "step": 1264 + }, + { + "epoch": 0.21449766850360322, + "grad_norm": 1.0387038290415187, + "learning_rate": 1.8267099393507083e-05, + "loss": 1.0491, + "step": 1265 + }, + { + "epoch": 0.21466723187791437, + "grad_norm": 0.9958605931383211, + "learning_rate": 1.8264008025898248e-05, + "loss": 1.0077, + "step": 1266 + }, + { + "epoch": 0.21483679525222552, + "grad_norm": 1.0251389982241148, + "learning_rate": 1.826091416542644e-05, + "loss": 0.9803, + "step": 1267 + }, + { + "epoch": 0.21500635862653666, + "grad_norm": 1.0730115868136572, + "learning_rate": 1.8257817813024927e-05, + "loss": 1.0296, + "step": 1268 + }, + { + "epoch": 0.2151759220008478, + "grad_norm": 0.9972935578412699, + "learning_rate": 1.825471896962774e-05, + "loss": 1.0299, + "step": 1269 + }, + { + "epoch": 0.21534548537515896, + "grad_norm": 1.1478958673086375, + "learning_rate": 1.825161763616965e-05, + "loss": 1.0628, + "step": 1270 + }, + { + "epoch": 0.2155150487494701, + "grad_norm": 1.0272513646849883, + "learning_rate": 1.8248513813586186e-05, + "loss": 1.0026, + "step": 1271 + }, + { + "epoch": 0.21568461212378126, + "grad_norm": 1.0427956863888466, + "learning_rate": 1.8245407502813624e-05, + "loss": 1.0443, + "step": 1272 + }, + { + "epoch": 0.2158541754980924, + "grad_norm": 0.9744718706270733, + "learning_rate": 1.8242298704788988e-05, + "loss": 1.0171, + "step": 1273 + }, + { + "epoch": 0.21602373887240356, + "grad_norm": 1.0930015929741534, + "learning_rate": 1.8239187420450063e-05, + "loss": 1.0521, + "step": 1274 + }, + { + "epoch": 0.2161933022467147, + "grad_norm": 1.0571760423315228, + "learning_rate": 1.823607365073537e-05, + "loss": 1.035, + "step": 1275 + }, + { + "epoch": 0.21636286562102586, + "grad_norm": 1.0650238078881098, + "learning_rate": 1.82329573965842e-05, + "loss": 1.0495, + "step": 1276 + }, + { + "epoch": 0.216532428995337, + "grad_norm": 1.0799671215327404, + "learning_rate": 1.8229838658936566e-05, + "loss": 1.0541, + "step": 1277 + }, + { + "epoch": 0.21670199236964816, + "grad_norm": 1.0583594622587944, + "learning_rate": 1.822671743873325e-05, + "loss": 1.0446, + "step": 1278 + }, + { + "epoch": 0.2168715557439593, + "grad_norm": 1.036215478206204, + "learning_rate": 1.822359373691578e-05, + "loss": 1.0239, + "step": 1279 + }, + { + "epoch": 0.21704111911827045, + "grad_norm": 0.9852319012496498, + "learning_rate": 1.822046755442643e-05, + "loss": 0.8963, + "step": 1280 + }, + { + "epoch": 0.2172106824925816, + "grad_norm": 1.0482599829993722, + "learning_rate": 1.821733889220822e-05, + "loss": 1.0763, + "step": 1281 + }, + { + "epoch": 0.21738024586689275, + "grad_norm": 1.0851643841701206, + "learning_rate": 1.8214207751204917e-05, + "loss": 0.9867, + "step": 1282 + }, + { + "epoch": 0.2175498092412039, + "grad_norm": 1.1063757152775864, + "learning_rate": 1.8211074132361046e-05, + "loss": 1.0446, + "step": 1283 + }, + { + "epoch": 0.21771937261551505, + "grad_norm": 0.9946630095404888, + "learning_rate": 1.820793803662187e-05, + "loss": 1.0402, + "step": 1284 + }, + { + "epoch": 0.2178889359898262, + "grad_norm": 1.0358191690781497, + "learning_rate": 1.82047994649334e-05, + "loss": 1.0398, + "step": 1285 + }, + { + "epoch": 0.21805849936413735, + "grad_norm": 1.0467543169481737, + "learning_rate": 1.8201658418242397e-05, + "loss": 1.0851, + "step": 1286 + }, + { + "epoch": 0.2182280627384485, + "grad_norm": 1.0815008351611577, + "learning_rate": 1.819851489749637e-05, + "loss": 1.0186, + "step": 1287 + }, + { + "epoch": 0.21839762611275965, + "grad_norm": 1.063526233148851, + "learning_rate": 1.8195368903643565e-05, + "loss": 0.9884, + "step": 1288 + }, + { + "epoch": 0.2185671894870708, + "grad_norm": 1.0722804241887995, + "learning_rate": 1.819222043763299e-05, + "loss": 1.0471, + "step": 1289 + }, + { + "epoch": 0.21873675286138194, + "grad_norm": 0.9996597507244086, + "learning_rate": 1.8189069500414377e-05, + "loss": 1.0027, + "step": 1290 + }, + { + "epoch": 0.2189063162356931, + "grad_norm": 1.026944656527998, + "learning_rate": 1.8185916092938226e-05, + "loss": 1.0062, + "step": 1291 + }, + { + "epoch": 0.21907587961000424, + "grad_norm": 1.0474907531783555, + "learning_rate": 1.8182760216155766e-05, + "loss": 1.0553, + "step": 1292 + }, + { + "epoch": 0.2192454429843154, + "grad_norm": 1.0885964226662685, + "learning_rate": 1.8179601871018983e-05, + "loss": 1.0336, + "step": 1293 + }, + { + "epoch": 0.21941500635862654, + "grad_norm": 1.020211263857882, + "learning_rate": 1.8176441058480594e-05, + "loss": 0.9855, + "step": 1294 + }, + { + "epoch": 0.2195845697329377, + "grad_norm": 1.0762160904577527, + "learning_rate": 1.817327777949407e-05, + "loss": 1.043, + "step": 1295 + }, + { + "epoch": 0.21975413310724884, + "grad_norm": 0.771564105076421, + "learning_rate": 1.817011203501363e-05, + "loss": 0.8648, + "step": 1296 + }, + { + "epoch": 0.21992369648156, + "grad_norm": 1.0829557616152723, + "learning_rate": 1.816694382599422e-05, + "loss": 1.0714, + "step": 1297 + }, + { + "epoch": 0.22009325985587114, + "grad_norm": 1.0379107415970839, + "learning_rate": 1.8163773153391548e-05, + "loss": 1.0217, + "step": 1298 + }, + { + "epoch": 0.2202628232301823, + "grad_norm": 1.03751594999782, + "learning_rate": 1.816060001816205e-05, + "loss": 1.0545, + "step": 1299 + }, + { + "epoch": 0.22043238660449344, + "grad_norm": 1.076376175821764, + "learning_rate": 1.8157424421262918e-05, + "loss": 1.0029, + "step": 1300 + }, + { + "epoch": 0.22060194997880458, + "grad_norm": 1.0418823182472614, + "learning_rate": 1.815424636365208e-05, + "loss": 1.0051, + "step": 1301 + }, + { + "epoch": 0.22077151335311573, + "grad_norm": 0.971363472594629, + "learning_rate": 1.81510658462882e-05, + "loss": 1.0356, + "step": 1302 + }, + { + "epoch": 0.22094107672742688, + "grad_norm": 0.6288154148296818, + "learning_rate": 1.81478828701307e-05, + "loss": 0.8789, + "step": 1303 + }, + { + "epoch": 0.22111064010173803, + "grad_norm": 1.0755855898186446, + "learning_rate": 1.8144697436139725e-05, + "loss": 1.0587, + "step": 1304 + }, + { + "epoch": 0.22128020347604918, + "grad_norm": 1.0813624759267786, + "learning_rate": 1.814150954527618e-05, + "loss": 1.0093, + "step": 1305 + }, + { + "epoch": 0.22144976685036033, + "grad_norm": 1.0004629949744186, + "learning_rate": 1.8138319198501694e-05, + "loss": 0.9996, + "step": 1306 + }, + { + "epoch": 0.22161933022467148, + "grad_norm": 1.0241480674027421, + "learning_rate": 1.8135126396778652e-05, + "loss": 0.9763, + "step": 1307 + }, + { + "epoch": 0.22178889359898263, + "grad_norm": 0.9968823716067734, + "learning_rate": 1.8131931141070166e-05, + "loss": 1.0115, + "step": 1308 + }, + { + "epoch": 0.22195845697329378, + "grad_norm": 1.148109905828744, + "learning_rate": 1.8128733432340095e-05, + "loss": 1.0829, + "step": 1309 + }, + { + "epoch": 0.22212802034760493, + "grad_norm": 1.0447363093685365, + "learning_rate": 1.8125533271553045e-05, + "loss": 1.0574, + "step": 1310 + }, + { + "epoch": 0.22229758372191608, + "grad_norm": 1.0181360585746964, + "learning_rate": 1.8122330659674345e-05, + "loss": 1.0344, + "step": 1311 + }, + { + "epoch": 0.22246714709622722, + "grad_norm": 0.9959337186795689, + "learning_rate": 1.8119125597670075e-05, + "loss": 1.0118, + "step": 1312 + }, + { + "epoch": 0.22263671047053837, + "grad_norm": 1.0676401345366784, + "learning_rate": 1.8115918086507054e-05, + "loss": 1.0299, + "step": 1313 + }, + { + "epoch": 0.22280627384484952, + "grad_norm": 1.050405734843094, + "learning_rate": 1.8112708127152838e-05, + "loss": 1.0114, + "step": 1314 + }, + { + "epoch": 0.22297583721916067, + "grad_norm": 0.9874269393789414, + "learning_rate": 1.8109495720575715e-05, + "loss": 1.0445, + "step": 1315 + }, + { + "epoch": 0.22314540059347182, + "grad_norm": 1.0589727994257228, + "learning_rate": 1.8106280867744727e-05, + "loss": 1.0504, + "step": 1316 + }, + { + "epoch": 0.22331496396778297, + "grad_norm": 1.080231520547102, + "learning_rate": 1.8103063569629635e-05, + "loss": 1.0528, + "step": 1317 + }, + { + "epoch": 0.22348452734209412, + "grad_norm": 0.9928448905011118, + "learning_rate": 1.809984382720095e-05, + "loss": 0.9939, + "step": 1318 + }, + { + "epoch": 0.22365409071640527, + "grad_norm": 1.0760900349238813, + "learning_rate": 1.809662164142992e-05, + "loss": 1.0273, + "step": 1319 + }, + { + "epoch": 0.22382365409071642, + "grad_norm": 1.0028974351579958, + "learning_rate": 1.809339701328852e-05, + "loss": 1.0063, + "step": 1320 + }, + { + "epoch": 0.22399321746502757, + "grad_norm": 0.9814947331591503, + "learning_rate": 1.8090169943749477e-05, + "loss": 1.0708, + "step": 1321 + }, + { + "epoch": 0.22416278083933872, + "grad_norm": 1.0928768555830481, + "learning_rate": 1.808694043378624e-05, + "loss": 1.0246, + "step": 1322 + }, + { + "epoch": 0.22433234421364986, + "grad_norm": 1.0501688774526776, + "learning_rate": 1.8083708484373002e-05, + "loss": 0.9693, + "step": 1323 + }, + { + "epoch": 0.224501907587961, + "grad_norm": 1.0571847202803455, + "learning_rate": 1.8080474096484693e-05, + "loss": 1.0228, + "step": 1324 + }, + { + "epoch": 0.22467147096227216, + "grad_norm": 1.023996290343834, + "learning_rate": 1.8077237271096972e-05, + "loss": 1.0371, + "step": 1325 + }, + { + "epoch": 0.22484103433658328, + "grad_norm": 1.036482684607099, + "learning_rate": 1.807399800918624e-05, + "loss": 1.02, + "step": 1326 + }, + { + "epoch": 0.22501059771089443, + "grad_norm": 1.0179098496742172, + "learning_rate": 1.807075631172963e-05, + "loss": 1.0333, + "step": 1327 + }, + { + "epoch": 0.22518016108520558, + "grad_norm": 0.6523825174397321, + "learning_rate": 1.8067512179705008e-05, + "loss": 0.873, + "step": 1328 + }, + { + "epoch": 0.22534972445951673, + "grad_norm": 1.0628711185301398, + "learning_rate": 1.8064265614090976e-05, + "loss": 1.0229, + "step": 1329 + }, + { + "epoch": 0.22551928783382788, + "grad_norm": 1.0352947449986687, + "learning_rate": 1.806101661586687e-05, + "loss": 1.0221, + "step": 1330 + }, + { + "epoch": 0.22568885120813903, + "grad_norm": 1.041949899039187, + "learning_rate": 1.8057765186012765e-05, + "loss": 1.0135, + "step": 1331 + }, + { + "epoch": 0.22585841458245018, + "grad_norm": 1.0608133556327568, + "learning_rate": 1.805451132550946e-05, + "loss": 1.0358, + "step": 1332 + }, + { + "epoch": 0.22602797795676133, + "grad_norm": 1.0243662277600578, + "learning_rate": 1.8051255035338494e-05, + "loss": 1.0302, + "step": 1333 + }, + { + "epoch": 0.22619754133107248, + "grad_norm": 1.0159980682814356, + "learning_rate": 1.8047996316482134e-05, + "loss": 1.0256, + "step": 1334 + }, + { + "epoch": 0.22636710470538363, + "grad_norm": 1.0119958948997587, + "learning_rate": 1.8044735169923387e-05, + "loss": 1.0372, + "step": 1335 + }, + { + "epoch": 0.22653666807969478, + "grad_norm": 1.0327662774912698, + "learning_rate": 1.8041471596645984e-05, + "loss": 1.0046, + "step": 1336 + }, + { + "epoch": 0.22670623145400592, + "grad_norm": 1.0169170511905408, + "learning_rate": 1.8038205597634392e-05, + "loss": 1.027, + "step": 1337 + }, + { + "epoch": 0.22687579482831707, + "grad_norm": 1.075221359976595, + "learning_rate": 1.803493717387381e-05, + "loss": 1.0472, + "step": 1338 + }, + { + "epoch": 0.22704535820262822, + "grad_norm": 1.0320008243119638, + "learning_rate": 1.803166632635017e-05, + "loss": 1.0375, + "step": 1339 + }, + { + "epoch": 0.22721492157693937, + "grad_norm": 0.9968248621821412, + "learning_rate": 1.802839305605013e-05, + "loss": 1.0138, + "step": 1340 + }, + { + "epoch": 0.22738448495125052, + "grad_norm": 0.9996993934206426, + "learning_rate": 1.8025117363961083e-05, + "loss": 1.0422, + "step": 1341 + }, + { + "epoch": 0.22755404832556167, + "grad_norm": 0.9902024394693787, + "learning_rate": 1.802183925107115e-05, + "loss": 1.0144, + "step": 1342 + }, + { + "epoch": 0.22772361169987282, + "grad_norm": 0.98652821397121, + "learning_rate": 1.8018558718369187e-05, + "loss": 1.0327, + "step": 1343 + }, + { + "epoch": 0.22789317507418397, + "grad_norm": 1.0278006009194856, + "learning_rate": 1.8015275766844774e-05, + "loss": 1.0188, + "step": 1344 + }, + { + "epoch": 0.22806273844849512, + "grad_norm": 1.0169783606855654, + "learning_rate": 1.801199039748822e-05, + "loss": 1.0574, + "step": 1345 + }, + { + "epoch": 0.22823230182280627, + "grad_norm": 1.064677203366401, + "learning_rate": 1.8008702611290578e-05, + "loss": 1.0107, + "step": 1346 + }, + { + "epoch": 0.22840186519711742, + "grad_norm": 1.0395616526835498, + "learning_rate": 1.8005412409243604e-05, + "loss": 1.0228, + "step": 1347 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 1.017112070567201, + "learning_rate": 1.800211979233981e-05, + "loss": 0.9958, + "step": 1348 + }, + { + "epoch": 0.2287409919457397, + "grad_norm": 1.009005200988391, + "learning_rate": 1.7998824761572415e-05, + "loss": 1.0149, + "step": 1349 + }, + { + "epoch": 0.22891055532005086, + "grad_norm": 1.038513230379797, + "learning_rate": 1.799552731793538e-05, + "loss": 0.9939, + "step": 1350 + }, + { + "epoch": 0.229080118694362, + "grad_norm": 1.0290876338440214, + "learning_rate": 1.7992227462423385e-05, + "loss": 1.0653, + "step": 1351 + }, + { + "epoch": 0.22924968206867316, + "grad_norm": 1.0796137744913767, + "learning_rate": 1.7988925196031845e-05, + "loss": 1.0164, + "step": 1352 + }, + { + "epoch": 0.2294192454429843, + "grad_norm": 0.9998114488318148, + "learning_rate": 1.7985620519756897e-05, + "loss": 1.0528, + "step": 1353 + }, + { + "epoch": 0.22958880881729546, + "grad_norm": 0.6723301352068823, + "learning_rate": 1.7982313434595405e-05, + "loss": 0.8593, + "step": 1354 + }, + { + "epoch": 0.2297583721916066, + "grad_norm": 1.008352585274843, + "learning_rate": 1.7979003941544965e-05, + "loss": 1.0075, + "step": 1355 + }, + { + "epoch": 0.22992793556591776, + "grad_norm": 1.0688361465719443, + "learning_rate": 1.7975692041603893e-05, + "loss": 1.0256, + "step": 1356 + }, + { + "epoch": 0.2300974989402289, + "grad_norm": 1.035851941096092, + "learning_rate": 1.7972377735771234e-05, + "loss": 1.0351, + "step": 1357 + }, + { + "epoch": 0.23026706231454006, + "grad_norm": 0.9876668321277083, + "learning_rate": 1.7969061025046758e-05, + "loss": 1.0302, + "step": 1358 + }, + { + "epoch": 0.2304366256888512, + "grad_norm": 1.0301921163719239, + "learning_rate": 1.7965741910430963e-05, + "loss": 1.0382, + "step": 1359 + }, + { + "epoch": 0.23060618906316235, + "grad_norm": 1.0392955779400515, + "learning_rate": 1.7962420392925066e-05, + "loss": 1.0136, + "step": 1360 + }, + { + "epoch": 0.2307757524374735, + "grad_norm": 0.9853785816809418, + "learning_rate": 1.7959096473531016e-05, + "loss": 1.0114, + "step": 1361 + }, + { + "epoch": 0.23094531581178465, + "grad_norm": 1.0352520425664744, + "learning_rate": 1.7955770153251482e-05, + "loss": 1.0451, + "step": 1362 + }, + { + "epoch": 0.2311148791860958, + "grad_norm": 1.0478570736929989, + "learning_rate": 1.795244143308986e-05, + "loss": 1.029, + "step": 1363 + }, + { + "epoch": 0.23128444256040695, + "grad_norm": 1.0891369350528002, + "learning_rate": 1.7949110314050267e-05, + "loss": 1.0685, + "step": 1364 + }, + { + "epoch": 0.2314540059347181, + "grad_norm": 1.032754626337483, + "learning_rate": 1.7945776797137544e-05, + "loss": 1.0283, + "step": 1365 + }, + { + "epoch": 0.23162356930902925, + "grad_norm": 1.0519404009097395, + "learning_rate": 1.794244088335726e-05, + "loss": 1.0383, + "step": 1366 + }, + { + "epoch": 0.2317931326833404, + "grad_norm": 1.1037782436967822, + "learning_rate": 1.7939102573715698e-05, + "loss": 0.9956, + "step": 1367 + }, + { + "epoch": 0.23196269605765155, + "grad_norm": 1.005907467681158, + "learning_rate": 1.7935761869219876e-05, + "loss": 0.9945, + "step": 1368 + }, + { + "epoch": 0.2321322594319627, + "grad_norm": 1.0201308049072142, + "learning_rate": 1.7932418770877523e-05, + "loss": 1.0069, + "step": 1369 + }, + { + "epoch": 0.23230182280627384, + "grad_norm": 0.9565666578932192, + "learning_rate": 1.7929073279697096e-05, + "loss": 1.0133, + "step": 1370 + }, + { + "epoch": 0.232471386180585, + "grad_norm": 1.09555129926372, + "learning_rate": 1.792572539668777e-05, + "loss": 1.073, + "step": 1371 + }, + { + "epoch": 0.23264094955489614, + "grad_norm": 1.0701035107289367, + "learning_rate": 1.792237512285945e-05, + "loss": 1.0468, + "step": 1372 + }, + { + "epoch": 0.2328105129292073, + "grad_norm": 0.9909125772872428, + "learning_rate": 1.7919022459222754e-05, + "loss": 1.0301, + "step": 1373 + }, + { + "epoch": 0.23298007630351844, + "grad_norm": 1.0521920899966322, + "learning_rate": 1.7915667406789018e-05, + "loss": 1.0229, + "step": 1374 + }, + { + "epoch": 0.2331496396778296, + "grad_norm": 0.9992184122827568, + "learning_rate": 1.791230996657031e-05, + "loss": 1.0685, + "step": 1375 + }, + { + "epoch": 0.23331920305214074, + "grad_norm": 1.0648044025187282, + "learning_rate": 1.7908950139579406e-05, + "loss": 1.0525, + "step": 1376 + }, + { + "epoch": 0.2334887664264519, + "grad_norm": 1.0270826284903836, + "learning_rate": 1.7905587926829815e-05, + "loss": 1.006, + "step": 1377 + }, + { + "epoch": 0.23365832980076304, + "grad_norm": 0.994109347184484, + "learning_rate": 1.790222332933575e-05, + "loss": 1.015, + "step": 1378 + }, + { + "epoch": 0.2338278931750742, + "grad_norm": 1.0662296187363034, + "learning_rate": 1.7898856348112163e-05, + "loss": 1.0674, + "step": 1379 + }, + { + "epoch": 0.23399745654938534, + "grad_norm": 0.9902622887080254, + "learning_rate": 1.7895486984174707e-05, + "loss": 0.9842, + "step": 1380 + }, + { + "epoch": 0.23416701992369648, + "grad_norm": 1.0268383526807514, + "learning_rate": 1.7892115238539757e-05, + "loss": 1.0194, + "step": 1381 + }, + { + "epoch": 0.23433658329800763, + "grad_norm": 0.9469023642741146, + "learning_rate": 1.7888741112224418e-05, + "loss": 0.9809, + "step": 1382 + }, + { + "epoch": 0.23450614667231878, + "grad_norm": 1.017893828259254, + "learning_rate": 1.7885364606246503e-05, + "loss": 1.0395, + "step": 1383 + }, + { + "epoch": 0.23467571004662993, + "grad_norm": 0.9887055191344767, + "learning_rate": 1.7881985721624544e-05, + "loss": 1.0122, + "step": 1384 + }, + { + "epoch": 0.23484527342094108, + "grad_norm": 1.029189244899999, + "learning_rate": 1.7878604459377795e-05, + "loss": 1.0166, + "step": 1385 + }, + { + "epoch": 0.23501483679525223, + "grad_norm": 1.0404898021636972, + "learning_rate": 1.787522082052622e-05, + "loss": 1.0386, + "step": 1386 + }, + { + "epoch": 0.23518440016956338, + "grad_norm": 1.0061707856594195, + "learning_rate": 1.7871834806090502e-05, + "loss": 1.0598, + "step": 1387 + }, + { + "epoch": 0.23535396354387453, + "grad_norm": 1.0029441355436106, + "learning_rate": 1.786844641709205e-05, + "loss": 0.9995, + "step": 1388 + }, + { + "epoch": 0.23552352691818568, + "grad_norm": 1.0786000692928612, + "learning_rate": 1.7865055654552977e-05, + "loss": 1.0245, + "step": 1389 + }, + { + "epoch": 0.23569309029249683, + "grad_norm": 1.0870057639557795, + "learning_rate": 1.7861662519496116e-05, + "loss": 1.0327, + "step": 1390 + }, + { + "epoch": 0.23586265366680798, + "grad_norm": 1.019576780159488, + "learning_rate": 1.785826701294502e-05, + "loss": 1.0213, + "step": 1391 + }, + { + "epoch": 0.23603221704111912, + "grad_norm": 0.9620448442900429, + "learning_rate": 1.7854869135923946e-05, + "loss": 0.9859, + "step": 1392 + }, + { + "epoch": 0.23620178041543027, + "grad_norm": 1.0430154291142868, + "learning_rate": 1.7851468889457883e-05, + "loss": 1.0644, + "step": 1393 + }, + { + "epoch": 0.23637134378974142, + "grad_norm": 0.9963760284863703, + "learning_rate": 1.7848066274572523e-05, + "loss": 1.0362, + "step": 1394 + }, + { + "epoch": 0.23654090716405257, + "grad_norm": 0.9959007204573155, + "learning_rate": 1.7844661292294274e-05, + "loss": 0.9947, + "step": 1395 + }, + { + "epoch": 0.23671047053836372, + "grad_norm": 1.0801377559183405, + "learning_rate": 1.7841253943650258e-05, + "loss": 1.0365, + "step": 1396 + }, + { + "epoch": 0.23688003391267487, + "grad_norm": 1.0287693526888502, + "learning_rate": 1.7837844229668312e-05, + "loss": 1.0319, + "step": 1397 + }, + { + "epoch": 0.23704959728698602, + "grad_norm": 1.0454540554090133, + "learning_rate": 1.7834432151376992e-05, + "loss": 1.0404, + "step": 1398 + }, + { + "epoch": 0.23721916066129717, + "grad_norm": 1.0259811923709794, + "learning_rate": 1.7831017709805555e-05, + "loss": 1.0154, + "step": 1399 + }, + { + "epoch": 0.23738872403560832, + "grad_norm": 1.0695287937242937, + "learning_rate": 1.782760090598398e-05, + "loss": 1.0582, + "step": 1400 + }, + { + "epoch": 0.23755828740991947, + "grad_norm": 1.0273298841891763, + "learning_rate": 1.7824181740942958e-05, + "loss": 1.0099, + "step": 1401 + }, + { + "epoch": 0.23772785078423062, + "grad_norm": 1.0320346777060623, + "learning_rate": 1.7820760215713885e-05, + "loss": 1.0245, + "step": 1402 + }, + { + "epoch": 0.23789741415854176, + "grad_norm": 1.008865581468882, + "learning_rate": 1.7817336331328882e-05, + "loss": 1.0403, + "step": 1403 + }, + { + "epoch": 0.2380669775328529, + "grad_norm": 1.124825571650674, + "learning_rate": 1.781391008882077e-05, + "loss": 1.0763, + "step": 1404 + }, + { + "epoch": 0.23823654090716406, + "grad_norm": 0.9931723201014185, + "learning_rate": 1.7810481489223082e-05, + "loss": 1.0275, + "step": 1405 + }, + { + "epoch": 0.2384061042814752, + "grad_norm": 1.048460249199628, + "learning_rate": 1.7807050533570073e-05, + "loss": 1.0369, + "step": 1406 + }, + { + "epoch": 0.23857566765578636, + "grad_norm": 1.0509585873036642, + "learning_rate": 1.7803617222896696e-05, + "loss": 1.0562, + "step": 1407 + }, + { + "epoch": 0.2387452310300975, + "grad_norm": 0.9601662419015582, + "learning_rate": 1.780018155823862e-05, + "loss": 1.023, + "step": 1408 + }, + { + "epoch": 0.23891479440440866, + "grad_norm": 1.026072506803247, + "learning_rate": 1.7796743540632226e-05, + "loss": 1.0418, + "step": 1409 + }, + { + "epoch": 0.2390843577787198, + "grad_norm": 0.9951475157202543, + "learning_rate": 1.7793303171114597e-05, + "loss": 1.0361, + "step": 1410 + }, + { + "epoch": 0.23925392115303096, + "grad_norm": 0.9598733138895736, + "learning_rate": 1.778986045072354e-05, + "loss": 1.0064, + "step": 1411 + }, + { + "epoch": 0.2394234845273421, + "grad_norm": 0.9629564424867945, + "learning_rate": 1.778641538049755e-05, + "loss": 0.9569, + "step": 1412 + }, + { + "epoch": 0.23959304790165326, + "grad_norm": 1.0512995422784355, + "learning_rate": 1.7782967961475855e-05, + "loss": 1.0509, + "step": 1413 + }, + { + "epoch": 0.2397626112759644, + "grad_norm": 1.0356932174970064, + "learning_rate": 1.7779518194698374e-05, + "loss": 1.0327, + "step": 1414 + }, + { + "epoch": 0.23993217465027555, + "grad_norm": 1.0226000830433215, + "learning_rate": 1.7776066081205738e-05, + "loss": 1.0375, + "step": 1415 + }, + { + "epoch": 0.2401017380245867, + "grad_norm": 0.9807925229585885, + "learning_rate": 1.777261162203929e-05, + "loss": 0.9846, + "step": 1416 + }, + { + "epoch": 0.24027130139889785, + "grad_norm": 1.0554528099124916, + "learning_rate": 1.776915481824107e-05, + "loss": 1.0207, + "step": 1417 + }, + { + "epoch": 0.24044086477320897, + "grad_norm": 1.0437309139656565, + "learning_rate": 1.776569567085385e-05, + "loss": 1.0325, + "step": 1418 + }, + { + "epoch": 0.24061042814752012, + "grad_norm": 0.9785323164536257, + "learning_rate": 1.7762234180921078e-05, + "loss": 1.0016, + "step": 1419 + }, + { + "epoch": 0.24077999152183127, + "grad_norm": 0.9638615457405866, + "learning_rate": 1.7758770349486924e-05, + "loss": 0.9759, + "step": 1420 + }, + { + "epoch": 0.24094955489614242, + "grad_norm": 1.0567298938845457, + "learning_rate": 1.775530417759627e-05, + "loss": 0.9877, + "step": 1421 + }, + { + "epoch": 0.24111911827045357, + "grad_norm": 1.0627658698166225, + "learning_rate": 1.7751835666294694e-05, + "loss": 0.9881, + "step": 1422 + }, + { + "epoch": 0.24128868164476472, + "grad_norm": 1.0592558454321745, + "learning_rate": 1.7748364816628482e-05, + "loss": 1.0426, + "step": 1423 + }, + { + "epoch": 0.24145824501907587, + "grad_norm": 0.9883069952704082, + "learning_rate": 1.7744891629644627e-05, + "loss": 1.0033, + "step": 1424 + }, + { + "epoch": 0.24162780839338702, + "grad_norm": 1.0812288136016455, + "learning_rate": 1.7741416106390828e-05, + "loss": 1.0382, + "step": 1425 + }, + { + "epoch": 0.24179737176769817, + "grad_norm": 1.1473853342550038, + "learning_rate": 1.773793824791548e-05, + "loss": 1.0545, + "step": 1426 + }, + { + "epoch": 0.24196693514200932, + "grad_norm": 1.0549135788350918, + "learning_rate": 1.77344580552677e-05, + "loss": 1.0609, + "step": 1427 + }, + { + "epoch": 0.24213649851632046, + "grad_norm": 1.0080150661065357, + "learning_rate": 1.7730975529497292e-05, + "loss": 1.033, + "step": 1428 + }, + { + "epoch": 0.2423060618906316, + "grad_norm": 1.0346463488518975, + "learning_rate": 1.772749067165477e-05, + "loss": 1.0938, + "step": 1429 + }, + { + "epoch": 0.24247562526494276, + "grad_norm": 1.0742667851191425, + "learning_rate": 1.7724003482791358e-05, + "loss": 1.0572, + "step": 1430 + }, + { + "epoch": 0.2426451886392539, + "grad_norm": 1.0007468586433665, + "learning_rate": 1.772051396395897e-05, + "loss": 1.0146, + "step": 1431 + }, + { + "epoch": 0.24281475201356506, + "grad_norm": 0.9788043975865908, + "learning_rate": 1.7717022116210234e-05, + "loss": 1.0153, + "step": 1432 + }, + { + "epoch": 0.2429843153878762, + "grad_norm": 1.0664870046562671, + "learning_rate": 1.7713527940598473e-05, + "loss": 1.0259, + "step": 1433 + }, + { + "epoch": 0.24315387876218736, + "grad_norm": 1.053225021916283, + "learning_rate": 1.771003143817772e-05, + "loss": 1.0231, + "step": 1434 + }, + { + "epoch": 0.2433234421364985, + "grad_norm": 1.0747732929688687, + "learning_rate": 1.77065326100027e-05, + "loss": 1.046, + "step": 1435 + }, + { + "epoch": 0.24349300551080966, + "grad_norm": 1.0289159349723926, + "learning_rate": 1.7703031457128853e-05, + "loss": 1.0268, + "step": 1436 + }, + { + "epoch": 0.2436625688851208, + "grad_norm": 1.0307347972055092, + "learning_rate": 1.7699527980612306e-05, + "loss": 1.0333, + "step": 1437 + }, + { + "epoch": 0.24383213225943196, + "grad_norm": 1.05095360333709, + "learning_rate": 1.7696022181509892e-05, + "loss": 0.9914, + "step": 1438 + }, + { + "epoch": 0.2440016956337431, + "grad_norm": 1.0181522908961647, + "learning_rate": 1.769251406087915e-05, + "loss": 1.0537, + "step": 1439 + }, + { + "epoch": 0.24417125900805425, + "grad_norm": 1.0404669561076565, + "learning_rate": 1.768900361977832e-05, + "loss": 1.0421, + "step": 1440 + }, + { + "epoch": 0.2443408223823654, + "grad_norm": 0.9876662088756392, + "learning_rate": 1.7685490859266324e-05, + "loss": 1.008, + "step": 1441 + }, + { + "epoch": 0.24451038575667655, + "grad_norm": 0.653233955255012, + "learning_rate": 1.7681975780402807e-05, + "loss": 0.8767, + "step": 1442 + }, + { + "epoch": 0.2446799491309877, + "grad_norm": 0.9906944457386946, + "learning_rate": 1.76784583842481e-05, + "loss": 1.033, + "step": 1443 + }, + { + "epoch": 0.24484951250529885, + "grad_norm": 0.9839244572378357, + "learning_rate": 1.7674938671863237e-05, + "loss": 1.0132, + "step": 1444 + }, + { + "epoch": 0.24501907587961, + "grad_norm": 1.0107933846377484, + "learning_rate": 1.7671416644309945e-05, + "loss": 1.0077, + "step": 1445 + }, + { + "epoch": 0.24518863925392115, + "grad_norm": 0.9952062322872111, + "learning_rate": 1.7667892302650665e-05, + "loss": 1.0243, + "step": 1446 + }, + { + "epoch": 0.2453582026282323, + "grad_norm": 0.9775603072340857, + "learning_rate": 1.7664365647948513e-05, + "loss": 1.0094, + "step": 1447 + }, + { + "epoch": 0.24552776600254345, + "grad_norm": 0.9816259213997773, + "learning_rate": 1.7660836681267323e-05, + "loss": 1.0049, + "step": 1448 + }, + { + "epoch": 0.2456973293768546, + "grad_norm": 1.0064737928710894, + "learning_rate": 1.7657305403671618e-05, + "loss": 1.0466, + "step": 1449 + }, + { + "epoch": 0.24586689275116574, + "grad_norm": 1.016947092204689, + "learning_rate": 1.7653771816226614e-05, + "loss": 1.0575, + "step": 1450 + }, + { + "epoch": 0.2460364561254769, + "grad_norm": 1.0926432461997742, + "learning_rate": 1.7650235919998234e-05, + "loss": 1.0041, + "step": 1451 + }, + { + "epoch": 0.24620601949978804, + "grad_norm": 1.0103729197371947, + "learning_rate": 1.764669771605309e-05, + "loss": 1.0177, + "step": 1452 + }, + { + "epoch": 0.2463755828740992, + "grad_norm": 1.0125724351885979, + "learning_rate": 1.7643157205458483e-05, + "loss": 1.0049, + "step": 1453 + }, + { + "epoch": 0.24654514624841034, + "grad_norm": 1.0062474149891296, + "learning_rate": 1.7639614389282432e-05, + "loss": 1.0237, + "step": 1454 + }, + { + "epoch": 0.2467147096227215, + "grad_norm": 1.0130670679939415, + "learning_rate": 1.7636069268593633e-05, + "loss": 0.9878, + "step": 1455 + }, + { + "epoch": 0.24688427299703264, + "grad_norm": 1.0288671551407782, + "learning_rate": 1.7632521844461482e-05, + "loss": 1.0209, + "step": 1456 + }, + { + "epoch": 0.2470538363713438, + "grad_norm": 1.0577760257963837, + "learning_rate": 1.762897211795607e-05, + "loss": 1.0557, + "step": 1457 + }, + { + "epoch": 0.24722339974565494, + "grad_norm": 0.9963844921034853, + "learning_rate": 1.7625420090148182e-05, + "loss": 1.0217, + "step": 1458 + }, + { + "epoch": 0.2473929631199661, + "grad_norm": 1.0954657623631543, + "learning_rate": 1.76218657621093e-05, + "loss": 1.0305, + "step": 1459 + }, + { + "epoch": 0.24756252649427724, + "grad_norm": 1.0921685324811647, + "learning_rate": 1.7618309134911594e-05, + "loss": 0.9985, + "step": 1460 + }, + { + "epoch": 0.24773208986858838, + "grad_norm": 0.9821690248562863, + "learning_rate": 1.7614750209627938e-05, + "loss": 1.023, + "step": 1461 + }, + { + "epoch": 0.24790165324289953, + "grad_norm": 0.9533657952772474, + "learning_rate": 1.7611188987331885e-05, + "loss": 1.0148, + "step": 1462 + }, + { + "epoch": 0.24807121661721068, + "grad_norm": 0.6797986381439817, + "learning_rate": 1.7607625469097697e-05, + "loss": 0.8792, + "step": 1463 + }, + { + "epoch": 0.24824077999152183, + "grad_norm": 1.097785575309812, + "learning_rate": 1.7604059656000313e-05, + "loss": 1.0051, + "step": 1464 + }, + { + "epoch": 0.24841034336583298, + "grad_norm": 1.0323510046183746, + "learning_rate": 1.760049154911537e-05, + "loss": 1.0202, + "step": 1465 + }, + { + "epoch": 0.24857990674014413, + "grad_norm": 0.6110246269673387, + "learning_rate": 1.7596921149519203e-05, + "loss": 0.8005, + "step": 1466 + }, + { + "epoch": 0.24874947011445528, + "grad_norm": 0.5979707859141831, + "learning_rate": 1.7593348458288834e-05, + "loss": 0.8097, + "step": 1467 + }, + { + "epoch": 0.24891903348876643, + "grad_norm": 1.0474172632435463, + "learning_rate": 1.7589773476501974e-05, + "loss": 1.0267, + "step": 1468 + }, + { + "epoch": 0.24908859686307758, + "grad_norm": 1.0086100453838633, + "learning_rate": 1.758619620523703e-05, + "loss": 1.0472, + "step": 1469 + }, + { + "epoch": 0.24925816023738873, + "grad_norm": 0.9952254966437639, + "learning_rate": 1.758261664557309e-05, + "loss": 1.0301, + "step": 1470 + }, + { + "epoch": 0.24942772361169988, + "grad_norm": 0.963171841339179, + "learning_rate": 1.7579034798589942e-05, + "loss": 0.9986, + "step": 1471 + }, + { + "epoch": 0.24959728698601102, + "grad_norm": 1.044690296682734, + "learning_rate": 1.7575450665368068e-05, + "loss": 1.0456, + "step": 1472 + }, + { + "epoch": 0.24976685036032217, + "grad_norm": 1.0244138080273024, + "learning_rate": 1.7571864246988623e-05, + "loss": 1.0053, + "step": 1473 + }, + { + "epoch": 0.24993641373463332, + "grad_norm": 0.9706876942255614, + "learning_rate": 1.7568275544533464e-05, + "loss": 0.9923, + "step": 1474 + }, + { + "epoch": 0.25010597710894444, + "grad_norm": 0.6339118111809177, + "learning_rate": 1.7564684559085138e-05, + "loss": 0.8381, + "step": 1475 + }, + { + "epoch": 0.2502755404832556, + "grad_norm": 1.0925280850352308, + "learning_rate": 1.7561091291726867e-05, + "loss": 1.012, + "step": 1476 + }, + { + "epoch": 0.25044510385756674, + "grad_norm": 1.0441971739600981, + "learning_rate": 1.7557495743542586e-05, + "loss": 1.0229, + "step": 1477 + }, + { + "epoch": 0.2506146672318779, + "grad_norm": 1.0074490589085838, + "learning_rate": 1.755389791561689e-05, + "loss": 1.0254, + "step": 1478 + }, + { + "epoch": 0.25078423060618904, + "grad_norm": 1.1152970465612853, + "learning_rate": 1.755029780903508e-05, + "loss": 1.0299, + "step": 1479 + }, + { + "epoch": 0.2509537939805002, + "grad_norm": 1.091375366635331, + "learning_rate": 1.7546695424883133e-05, + "loss": 1.0269, + "step": 1480 + }, + { + "epoch": 0.25112335735481134, + "grad_norm": 0.7059197033064152, + "learning_rate": 1.7543090764247726e-05, + "loss": 0.8515, + "step": 1481 + }, + { + "epoch": 0.2512929207291225, + "grad_norm": 1.0165101370600038, + "learning_rate": 1.7539483828216216e-05, + "loss": 1.0314, + "step": 1482 + }, + { + "epoch": 0.25146248410343364, + "grad_norm": 1.032202986974254, + "learning_rate": 1.753587461787664e-05, + "loss": 1.0346, + "step": 1483 + }, + { + "epoch": 0.2516320474777448, + "grad_norm": 1.0576941696371032, + "learning_rate": 1.7532263134317735e-05, + "loss": 0.9761, + "step": 1484 + }, + { + "epoch": 0.25180161085205593, + "grad_norm": 0.9930654952122822, + "learning_rate": 1.7528649378628912e-05, + "loss": 1.0199, + "step": 1485 + }, + { + "epoch": 0.2519711742263671, + "grad_norm": 1.0162042188951854, + "learning_rate": 1.7525033351900268e-05, + "loss": 1.0651, + "step": 1486 + }, + { + "epoch": 0.25214073760067823, + "grad_norm": 1.0107672579984182, + "learning_rate": 1.7521415055222593e-05, + "loss": 1.0109, + "step": 1487 + }, + { + "epoch": 0.2523103009749894, + "grad_norm": 1.0156062703495796, + "learning_rate": 1.7517794489687355e-05, + "loss": 1.014, + "step": 1488 + }, + { + "epoch": 0.25247986434930053, + "grad_norm": 0.675215900675686, + "learning_rate": 1.751417165638671e-05, + "loss": 0.8449, + "step": 1489 + }, + { + "epoch": 0.2526494277236117, + "grad_norm": 1.0478988523805661, + "learning_rate": 1.75105465564135e-05, + "loss": 1.0321, + "step": 1490 + }, + { + "epoch": 0.25281899109792283, + "grad_norm": 0.7120965949161628, + "learning_rate": 1.7506919190861238e-05, + "loss": 0.9697, + "step": 1491 + }, + { + "epoch": 0.252988554472234, + "grad_norm": 1.1155422342885284, + "learning_rate": 1.7503289560824135e-05, + "loss": 1.0334, + "step": 1492 + }, + { + "epoch": 0.2531581178465451, + "grad_norm": 1.0764515352550739, + "learning_rate": 1.7499657667397083e-05, + "loss": 1.04, + "step": 1493 + }, + { + "epoch": 0.2533276812208563, + "grad_norm": 1.040372474806951, + "learning_rate": 1.749602351167565e-05, + "loss": 0.9859, + "step": 1494 + }, + { + "epoch": 0.2534972445951674, + "grad_norm": 1.1019232397348715, + "learning_rate": 1.7492387094756088e-05, + "loss": 1.0431, + "step": 1495 + }, + { + "epoch": 0.2536668079694786, + "grad_norm": 0.9934189371839818, + "learning_rate": 1.7488748417735334e-05, + "loss": 1.0016, + "step": 1496 + }, + { + "epoch": 0.2538363713437897, + "grad_norm": 1.0281861166727788, + "learning_rate": 1.7485107481711014e-05, + "loss": 1.0147, + "step": 1497 + }, + { + "epoch": 0.2540059347181009, + "grad_norm": 1.0050445262239223, + "learning_rate": 1.7481464287781416e-05, + "loss": 0.9724, + "step": 1498 + }, + { + "epoch": 0.254175498092412, + "grad_norm": 1.0193714164544145, + "learning_rate": 1.7477818837045527e-05, + "loss": 1.0081, + "step": 1499 + }, + { + "epoch": 0.2543450614667232, + "grad_norm": 0.9632133499931924, + "learning_rate": 1.7474171130603007e-05, + "loss": 1.0197, + "step": 1500 + }, + { + "epoch": 0.2545146248410343, + "grad_norm": 0.9946302411895425, + "learning_rate": 1.7470521169554196e-05, + "loss": 1.0214, + "step": 1501 + }, + { + "epoch": 0.2546841882153455, + "grad_norm": 1.010551942615164, + "learning_rate": 1.7466868955000117e-05, + "loss": 1.0456, + "step": 1502 + }, + { + "epoch": 0.2548537515896566, + "grad_norm": 1.0323430033612593, + "learning_rate": 1.7463214488042472e-05, + "loss": 1.0107, + "step": 1503 + }, + { + "epoch": 0.2550233149639678, + "grad_norm": 1.0022888503923253, + "learning_rate": 1.745955776978364e-05, + "loss": 1.0061, + "step": 1504 + }, + { + "epoch": 0.2551928783382789, + "grad_norm": 0.9922662665684321, + "learning_rate": 1.7455898801326685e-05, + "loss": 1.032, + "step": 1505 + }, + { + "epoch": 0.2553624417125901, + "grad_norm": 1.034503521327479, + "learning_rate": 1.7452237583775344e-05, + "loss": 0.9921, + "step": 1506 + }, + { + "epoch": 0.2555320050869012, + "grad_norm": 1.0147133041475178, + "learning_rate": 1.7448574118234032e-05, + "loss": 0.9865, + "step": 1507 + }, + { + "epoch": 0.2557015684612124, + "grad_norm": 1.0360809115409286, + "learning_rate": 1.7444908405807845e-05, + "loss": 1.068, + "step": 1508 + }, + { + "epoch": 0.2558711318355235, + "grad_norm": 1.0173743081161042, + "learning_rate": 1.7441240447602565e-05, + "loss": 1.0674, + "step": 1509 + }, + { + "epoch": 0.2560406952098347, + "grad_norm": 0.9952798221689829, + "learning_rate": 1.7437570244724625e-05, + "loss": 1.0165, + "step": 1510 + }, + { + "epoch": 0.2562102585841458, + "grad_norm": 1.0105035368109276, + "learning_rate": 1.743389779828117e-05, + "loss": 1.0274, + "step": 1511 + }, + { + "epoch": 0.256379821958457, + "grad_norm": 1.0025947852506194, + "learning_rate": 1.7430223109379995e-05, + "loss": 0.9871, + "step": 1512 + }, + { + "epoch": 0.2565493853327681, + "grad_norm": 0.9602305372254146, + "learning_rate": 1.742654617912958e-05, + "loss": 0.9913, + "step": 1513 + }, + { + "epoch": 0.2567189487070793, + "grad_norm": 0.9220114231493487, + "learning_rate": 1.7422867008639094e-05, + "loss": 0.9976, + "step": 1514 + }, + { + "epoch": 0.2568885120813904, + "grad_norm": 1.0249573837742179, + "learning_rate": 1.7419185599018356e-05, + "loss": 1.0361, + "step": 1515 + }, + { + "epoch": 0.2570580754557016, + "grad_norm": 0.9551299605125825, + "learning_rate": 1.741550195137788e-05, + "loss": 0.9928, + "step": 1516 + }, + { + "epoch": 0.2572276388300127, + "grad_norm": 1.0260119822193878, + "learning_rate": 1.7411816066828852e-05, + "loss": 1.0083, + "step": 1517 + }, + { + "epoch": 0.2573972022043239, + "grad_norm": 0.9683669860935937, + "learning_rate": 1.7408127946483127e-05, + "loss": 0.9782, + "step": 1518 + }, + { + "epoch": 0.257566765578635, + "grad_norm": 1.006290752282903, + "learning_rate": 1.7404437591453237e-05, + "loss": 0.9743, + "step": 1519 + }, + { + "epoch": 0.2577363289529462, + "grad_norm": 1.0454112580276205, + "learning_rate": 1.7400745002852388e-05, + "loss": 1.0318, + "step": 1520 + }, + { + "epoch": 0.2579058923272573, + "grad_norm": 1.0113736514684712, + "learning_rate": 1.7397050181794463e-05, + "loss": 0.9929, + "step": 1521 + }, + { + "epoch": 0.2580754557015685, + "grad_norm": 0.9911816544299251, + "learning_rate": 1.7393353129394017e-05, + "loss": 1.0263, + "step": 1522 + }, + { + "epoch": 0.2582450190758796, + "grad_norm": 0.9713589517498729, + "learning_rate": 1.7389653846766276e-05, + "loss": 1.0229, + "step": 1523 + }, + { + "epoch": 0.2584145824501908, + "grad_norm": 0.9713910416200988, + "learning_rate": 1.7385952335027136e-05, + "loss": 1.0184, + "step": 1524 + }, + { + "epoch": 0.2585841458245019, + "grad_norm": 1.0216158997121034, + "learning_rate": 1.7382248595293175e-05, + "loss": 1.0003, + "step": 1525 + }, + { + "epoch": 0.2587537091988131, + "grad_norm": 1.0389125520173255, + "learning_rate": 1.7378542628681634e-05, + "loss": 1.0148, + "step": 1526 + }, + { + "epoch": 0.2589232725731242, + "grad_norm": 0.9773163912815696, + "learning_rate": 1.7374834436310427e-05, + "loss": 1.039, + "step": 1527 + }, + { + "epoch": 0.2590928359474354, + "grad_norm": 1.0232424540151142, + "learning_rate": 1.7371124019298148e-05, + "loss": 0.994, + "step": 1528 + }, + { + "epoch": 0.2592623993217465, + "grad_norm": 1.0103535358616882, + "learning_rate": 1.736741137876405e-05, + "loss": 1.0279, + "step": 1529 + }, + { + "epoch": 0.25943196269605767, + "grad_norm": 1.0634746851039283, + "learning_rate": 1.7363696515828062e-05, + "loss": 1.0381, + "step": 1530 + }, + { + "epoch": 0.2596015260703688, + "grad_norm": 1.1023515103501251, + "learning_rate": 1.735997943161079e-05, + "loss": 1.0333, + "step": 1531 + }, + { + "epoch": 0.25977108944467997, + "grad_norm": 1.0263854678361861, + "learning_rate": 1.7356260127233496e-05, + "loss": 1.0029, + "step": 1532 + }, + { + "epoch": 0.2599406528189911, + "grad_norm": 1.032283138861617, + "learning_rate": 1.7352538603818124e-05, + "loss": 1.0434, + "step": 1533 + }, + { + "epoch": 0.26011021619330227, + "grad_norm": 1.0337668282224568, + "learning_rate": 1.7348814862487277e-05, + "loss": 1.0128, + "step": 1534 + }, + { + "epoch": 0.2602797795676134, + "grad_norm": 1.051583157278923, + "learning_rate": 1.734508890436424e-05, + "loss": 1.0444, + "step": 1535 + }, + { + "epoch": 0.26044934294192457, + "grad_norm": 1.0070075004451315, + "learning_rate": 1.7341360730572958e-05, + "loss": 1.0175, + "step": 1536 + }, + { + "epoch": 0.2606189063162357, + "grad_norm": 1.0494492020041937, + "learning_rate": 1.733763034223804e-05, + "loss": 1.0363, + "step": 1537 + }, + { + "epoch": 0.26078846969054686, + "grad_norm": 1.041464603137685, + "learning_rate": 1.7333897740484776e-05, + "loss": 1.0528, + "step": 1538 + }, + { + "epoch": 0.260958033064858, + "grad_norm": 0.9999454640958684, + "learning_rate": 1.7330162926439116e-05, + "loss": 1.0224, + "step": 1539 + }, + { + "epoch": 0.26112759643916916, + "grad_norm": 0.9660982024683389, + "learning_rate": 1.7326425901227676e-05, + "loss": 1.0122, + "step": 1540 + }, + { + "epoch": 0.2612971598134803, + "grad_norm": 1.1010065043813138, + "learning_rate": 1.7322686665977738e-05, + "loss": 0.9959, + "step": 1541 + }, + { + "epoch": 0.26146672318779146, + "grad_norm": 1.0780044340378458, + "learning_rate": 1.7318945221817255e-05, + "loss": 1.0017, + "step": 1542 + }, + { + "epoch": 0.2616362865621026, + "grad_norm": 0.9885453639924419, + "learning_rate": 1.731520156987485e-05, + "loss": 0.9975, + "step": 1543 + }, + { + "epoch": 0.26180584993641376, + "grad_norm": 1.0052077950583855, + "learning_rate": 1.7311455711279802e-05, + "loss": 0.9835, + "step": 1544 + }, + { + "epoch": 0.2619754133107249, + "grad_norm": 1.0300357419040997, + "learning_rate": 1.730770764716206e-05, + "loss": 1.015, + "step": 1545 + }, + { + "epoch": 0.26214497668503606, + "grad_norm": 1.1119620555831413, + "learning_rate": 1.7303957378652243e-05, + "loss": 1.0642, + "step": 1546 + }, + { + "epoch": 0.2623145400593472, + "grad_norm": 1.062119910039204, + "learning_rate": 1.7300204906881627e-05, + "loss": 1.0119, + "step": 1547 + }, + { + "epoch": 0.26248410343365836, + "grad_norm": 1.0116705173235143, + "learning_rate": 1.729645023298216e-05, + "loss": 1.0106, + "step": 1548 + }, + { + "epoch": 0.2626536668079695, + "grad_norm": 1.062110353745387, + "learning_rate": 1.7292693358086447e-05, + "loss": 1.0398, + "step": 1549 + }, + { + "epoch": 0.26282323018228065, + "grad_norm": 1.0356213192297952, + "learning_rate": 1.7288934283327763e-05, + "loss": 1.0612, + "step": 1550 + }, + { + "epoch": 0.2629927935565918, + "grad_norm": 0.9361254533742164, + "learning_rate": 1.728517300984004e-05, + "loss": 0.9991, + "step": 1551 + }, + { + "epoch": 0.26316235693090295, + "grad_norm": 0.9719160209427214, + "learning_rate": 1.7281409538757886e-05, + "loss": 0.9924, + "step": 1552 + }, + { + "epoch": 0.2633319203052141, + "grad_norm": 1.010106484642722, + "learning_rate": 1.7277643871216558e-05, + "loss": 1.0301, + "step": 1553 + }, + { + "epoch": 0.26350148367952525, + "grad_norm": 0.9845021914368712, + "learning_rate": 1.7273876008351977e-05, + "loss": 1.0531, + "step": 1554 + }, + { + "epoch": 0.26367104705383637, + "grad_norm": 0.9573328337978327, + "learning_rate": 1.727010595130074e-05, + "loss": 0.9875, + "step": 1555 + }, + { + "epoch": 0.2638406104281475, + "grad_norm": 1.1090604910368012, + "learning_rate": 1.7266333701200086e-05, + "loss": 1.037, + "step": 1556 + }, + { + "epoch": 0.26401017380245867, + "grad_norm": 0.9881677734299702, + "learning_rate": 1.7262559259187936e-05, + "loss": 1.0133, + "step": 1557 + }, + { + "epoch": 0.2641797371767698, + "grad_norm": 1.0336625780449076, + "learning_rate": 1.725878262640285e-05, + "loss": 1.0082, + "step": 1558 + }, + { + "epoch": 0.26434930055108097, + "grad_norm": 1.0039999420029906, + "learning_rate": 1.725500380398407e-05, + "loss": 1.0004, + "step": 1559 + }, + { + "epoch": 0.2645188639253921, + "grad_norm": 0.9908248485760892, + "learning_rate": 1.7251222793071485e-05, + "loss": 1.0167, + "step": 1560 + }, + { + "epoch": 0.26468842729970327, + "grad_norm": 1.1099042284300633, + "learning_rate": 1.724743959480565e-05, + "loss": 1.0378, + "step": 1561 + }, + { + "epoch": 0.2648579906740144, + "grad_norm": 0.6886615064716238, + "learning_rate": 1.724365421032778e-05, + "loss": 0.8823, + "step": 1562 + }, + { + "epoch": 0.26502755404832556, + "grad_norm": 0.9599530376455806, + "learning_rate": 1.7239866640779745e-05, + "loss": 1.043, + "step": 1563 + }, + { + "epoch": 0.2651971174226367, + "grad_norm": 0.9675491895367597, + "learning_rate": 1.7236076887304075e-05, + "loss": 1.0347, + "step": 1564 + }, + { + "epoch": 0.26536668079694786, + "grad_norm": 1.0018629844462017, + "learning_rate": 1.7232284951043962e-05, + "loss": 1.0015, + "step": 1565 + }, + { + "epoch": 0.265536244171259, + "grad_norm": 1.0258476398592737, + "learning_rate": 1.722849083314326e-05, + "loss": 1.0015, + "step": 1566 + }, + { + "epoch": 0.26570580754557016, + "grad_norm": 0.9790221675134905, + "learning_rate": 1.7224694534746467e-05, + "loss": 1.0003, + "step": 1567 + }, + { + "epoch": 0.2658753709198813, + "grad_norm": 0.9529917892672408, + "learning_rate": 1.7220896056998753e-05, + "loss": 1.0217, + "step": 1568 + }, + { + "epoch": 0.26604493429419246, + "grad_norm": 0.6736902867314191, + "learning_rate": 1.721709540104594e-05, + "loss": 0.8791, + "step": 1569 + }, + { + "epoch": 0.2662144976685036, + "grad_norm": 1.0025373415041368, + "learning_rate": 1.721329256803451e-05, + "loss": 1.0324, + "step": 1570 + }, + { + "epoch": 0.26638406104281476, + "grad_norm": 1.015766210954091, + "learning_rate": 1.7209487559111594e-05, + "loss": 1.0135, + "step": 1571 + }, + { + "epoch": 0.2665536244171259, + "grad_norm": 1.0229792170867233, + "learning_rate": 1.7205680375424988e-05, + "loss": 1.0212, + "step": 1572 + }, + { + "epoch": 0.26672318779143706, + "grad_norm": 0.999673508818176, + "learning_rate": 1.720187101812314e-05, + "loss": 0.9858, + "step": 1573 + }, + { + "epoch": 0.2668927511657482, + "grad_norm": 0.9760398731966328, + "learning_rate": 1.7198059488355153e-05, + "loss": 0.9979, + "step": 1574 + }, + { + "epoch": 0.26706231454005935, + "grad_norm": 0.9673388895471001, + "learning_rate": 1.7194245787270784e-05, + "loss": 1.0014, + "step": 1575 + }, + { + "epoch": 0.2672318779143705, + "grad_norm": 1.0351461126249129, + "learning_rate": 1.7190429916020454e-05, + "loss": 1.0154, + "step": 1576 + }, + { + "epoch": 0.26740144128868165, + "grad_norm": 0.993090040058639, + "learning_rate": 1.7186611875755227e-05, + "loss": 1.0405, + "step": 1577 + }, + { + "epoch": 0.2675710046629928, + "grad_norm": 1.0531473085392273, + "learning_rate": 1.718279166762683e-05, + "loss": 1.058, + "step": 1578 + }, + { + "epoch": 0.26774056803730395, + "grad_norm": 1.0208249971659966, + "learning_rate": 1.7178969292787632e-05, + "loss": 1.0288, + "step": 1579 + }, + { + "epoch": 0.26791013141161507, + "grad_norm": 0.9552184269072861, + "learning_rate": 1.7175144752390674e-05, + "loss": 0.9774, + "step": 1580 + }, + { + "epoch": 0.26807969478592625, + "grad_norm": 0.9627056098126374, + "learning_rate": 1.7171318047589637e-05, + "loss": 0.947, + "step": 1581 + }, + { + "epoch": 0.26824925816023737, + "grad_norm": 1.0270390026434155, + "learning_rate": 1.7167489179538856e-05, + "loss": 1.0256, + "step": 1582 + }, + { + "epoch": 0.26841882153454855, + "grad_norm": 1.0397651037964861, + "learning_rate": 1.7163658149393323e-05, + "loss": 1.0232, + "step": 1583 + }, + { + "epoch": 0.26858838490885967, + "grad_norm": 0.9789880341364536, + "learning_rate": 1.7159824958308675e-05, + "loss": 0.9756, + "step": 1584 + }, + { + "epoch": 0.26875794828317084, + "grad_norm": 0.990464094831154, + "learning_rate": 1.715598960744121e-05, + "loss": 1.0302, + "step": 1585 + }, + { + "epoch": 0.26892751165748197, + "grad_norm": 1.0768583125341327, + "learning_rate": 1.7152152097947875e-05, + "loss": 1.0339, + "step": 1586 + }, + { + "epoch": 0.26909707503179314, + "grad_norm": 1.0075401698259927, + "learning_rate": 1.7148312430986263e-05, + "loss": 1.0065, + "step": 1587 + }, + { + "epoch": 0.26926663840610426, + "grad_norm": 1.042959535433281, + "learning_rate": 1.7144470607714626e-05, + "loss": 1.034, + "step": 1588 + }, + { + "epoch": 0.26943620178041544, + "grad_norm": 1.096226383427998, + "learning_rate": 1.7140626629291853e-05, + "loss": 1.0441, + "step": 1589 + }, + { + "epoch": 0.26960576515472656, + "grad_norm": 0.9650588782785957, + "learning_rate": 1.7136780496877493e-05, + "loss": 1.0095, + "step": 1590 + }, + { + "epoch": 0.26977532852903774, + "grad_norm": 1.0183899041551177, + "learning_rate": 1.7132932211631752e-05, + "loss": 0.9646, + "step": 1591 + }, + { + "epoch": 0.26994489190334886, + "grad_norm": 1.0734058709413015, + "learning_rate": 1.712908177471547e-05, + "loss": 1.039, + "step": 1592 + }, + { + "epoch": 0.27011445527766004, + "grad_norm": 1.057644396769284, + "learning_rate": 1.712522918729014e-05, + "loss": 1.0577, + "step": 1593 + }, + { + "epoch": 0.27028401865197116, + "grad_norm": 1.099128413877244, + "learning_rate": 1.712137445051792e-05, + "loss": 1.0646, + "step": 1594 + }, + { + "epoch": 0.27045358202628234, + "grad_norm": 1.0784746532945182, + "learning_rate": 1.7117517565561588e-05, + "loss": 1.0099, + "step": 1595 + }, + { + "epoch": 0.27062314540059346, + "grad_norm": 1.0815064123577858, + "learning_rate": 1.7113658533584594e-05, + "loss": 1.0474, + "step": 1596 + }, + { + "epoch": 0.27079270877490463, + "grad_norm": 0.6253365999082476, + "learning_rate": 1.7109797355751017e-05, + "loss": 0.8361, + "step": 1597 + }, + { + "epoch": 0.27096227214921575, + "grad_norm": 1.0061432879175651, + "learning_rate": 1.7105934033225607e-05, + "loss": 0.9849, + "step": 1598 + }, + { + "epoch": 0.27113183552352693, + "grad_norm": 1.122395492825743, + "learning_rate": 1.710206856717374e-05, + "loss": 1.0141, + "step": 1599 + }, + { + "epoch": 0.27130139889783805, + "grad_norm": 1.0267725651305846, + "learning_rate": 1.7098200958761443e-05, + "loss": 1.0249, + "step": 1600 + }, + { + "epoch": 0.27147096227214923, + "grad_norm": 0.9379371238372052, + "learning_rate": 1.7094331209155394e-05, + "loss": 0.9797, + "step": 1601 + }, + { + "epoch": 0.27164052564646035, + "grad_norm": 0.9798306107867417, + "learning_rate": 1.709045931952291e-05, + "loss": 1.0222, + "step": 1602 + }, + { + "epoch": 0.27181008902077153, + "grad_norm": 1.0358573882610302, + "learning_rate": 1.7086585291031968e-05, + "loss": 1.0292, + "step": 1603 + }, + { + "epoch": 0.27197965239508265, + "grad_norm": 1.0150268120023536, + "learning_rate": 1.7082709124851172e-05, + "loss": 1.0061, + "step": 1604 + }, + { + "epoch": 0.2721492157693938, + "grad_norm": 0.9705604079713331, + "learning_rate": 1.7078830822149784e-05, + "loss": 1.0007, + "step": 1605 + }, + { + "epoch": 0.27231877914370495, + "grad_norm": 0.997528917361982, + "learning_rate": 1.7074950384097703e-05, + "loss": 1.0399, + "step": 1606 + }, + { + "epoch": 0.2724883425180161, + "grad_norm": 0.9940037799319106, + "learning_rate": 1.7071067811865477e-05, + "loss": 1.0462, + "step": 1607 + }, + { + "epoch": 0.27265790589232725, + "grad_norm": 1.106455166868433, + "learning_rate": 1.7067183106624292e-05, + "loss": 1.0554, + "step": 1608 + }, + { + "epoch": 0.2728274692666384, + "grad_norm": 0.9750123911830852, + "learning_rate": 1.7063296269545988e-05, + "loss": 1.0002, + "step": 1609 + }, + { + "epoch": 0.27299703264094954, + "grad_norm": 1.0070456281082218, + "learning_rate": 1.7059407301803034e-05, + "loss": 1.0031, + "step": 1610 + }, + { + "epoch": 0.2731665960152607, + "grad_norm": 1.033720862686279, + "learning_rate": 1.7055516204568553e-05, + "loss": 1.0209, + "step": 1611 + }, + { + "epoch": 0.27333615938957184, + "grad_norm": 1.062785311091101, + "learning_rate": 1.70516229790163e-05, + "loss": 1.0289, + "step": 1612 + }, + { + "epoch": 0.273505722763883, + "grad_norm": 0.9655456115861552, + "learning_rate": 1.7047727626320688e-05, + "loss": 1.0059, + "step": 1613 + }, + { + "epoch": 0.27367528613819414, + "grad_norm": 1.0117964009532867, + "learning_rate": 1.704383014765676e-05, + "loss": 0.996, + "step": 1614 + }, + { + "epoch": 0.2738448495125053, + "grad_norm": 1.0450251648537119, + "learning_rate": 1.7039930544200194e-05, + "loss": 0.9912, + "step": 1615 + }, + { + "epoch": 0.27401441288681644, + "grad_norm": 1.0308121873368181, + "learning_rate": 1.703602881712732e-05, + "loss": 0.9985, + "step": 1616 + }, + { + "epoch": 0.2741839762611276, + "grad_norm": 1.0206060253113676, + "learning_rate": 1.7032124967615112e-05, + "loss": 1.0175, + "step": 1617 + }, + { + "epoch": 0.27435353963543874, + "grad_norm": 1.0337479581298368, + "learning_rate": 1.7028218996841173e-05, + "loss": 1.0275, + "step": 1618 + }, + { + "epoch": 0.2745231030097499, + "grad_norm": 1.0220572273887358, + "learning_rate": 1.7024310905983753e-05, + "loss": 1.0237, + "step": 1619 + }, + { + "epoch": 0.27469266638406103, + "grad_norm": 0.9903156721353659, + "learning_rate": 1.7020400696221737e-05, + "loss": 0.9867, + "step": 1620 + }, + { + "epoch": 0.2748622297583722, + "grad_norm": 1.0309457605974588, + "learning_rate": 1.7016488368734654e-05, + "loss": 1.0694, + "step": 1621 + }, + { + "epoch": 0.27503179313268333, + "grad_norm": 0.975180348499719, + "learning_rate": 1.701257392470267e-05, + "loss": 1.0281, + "step": 1622 + }, + { + "epoch": 0.2752013565069945, + "grad_norm": 1.0285842441451898, + "learning_rate": 1.700865736530658e-05, + "loss": 1.0802, + "step": 1623 + }, + { + "epoch": 0.27537091988130563, + "grad_norm": 0.9582058969760604, + "learning_rate": 1.700473869172784e-05, + "loss": 1.012, + "step": 1624 + }, + { + "epoch": 0.2755404832556168, + "grad_norm": 0.9926281593540695, + "learning_rate": 1.7000817905148523e-05, + "loss": 0.9814, + "step": 1625 + }, + { + "epoch": 0.27571004662992793, + "grad_norm": 1.050820290516294, + "learning_rate": 1.699689500675134e-05, + "loss": 0.9965, + "step": 1626 + }, + { + "epoch": 0.2758796100042391, + "grad_norm": 1.0412493165414156, + "learning_rate": 1.6992969997719658e-05, + "loss": 0.9839, + "step": 1627 + }, + { + "epoch": 0.2760491733785502, + "grad_norm": 1.0028225922310317, + "learning_rate": 1.698904287923746e-05, + "loss": 1.0033, + "step": 1628 + }, + { + "epoch": 0.2762187367528614, + "grad_norm": 0.9626776510354971, + "learning_rate": 1.6985113652489374e-05, + "loss": 0.9842, + "step": 1629 + }, + { + "epoch": 0.2763883001271725, + "grad_norm": 1.0083281465246772, + "learning_rate": 1.698118231866066e-05, + "loss": 1.0301, + "step": 1630 + }, + { + "epoch": 0.2765578635014837, + "grad_norm": 0.9948967001337197, + "learning_rate": 1.697724887893722e-05, + "loss": 0.9978, + "step": 1631 + }, + { + "epoch": 0.2767274268757948, + "grad_norm": 1.038288911389693, + "learning_rate": 1.697331333450559e-05, + "loss": 1.0137, + "step": 1632 + }, + { + "epoch": 0.276896990250106, + "grad_norm": 0.9387921431254617, + "learning_rate": 1.696937568655294e-05, + "loss": 1.0008, + "step": 1633 + }, + { + "epoch": 0.2770665536244171, + "grad_norm": 0.9927125216040253, + "learning_rate": 1.6965435936267063e-05, + "loss": 1.0014, + "step": 1634 + }, + { + "epoch": 0.2772361169987283, + "grad_norm": 0.9807282902798052, + "learning_rate": 1.6961494084836405e-05, + "loss": 1.0125, + "step": 1635 + }, + { + "epoch": 0.2774056803730394, + "grad_norm": 0.9869988153860896, + "learning_rate": 1.695755013345004e-05, + "loss": 1.0479, + "step": 1636 + }, + { + "epoch": 0.2775752437473506, + "grad_norm": 0.9586014093732598, + "learning_rate": 1.6953604083297665e-05, + "loss": 1.0216, + "step": 1637 + }, + { + "epoch": 0.2777448071216617, + "grad_norm": 0.9986941688715908, + "learning_rate": 1.6949655935569627e-05, + "loss": 1.0192, + "step": 1638 + }, + { + "epoch": 0.2779143704959729, + "grad_norm": 0.9921378683440467, + "learning_rate": 1.6945705691456888e-05, + "loss": 1.0166, + "step": 1639 + }, + { + "epoch": 0.278083933870284, + "grad_norm": 0.9985160017656568, + "learning_rate": 1.6941753352151057e-05, + "loss": 1.0269, + "step": 1640 + }, + { + "epoch": 0.2782534972445952, + "grad_norm": 0.9993921652217467, + "learning_rate": 1.6937798918844363e-05, + "loss": 1.0241, + "step": 1641 + }, + { + "epoch": 0.2784230606189063, + "grad_norm": 1.0423824485925233, + "learning_rate": 1.6933842392729677e-05, + "loss": 1.0285, + "step": 1642 + }, + { + "epoch": 0.2785926239932175, + "grad_norm": 1.017623427485035, + "learning_rate": 1.69298837750005e-05, + "loss": 0.9646, + "step": 1643 + }, + { + "epoch": 0.2787621873675286, + "grad_norm": 0.9797211102548051, + "learning_rate": 1.6925923066850957e-05, + "loss": 0.9992, + "step": 1644 + }, + { + "epoch": 0.2789317507418398, + "grad_norm": 1.0497242337533619, + "learning_rate": 1.6921960269475806e-05, + "loss": 0.9898, + "step": 1645 + }, + { + "epoch": 0.2791013141161509, + "grad_norm": 0.9845819080920777, + "learning_rate": 1.691799538407044e-05, + "loss": 1.0077, + "step": 1646 + }, + { + "epoch": 0.27927087749046203, + "grad_norm": 1.0087890382371902, + "learning_rate": 1.691402841183088e-05, + "loss": 0.9614, + "step": 1647 + }, + { + "epoch": 0.2794404408647732, + "grad_norm": 1.0018423463699977, + "learning_rate": 1.6910059353953765e-05, + "loss": 0.9937, + "step": 1648 + }, + { + "epoch": 0.27961000423908433, + "grad_norm": 0.9927419196455894, + "learning_rate": 1.6906088211636387e-05, + "loss": 1.0281, + "step": 1649 + }, + { + "epoch": 0.2797795676133955, + "grad_norm": 0.9961599419086233, + "learning_rate": 1.6902114986076645e-05, + "loss": 1.0207, + "step": 1650 + }, + { + "epoch": 0.27994913098770663, + "grad_norm": 0.9711712043805547, + "learning_rate": 1.689813967847308e-05, + "loss": 1.0257, + "step": 1651 + }, + { + "epoch": 0.2801186943620178, + "grad_norm": 1.01609985737106, + "learning_rate": 1.6894162290024848e-05, + "loss": 1.0497, + "step": 1652 + }, + { + "epoch": 0.2802882577363289, + "grad_norm": 0.9975132655584638, + "learning_rate": 1.6890182821931746e-05, + "loss": 0.979, + "step": 1653 + }, + { + "epoch": 0.2804578211106401, + "grad_norm": 1.0228467490700406, + "learning_rate": 1.6886201275394193e-05, + "loss": 1.0032, + "step": 1654 + }, + { + "epoch": 0.2806273844849512, + "grad_norm": 1.0191542366571857, + "learning_rate": 1.688221765161323e-05, + "loss": 1.036, + "step": 1655 + }, + { + "epoch": 0.2807969478592624, + "grad_norm": 0.9612783895285227, + "learning_rate": 1.687823195179053e-05, + "loss": 0.9963, + "step": 1656 + }, + { + "epoch": 0.2809665112335735, + "grad_norm": 1.1322316892118682, + "learning_rate": 1.6874244177128395e-05, + "loss": 0.971, + "step": 1657 + }, + { + "epoch": 0.2811360746078847, + "grad_norm": 1.0227971882768998, + "learning_rate": 1.6870254328829748e-05, + "loss": 1.0435, + "step": 1658 + }, + { + "epoch": 0.2813056379821958, + "grad_norm": 1.0452956120131587, + "learning_rate": 1.6866262408098134e-05, + "loss": 1.0181, + "step": 1659 + }, + { + "epoch": 0.281475201356507, + "grad_norm": 1.0107572660503896, + "learning_rate": 1.6862268416137738e-05, + "loss": 0.9973, + "step": 1660 + }, + { + "epoch": 0.2816447647308181, + "grad_norm": 1.010141714195555, + "learning_rate": 1.685827235415335e-05, + "loss": 1.0111, + "step": 1661 + }, + { + "epoch": 0.2818143281051293, + "grad_norm": 1.0538601111497838, + "learning_rate": 1.68542742233504e-05, + "loss": 0.9921, + "step": 1662 + }, + { + "epoch": 0.2819838914794404, + "grad_norm": 1.046633623577513, + "learning_rate": 1.685027402493493e-05, + "loss": 1.0528, + "step": 1663 + }, + { + "epoch": 0.2821534548537516, + "grad_norm": 1.0606159042913585, + "learning_rate": 1.684627176011362e-05, + "loss": 1.0116, + "step": 1664 + }, + { + "epoch": 0.2823230182280627, + "grad_norm": 1.0100434793375952, + "learning_rate": 1.6842267430093762e-05, + "loss": 1.0165, + "step": 1665 + }, + { + "epoch": 0.2824925816023739, + "grad_norm": 1.0438581417039956, + "learning_rate": 1.683826103608327e-05, + "loss": 1.0372, + "step": 1666 + }, + { + "epoch": 0.282662144976685, + "grad_norm": 0.9879939539761703, + "learning_rate": 1.6834252579290692e-05, + "loss": 0.9725, + "step": 1667 + }, + { + "epoch": 0.2828317083509962, + "grad_norm": 1.0270057895685785, + "learning_rate": 1.6830242060925184e-05, + "loss": 1.0373, + "step": 1668 + }, + { + "epoch": 0.2830012717253073, + "grad_norm": 1.040186779060928, + "learning_rate": 1.6826229482196535e-05, + "loss": 1.0437, + "step": 1669 + }, + { + "epoch": 0.2831708350996185, + "grad_norm": 0.963710584533114, + "learning_rate": 1.6822214844315152e-05, + "loss": 0.985, + "step": 1670 + }, + { + "epoch": 0.2833403984739296, + "grad_norm": 0.6746281060544586, + "learning_rate": 1.681819814849206e-05, + "loss": 0.8753, + "step": 1671 + }, + { + "epoch": 0.2835099618482408, + "grad_norm": 0.9804017133537226, + "learning_rate": 1.6814179395938915e-05, + "loss": 1.0046, + "step": 1672 + }, + { + "epoch": 0.2836795252225519, + "grad_norm": 1.1161363116122796, + "learning_rate": 1.6810158587867973e-05, + "loss": 1.0102, + "step": 1673 + }, + { + "epoch": 0.2838490885968631, + "grad_norm": 1.014553654632016, + "learning_rate": 1.6806135725492133e-05, + "loss": 1.0097, + "step": 1674 + }, + { + "epoch": 0.2840186519711742, + "grad_norm": 1.015140415745618, + "learning_rate": 1.68021108100249e-05, + "loss": 0.997, + "step": 1675 + }, + { + "epoch": 0.2841882153454854, + "grad_norm": 0.9803691346395613, + "learning_rate": 1.6798083842680402e-05, + "loss": 1.0058, + "step": 1676 + }, + { + "epoch": 0.2843577787197965, + "grad_norm": 0.9915608851870706, + "learning_rate": 1.679405482467338e-05, + "loss": 1.0488, + "step": 1677 + }, + { + "epoch": 0.2845273420941077, + "grad_norm": 0.9474478180574198, + "learning_rate": 1.6790023757219215e-05, + "loss": 1.0092, + "step": 1678 + }, + { + "epoch": 0.2846969054684188, + "grad_norm": 1.0363667115985078, + "learning_rate": 1.6785990641533878e-05, + "loss": 1.0033, + "step": 1679 + }, + { + "epoch": 0.28486646884273, + "grad_norm": 1.033268026108578, + "learning_rate": 1.6781955478833973e-05, + "loss": 1.0009, + "step": 1680 + }, + { + "epoch": 0.2850360322170411, + "grad_norm": 1.032629367630231, + "learning_rate": 1.6777918270336718e-05, + "loss": 1.0175, + "step": 1681 + }, + { + "epoch": 0.2852055955913523, + "grad_norm": 0.9806034247156681, + "learning_rate": 1.6773879017259954e-05, + "loss": 1.0341, + "step": 1682 + }, + { + "epoch": 0.2853751589656634, + "grad_norm": 1.0507339071659962, + "learning_rate": 1.676983772082213e-05, + "loss": 1.0164, + "step": 1683 + }, + { + "epoch": 0.2855447223399746, + "grad_norm": 1.0024353354442337, + "learning_rate": 1.6765794382242315e-05, + "loss": 1.0179, + "step": 1684 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 1.068319577710856, + "learning_rate": 1.6761749002740195e-05, + "loss": 1.0089, + "step": 1685 + }, + { + "epoch": 0.2858838490885969, + "grad_norm": 0.9878217601950019, + "learning_rate": 1.675770158353607e-05, + "loss": 1.0213, + "step": 1686 + }, + { + "epoch": 0.286053412462908, + "grad_norm": 1.0041279074298652, + "learning_rate": 1.6753652125850865e-05, + "loss": 1.0227, + "step": 1687 + }, + { + "epoch": 0.2862229758372192, + "grad_norm": 0.9762471508102554, + "learning_rate": 1.6749600630906097e-05, + "loss": 0.9977, + "step": 1688 + }, + { + "epoch": 0.2863925392115303, + "grad_norm": 1.0002991927519336, + "learning_rate": 1.6745547099923917e-05, + "loss": 0.9828, + "step": 1689 + }, + { + "epoch": 0.28656210258584147, + "grad_norm": 0.9748061224336259, + "learning_rate": 1.674149153412709e-05, + "loss": 1.0047, + "step": 1690 + }, + { + "epoch": 0.2867316659601526, + "grad_norm": 0.9939597921609324, + "learning_rate": 1.6737433934738984e-05, + "loss": 1.0346, + "step": 1691 + }, + { + "epoch": 0.28690122933446377, + "grad_norm": 0.9342447749963945, + "learning_rate": 1.673337430298359e-05, + "loss": 1.0097, + "step": 1692 + }, + { + "epoch": 0.2870707927087749, + "grad_norm": 0.9817004159920719, + "learning_rate": 1.6729312640085504e-05, + "loss": 0.9667, + "step": 1693 + }, + { + "epoch": 0.28724035608308607, + "grad_norm": 1.0278543887202118, + "learning_rate": 1.6725248947269944e-05, + "loss": 1.0388, + "step": 1694 + }, + { + "epoch": 0.2874099194573972, + "grad_norm": 0.7823900191193831, + "learning_rate": 1.6721183225762726e-05, + "loss": 0.8986, + "step": 1695 + }, + { + "epoch": 0.28757948283170837, + "grad_norm": 0.9761424079172065, + "learning_rate": 1.67171154767903e-05, + "loss": 0.9814, + "step": 1696 + }, + { + "epoch": 0.2877490462060195, + "grad_norm": 0.9531139091992797, + "learning_rate": 1.6713045701579705e-05, + "loss": 0.9702, + "step": 1697 + }, + { + "epoch": 0.28791860958033066, + "grad_norm": 1.0339778199641958, + "learning_rate": 1.6708973901358603e-05, + "loss": 1.0196, + "step": 1698 + }, + { + "epoch": 0.2880881729546418, + "grad_norm": 1.0510497928905038, + "learning_rate": 1.6704900077355267e-05, + "loss": 1.0233, + "step": 1699 + }, + { + "epoch": 0.28825773632895296, + "grad_norm": 1.0071444924742048, + "learning_rate": 1.670082423079858e-05, + "loss": 0.9955, + "step": 1700 + }, + { + "epoch": 0.2884272997032641, + "grad_norm": 1.0091169399255349, + "learning_rate": 1.6696746362918027e-05, + "loss": 1.0338, + "step": 1701 + }, + { + "epoch": 0.28859686307757526, + "grad_norm": 1.0595147055921148, + "learning_rate": 1.6692666474943714e-05, + "loss": 1.036, + "step": 1702 + }, + { + "epoch": 0.2887664264518864, + "grad_norm": 0.9912060867006545, + "learning_rate": 1.668858456810635e-05, + "loss": 0.9894, + "step": 1703 + }, + { + "epoch": 0.28893598982619756, + "grad_norm": 0.9580534910764097, + "learning_rate": 1.6684500643637256e-05, + "loss": 1.0321, + "step": 1704 + }, + { + "epoch": 0.2891055532005087, + "grad_norm": 0.9357886309433708, + "learning_rate": 1.6680414702768358e-05, + "loss": 0.9652, + "step": 1705 + }, + { + "epoch": 0.28927511657481986, + "grad_norm": 0.9636332083145411, + "learning_rate": 1.6676326746732197e-05, + "loss": 0.9909, + "step": 1706 + }, + { + "epoch": 0.289444679949131, + "grad_norm": 1.0023999344361916, + "learning_rate": 1.6672236776761906e-05, + "loss": 0.9862, + "step": 1707 + }, + { + "epoch": 0.28961424332344216, + "grad_norm": 0.976587780893446, + "learning_rate": 1.6668144794091254e-05, + "loss": 0.9914, + "step": 1708 + }, + { + "epoch": 0.2897838066977533, + "grad_norm": 1.034296314547433, + "learning_rate": 1.6664050799954587e-05, + "loss": 0.998, + "step": 1709 + }, + { + "epoch": 0.28995337007206445, + "grad_norm": 0.9854131890683052, + "learning_rate": 1.665995479558687e-05, + "loss": 1.0331, + "step": 1710 + }, + { + "epoch": 0.2901229334463756, + "grad_norm": 0.9608910285703723, + "learning_rate": 1.6655856782223682e-05, + "loss": 0.9443, + "step": 1711 + }, + { + "epoch": 0.29029249682068675, + "grad_norm": 0.9797247146404183, + "learning_rate": 1.6651756761101202e-05, + "loss": 1.0347, + "step": 1712 + }, + { + "epoch": 0.2904620601949979, + "grad_norm": 0.946135130562679, + "learning_rate": 1.66476547334562e-05, + "loss": 0.9851, + "step": 1713 + }, + { + "epoch": 0.29063162356930905, + "grad_norm": 0.9998182905903034, + "learning_rate": 1.6643550700526084e-05, + "loss": 0.9933, + "step": 1714 + }, + { + "epoch": 0.29080118694362017, + "grad_norm": 0.9992474680091024, + "learning_rate": 1.663944466354884e-05, + "loss": 1.0217, + "step": 1715 + }, + { + "epoch": 0.29097075031793135, + "grad_norm": 0.9892955976424613, + "learning_rate": 1.663533662376306e-05, + "loss": 0.9865, + "step": 1716 + }, + { + "epoch": 0.29114031369224247, + "grad_norm": 1.0416426195644528, + "learning_rate": 1.6631226582407954e-05, + "loss": 1.0465, + "step": 1717 + }, + { + "epoch": 0.29130987706655365, + "grad_norm": 0.9639488178238013, + "learning_rate": 1.6627114540723327e-05, + "loss": 1.0104, + "step": 1718 + }, + { + "epoch": 0.29147944044086477, + "grad_norm": 0.9303799913380203, + "learning_rate": 1.6623000499949586e-05, + "loss": 0.9999, + "step": 1719 + }, + { + "epoch": 0.29164900381517594, + "grad_norm": 0.993267452886458, + "learning_rate": 1.6618884461327747e-05, + "loss": 1.005, + "step": 1720 + }, + { + "epoch": 0.29181856718948707, + "grad_norm": 0.9793605748736506, + "learning_rate": 1.661476642609943e-05, + "loss": 0.9992, + "step": 1721 + }, + { + "epoch": 0.29198813056379824, + "grad_norm": 1.0006406629500666, + "learning_rate": 1.661064639550684e-05, + "loss": 1.0462, + "step": 1722 + }, + { + "epoch": 0.29215769393810936, + "grad_norm": 0.9858334384178165, + "learning_rate": 1.6606524370792806e-05, + "loss": 1.0186, + "step": 1723 + }, + { + "epoch": 0.29232725731242054, + "grad_norm": 0.7210513604013623, + "learning_rate": 1.660240035320075e-05, + "loss": 0.8266, + "step": 1724 + }, + { + "epoch": 0.29249682068673166, + "grad_norm": 1.0010519453769744, + "learning_rate": 1.6598274343974688e-05, + "loss": 1.0034, + "step": 1725 + }, + { + "epoch": 0.29266638406104284, + "grad_norm": 0.973236506156456, + "learning_rate": 1.659414634435925e-05, + "loss": 1.0245, + "step": 1726 + }, + { + "epoch": 0.29283594743535396, + "grad_norm": 1.0548680249557763, + "learning_rate": 1.6590016355599653e-05, + "loss": 1.0698, + "step": 1727 + }, + { + "epoch": 0.29300551080966514, + "grad_norm": 0.6713346677268494, + "learning_rate": 1.6585884378941727e-05, + "loss": 0.8334, + "step": 1728 + }, + { + "epoch": 0.29317507418397626, + "grad_norm": 1.022166571418792, + "learning_rate": 1.658175041563189e-05, + "loss": 1.0397, + "step": 1729 + }, + { + "epoch": 0.29334463755828744, + "grad_norm": 1.016097893708613, + "learning_rate": 1.6577614466917168e-05, + "loss": 0.9971, + "step": 1730 + }, + { + "epoch": 0.29351420093259856, + "grad_norm": 1.0536720803219268, + "learning_rate": 1.657347653404518e-05, + "loss": 0.9915, + "step": 1731 + }, + { + "epoch": 0.29368376430690973, + "grad_norm": 1.0061859178475074, + "learning_rate": 1.6569336618264143e-05, + "loss": 1.0175, + "step": 1732 + }, + { + "epoch": 0.29385332768122086, + "grad_norm": 0.9673002936320002, + "learning_rate": 1.6565194720822885e-05, + "loss": 0.9918, + "step": 1733 + }, + { + "epoch": 0.29402289105553203, + "grad_norm": 1.0616234863800256, + "learning_rate": 1.6561050842970817e-05, + "loss": 1.0134, + "step": 1734 + }, + { + "epoch": 0.29419245442984315, + "grad_norm": 1.0319285346864768, + "learning_rate": 1.6556904985957946e-05, + "loss": 1.0286, + "step": 1735 + }, + { + "epoch": 0.29436201780415433, + "grad_norm": 1.0091347917174902, + "learning_rate": 1.655275715103489e-05, + "loss": 1.0277, + "step": 1736 + }, + { + "epoch": 0.29453158117846545, + "grad_norm": 1.0322359324464947, + "learning_rate": 1.6548607339452853e-05, + "loss": 1.0317, + "step": 1737 + }, + { + "epoch": 0.2947011445527766, + "grad_norm": 0.9832355505979721, + "learning_rate": 1.6544455552463637e-05, + "loss": 0.9666, + "step": 1738 + }, + { + "epoch": 0.29487070792708775, + "grad_norm": 1.0348463908522592, + "learning_rate": 1.6540301791319647e-05, + "loss": 1.0522, + "step": 1739 + }, + { + "epoch": 0.29504027130139887, + "grad_norm": 1.0509929650105039, + "learning_rate": 1.653614605727387e-05, + "loss": 1.0634, + "step": 1740 + }, + { + "epoch": 0.29520983467571005, + "grad_norm": 0.9715754676744554, + "learning_rate": 1.6531988351579897e-05, + "loss": 1.0131, + "step": 1741 + }, + { + "epoch": 0.29537939805002117, + "grad_norm": 1.0338172489487074, + "learning_rate": 1.6527828675491917e-05, + "loss": 0.9918, + "step": 1742 + }, + { + "epoch": 0.29554896142433235, + "grad_norm": 0.9935939626541185, + "learning_rate": 1.6523667030264706e-05, + "loss": 1.021, + "step": 1743 + }, + { + "epoch": 0.29571852479864347, + "grad_norm": 1.0204378829862049, + "learning_rate": 1.6519503417153638e-05, + "loss": 0.9787, + "step": 1744 + }, + { + "epoch": 0.29588808817295464, + "grad_norm": 1.0164353888366213, + "learning_rate": 1.6515337837414677e-05, + "loss": 1.0167, + "step": 1745 + }, + { + "epoch": 0.29605765154726577, + "grad_norm": 0.9724056723108322, + "learning_rate": 1.6511170292304385e-05, + "loss": 0.977, + "step": 1746 + }, + { + "epoch": 0.29622721492157694, + "grad_norm": 1.0048508098530742, + "learning_rate": 1.6507000783079913e-05, + "loss": 1.0219, + "step": 1747 + }, + { + "epoch": 0.29639677829588806, + "grad_norm": 1.060458643969409, + "learning_rate": 1.6502829310999012e-05, + "loss": 1.0183, + "step": 1748 + }, + { + "epoch": 0.29656634167019924, + "grad_norm": 1.0635806815416127, + "learning_rate": 1.6498655877320008e-05, + "loss": 1.0148, + "step": 1749 + }, + { + "epoch": 0.29673590504451036, + "grad_norm": 0.9951429541588573, + "learning_rate": 1.6494480483301836e-05, + "loss": 1.0172, + "step": 1750 + }, + { + "epoch": 0.29690546841882154, + "grad_norm": 1.0047707907680026, + "learning_rate": 1.649030313020402e-05, + "loss": 1.0415, + "step": 1751 + }, + { + "epoch": 0.29707503179313266, + "grad_norm": 1.0080379685322103, + "learning_rate": 1.6486123819286666e-05, + "loss": 1.002, + "step": 1752 + }, + { + "epoch": 0.29724459516744384, + "grad_norm": 1.0224586200512065, + "learning_rate": 1.6481942551810476e-05, + "loss": 1.0342, + "step": 1753 + }, + { + "epoch": 0.29741415854175496, + "grad_norm": 1.0444815240757956, + "learning_rate": 1.6477759329036743e-05, + "loss": 1.0226, + "step": 1754 + }, + { + "epoch": 0.29758372191606614, + "grad_norm": 0.94617496593671, + "learning_rate": 1.647357415222735e-05, + "loss": 0.9842, + "step": 1755 + }, + { + "epoch": 0.29775328529037726, + "grad_norm": 0.9646250591216861, + "learning_rate": 1.6469387022644768e-05, + "loss": 0.985, + "step": 1756 + }, + { + "epoch": 0.29792284866468843, + "grad_norm": 1.0025500245054189, + "learning_rate": 1.6465197941552054e-05, + "loss": 0.9895, + "step": 1757 + }, + { + "epoch": 0.29809241203899955, + "grad_norm": 0.9637417682856418, + "learning_rate": 1.646100691021286e-05, + "loss": 0.9869, + "step": 1758 + }, + { + "epoch": 0.29826197541331073, + "grad_norm": 0.7686090701175982, + "learning_rate": 1.6456813929891425e-05, + "loss": 0.9297, + "step": 1759 + }, + { + "epoch": 0.29843153878762185, + "grad_norm": 1.025200665912597, + "learning_rate": 1.6452619001852567e-05, + "loss": 1.0113, + "step": 1760 + }, + { + "epoch": 0.29860110216193303, + "grad_norm": 1.0006621563219802, + "learning_rate": 1.6448422127361707e-05, + "loss": 0.9735, + "step": 1761 + }, + { + "epoch": 0.29877066553624415, + "grad_norm": 0.9468374829701318, + "learning_rate": 1.6444223307684844e-05, + "loss": 0.9841, + "step": 1762 + }, + { + "epoch": 0.29894022891055533, + "grad_norm": 0.9577090668403472, + "learning_rate": 1.6440022544088553e-05, + "loss": 0.9567, + "step": 1763 + }, + { + "epoch": 0.29910979228486645, + "grad_norm": 0.9735054188231101, + "learning_rate": 1.6435819837840026e-05, + "loss": 0.9724, + "step": 1764 + }, + { + "epoch": 0.2992793556591776, + "grad_norm": 0.9656507668924623, + "learning_rate": 1.6431615190207003e-05, + "loss": 1.0553, + "step": 1765 + }, + { + "epoch": 0.29944891903348875, + "grad_norm": 0.9597257964315393, + "learning_rate": 1.6427408602457845e-05, + "loss": 0.9624, + "step": 1766 + }, + { + "epoch": 0.2996184824077999, + "grad_norm": 0.9921723334689856, + "learning_rate": 1.6423200075861472e-05, + "loss": 1.016, + "step": 1767 + }, + { + "epoch": 0.29978804578211105, + "grad_norm": 1.0044958475699377, + "learning_rate": 1.64189896116874e-05, + "loss": 1.004, + "step": 1768 + }, + { + "epoch": 0.2999576091564222, + "grad_norm": 0.9823954300465818, + "learning_rate": 1.641477721120573e-05, + "loss": 0.9965, + "step": 1769 + }, + { + "epoch": 0.30012717253073334, + "grad_norm": 1.0092359267461424, + "learning_rate": 1.6410562875687145e-05, + "loss": 1.0, + "step": 1770 + }, + { + "epoch": 0.3002967359050445, + "grad_norm": 0.9885825830035312, + "learning_rate": 1.6406346606402913e-05, + "loss": 1.0266, + "step": 1771 + }, + { + "epoch": 0.30046629927935564, + "grad_norm": 0.9804222980130869, + "learning_rate": 1.640212840462488e-05, + "loss": 1.0018, + "step": 1772 + }, + { + "epoch": 0.3006358626536668, + "grad_norm": 1.007118270528555, + "learning_rate": 1.6397908271625488e-05, + "loss": 1.0034, + "step": 1773 + }, + { + "epoch": 0.30080542602797794, + "grad_norm": 0.9492726400708721, + "learning_rate": 1.6393686208677744e-05, + "loss": 1.0101, + "step": 1774 + }, + { + "epoch": 0.3009749894022891, + "grad_norm": 0.9963323404652323, + "learning_rate": 1.638946221705525e-05, + "loss": 1.0242, + "step": 1775 + }, + { + "epoch": 0.30114455277660024, + "grad_norm": 0.985676913192754, + "learning_rate": 1.6385236298032183e-05, + "loss": 0.9739, + "step": 1776 + }, + { + "epoch": 0.3013141161509114, + "grad_norm": 1.0043484053972493, + "learning_rate": 1.638100845288331e-05, + "loss": 1.0246, + "step": 1777 + }, + { + "epoch": 0.30148367952522254, + "grad_norm": 0.9916119223229497, + "learning_rate": 1.6376778682883968e-05, + "loss": 0.9945, + "step": 1778 + }, + { + "epoch": 0.3016532428995337, + "grad_norm": 0.984423027045273, + "learning_rate": 1.6372546989310083e-05, + "loss": 1.0455, + "step": 1779 + }, + { + "epoch": 0.30182280627384483, + "grad_norm": 1.044141024273506, + "learning_rate": 1.6368313373438157e-05, + "loss": 1.0018, + "step": 1780 + }, + { + "epoch": 0.301992369648156, + "grad_norm": 0.9919904361515511, + "learning_rate": 1.636407783654527e-05, + "loss": 1.0132, + "step": 1781 + }, + { + "epoch": 0.30216193302246713, + "grad_norm": 0.9889974008922935, + "learning_rate": 1.6359840379909088e-05, + "loss": 1.0367, + "step": 1782 + }, + { + "epoch": 0.3023314963967783, + "grad_norm": 1.025382901857874, + "learning_rate": 1.6355601004807856e-05, + "loss": 1.0191, + "step": 1783 + }, + { + "epoch": 0.30250105977108943, + "grad_norm": 1.0163750107875882, + "learning_rate": 1.6351359712520383e-05, + "loss": 1.0166, + "step": 1784 + }, + { + "epoch": 0.3026706231454006, + "grad_norm": 1.0280941856356662, + "learning_rate": 1.6347116504326082e-05, + "loss": 1.0392, + "step": 1785 + }, + { + "epoch": 0.30284018651971173, + "grad_norm": 0.9960837363552101, + "learning_rate": 1.6342871381504916e-05, + "loss": 0.9881, + "step": 1786 + }, + { + "epoch": 0.3030097498940229, + "grad_norm": 0.9741358282430439, + "learning_rate": 1.6338624345337452e-05, + "loss": 0.9967, + "step": 1787 + }, + { + "epoch": 0.303179313268334, + "grad_norm": 0.9927672707720894, + "learning_rate": 1.6334375397104813e-05, + "loss": 1.018, + "step": 1788 + }, + { + "epoch": 0.3033488766426452, + "grad_norm": 0.98130278736917, + "learning_rate": 1.6330124538088705e-05, + "loss": 1.0085, + "step": 1789 + }, + { + "epoch": 0.3035184400169563, + "grad_norm": 1.0359025823832217, + "learning_rate": 1.632587176957142e-05, + "loss": 1.0152, + "step": 1790 + }, + { + "epoch": 0.3036880033912675, + "grad_norm": 0.7373946183970215, + "learning_rate": 1.6321617092835813e-05, + "loss": 0.8841, + "step": 1791 + }, + { + "epoch": 0.3038575667655786, + "grad_norm": 0.963318794273647, + "learning_rate": 1.631736050916532e-05, + "loss": 0.9714, + "step": 1792 + }, + { + "epoch": 0.3040271301398898, + "grad_norm": 1.0349902500299886, + "learning_rate": 1.631310201984396e-05, + "loss": 1.0408, + "step": 1793 + }, + { + "epoch": 0.3041966935142009, + "grad_norm": 1.0126851909912866, + "learning_rate": 1.630884162615631e-05, + "loss": 1.0425, + "step": 1794 + }, + { + "epoch": 0.3043662568885121, + "grad_norm": 0.9493235675489007, + "learning_rate": 1.6304579329387534e-05, + "loss": 0.9908, + "step": 1795 + }, + { + "epoch": 0.3045358202628232, + "grad_norm": 0.9899299579017352, + "learning_rate": 1.6300315130823366e-05, + "loss": 0.9958, + "step": 1796 + }, + { + "epoch": 0.3047053836371344, + "grad_norm": 1.0570092320556699, + "learning_rate": 1.6296049031750114e-05, + "loss": 1.0337, + "step": 1797 + }, + { + "epoch": 0.3048749470114455, + "grad_norm": 0.9275633015379959, + "learning_rate": 1.6291781033454664e-05, + "loss": 0.9443, + "step": 1798 + }, + { + "epoch": 0.3050445103857567, + "grad_norm": 0.9731472518937835, + "learning_rate": 1.6287511137224467e-05, + "loss": 0.9875, + "step": 1799 + }, + { + "epoch": 0.3052140737600678, + "grad_norm": 0.9797427813970988, + "learning_rate": 1.6283239344347547e-05, + "loss": 1.0384, + "step": 1800 + }, + { + "epoch": 0.305383637134379, + "grad_norm": 0.9555816393377661, + "learning_rate": 1.627896565611251e-05, + "loss": 1.0155, + "step": 1801 + }, + { + "epoch": 0.3055532005086901, + "grad_norm": 0.9860341821838224, + "learning_rate": 1.627469007380852e-05, + "loss": 0.9974, + "step": 1802 + }, + { + "epoch": 0.3057227638830013, + "grad_norm": 0.9590834289172773, + "learning_rate": 1.6270412598725326e-05, + "loss": 1.0198, + "step": 1803 + }, + { + "epoch": 0.3058923272573124, + "grad_norm": 1.0050559504718641, + "learning_rate": 1.626613323215324e-05, + "loss": 1.0407, + "step": 1804 + }, + { + "epoch": 0.3060618906316236, + "grad_norm": 0.9423169298203447, + "learning_rate": 1.626185197538314e-05, + "loss": 0.9977, + "step": 1805 + }, + { + "epoch": 0.3062314540059347, + "grad_norm": 0.9864377869764335, + "learning_rate": 1.6257568829706483e-05, + "loss": 1.0391, + "step": 1806 + }, + { + "epoch": 0.3064010173802459, + "grad_norm": 0.9424617691749402, + "learning_rate": 1.6253283796415294e-05, + "loss": 0.9966, + "step": 1807 + }, + { + "epoch": 0.306570580754557, + "grad_norm": 0.9527314215754243, + "learning_rate": 1.624899687680217e-05, + "loss": 0.9941, + "step": 1808 + }, + { + "epoch": 0.3067401441288682, + "grad_norm": 1.0725516587971475, + "learning_rate": 1.6244708072160267e-05, + "loss": 1.0614, + "step": 1809 + }, + { + "epoch": 0.3069097075031793, + "grad_norm": 0.9858268411474339, + "learning_rate": 1.624041738378332e-05, + "loss": 0.9965, + "step": 1810 + }, + { + "epoch": 0.3070792708774905, + "grad_norm": 0.9505983220531846, + "learning_rate": 1.6236124812965622e-05, + "loss": 0.9687, + "step": 1811 + }, + { + "epoch": 0.3072488342518016, + "grad_norm": 0.9925222488352277, + "learning_rate": 1.623183036100205e-05, + "loss": 0.9858, + "step": 1812 + }, + { + "epoch": 0.3074183976261128, + "grad_norm": 0.9789615847416095, + "learning_rate": 1.6227534029188027e-05, + "loss": 1.0156, + "step": 1813 + }, + { + "epoch": 0.3075879610004239, + "grad_norm": 0.9644609138414463, + "learning_rate": 1.6223235818819564e-05, + "loss": 0.9756, + "step": 1814 + }, + { + "epoch": 0.3077575243747351, + "grad_norm": 0.9880783419658973, + "learning_rate": 1.6218935731193223e-05, + "loss": 1.0069, + "step": 1815 + }, + { + "epoch": 0.3079270877490462, + "grad_norm": 1.0281460358283794, + "learning_rate": 1.6214633767606142e-05, + "loss": 1.0077, + "step": 1816 + }, + { + "epoch": 0.3080966511233574, + "grad_norm": 0.9685630942387603, + "learning_rate": 1.6210329929356017e-05, + "loss": 1.0027, + "step": 1817 + }, + { + "epoch": 0.3082662144976685, + "grad_norm": 0.6556947755831345, + "learning_rate": 1.6206024217741125e-05, + "loss": 0.8692, + "step": 1818 + }, + { + "epoch": 0.3084357778719797, + "grad_norm": 1.0191152051284371, + "learning_rate": 1.620171663406028e-05, + "loss": 1.0353, + "step": 1819 + }, + { + "epoch": 0.3086053412462908, + "grad_norm": 0.9248084723691711, + "learning_rate": 1.619740717961289e-05, + "loss": 0.9497, + "step": 1820 + }, + { + "epoch": 0.308774904620602, + "grad_norm": 1.017505589072633, + "learning_rate": 1.619309585569891e-05, + "loss": 0.9909, + "step": 1821 + }, + { + "epoch": 0.3089444679949131, + "grad_norm": 0.9711824335427821, + "learning_rate": 1.6188782663618866e-05, + "loss": 1.0057, + "step": 1822 + }, + { + "epoch": 0.3091140313692243, + "grad_norm": 0.9288939176781582, + "learning_rate": 1.6184467604673843e-05, + "loss": 0.9775, + "step": 1823 + }, + { + "epoch": 0.3092835947435354, + "grad_norm": 0.9373969596353178, + "learning_rate": 1.6180150680165496e-05, + "loss": 0.9728, + "step": 1824 + }, + { + "epoch": 0.30945315811784657, + "grad_norm": 0.954207371612142, + "learning_rate": 1.6175831891396034e-05, + "loss": 0.9858, + "step": 1825 + }, + { + "epoch": 0.3096227214921577, + "grad_norm": 1.0573773356138934, + "learning_rate": 1.6171511239668233e-05, + "loss": 1.0212, + "step": 1826 + }, + { + "epoch": 0.30979228486646887, + "grad_norm": 0.9704153505792887, + "learning_rate": 1.6167188726285433e-05, + "loss": 0.9918, + "step": 1827 + }, + { + "epoch": 0.30996184824078, + "grad_norm": 1.060168869791843, + "learning_rate": 1.616286435255153e-05, + "loss": 1.0458, + "step": 1828 + }, + { + "epoch": 0.3101314116150911, + "grad_norm": 0.9382939266881261, + "learning_rate": 1.615853811977099e-05, + "loss": 0.9721, + "step": 1829 + }, + { + "epoch": 0.3103009749894023, + "grad_norm": 0.9258970755464889, + "learning_rate": 1.6154210029248826e-05, + "loss": 0.9888, + "step": 1830 + }, + { + "epoch": 0.3104705383637134, + "grad_norm": 1.0297541461285236, + "learning_rate": 1.6149880082290628e-05, + "loss": 1.021, + "step": 1831 + }, + { + "epoch": 0.3106401017380246, + "grad_norm": 0.6148328694730372, + "learning_rate": 1.614554828020253e-05, + "loss": 0.809, + "step": 1832 + }, + { + "epoch": 0.3108096651123357, + "grad_norm": 0.9751239243279715, + "learning_rate": 1.614121462429124e-05, + "loss": 0.9968, + "step": 1833 + }, + { + "epoch": 0.3109792284866469, + "grad_norm": 1.0179378075698877, + "learning_rate": 1.613687911586401e-05, + "loss": 1.0126, + "step": 1834 + }, + { + "epoch": 0.311148791860958, + "grad_norm": 0.9402075024967866, + "learning_rate": 1.613254175622867e-05, + "loss": 0.976, + "step": 1835 + }, + { + "epoch": 0.3113183552352692, + "grad_norm": 0.9823306599535597, + "learning_rate": 1.6128202546693592e-05, + "loss": 1.0025, + "step": 1836 + }, + { + "epoch": 0.3114879186095803, + "grad_norm": 0.9718703614639245, + "learning_rate": 1.612386148856771e-05, + "loss": 0.988, + "step": 1837 + }, + { + "epoch": 0.3116574819838915, + "grad_norm": 1.0411631714862228, + "learning_rate": 1.611951858316052e-05, + "loss": 1.0209, + "step": 1838 + }, + { + "epoch": 0.3118270453582026, + "grad_norm": 0.9481017910965698, + "learning_rate": 1.6115173831782072e-05, + "loss": 1.0047, + "step": 1839 + }, + { + "epoch": 0.3119966087325138, + "grad_norm": 0.9578261700583934, + "learning_rate": 1.611082723574297e-05, + "loss": 1.0319, + "step": 1840 + }, + { + "epoch": 0.3121661721068249, + "grad_norm": 0.9740167314483015, + "learning_rate": 1.6106478796354382e-05, + "loss": 1.016, + "step": 1841 + }, + { + "epoch": 0.3123357354811361, + "grad_norm": 0.9309827895982609, + "learning_rate": 1.6102128514928028e-05, + "loss": 1.0012, + "step": 1842 + }, + { + "epoch": 0.3125052988554472, + "grad_norm": 0.9711408564367686, + "learning_rate": 1.6097776392776182e-05, + "loss": 1.0011, + "step": 1843 + }, + { + "epoch": 0.3126748622297584, + "grad_norm": 0.6293486170572605, + "learning_rate": 1.6093422431211674e-05, + "loss": 0.8261, + "step": 1844 + }, + { + "epoch": 0.3128444256040695, + "grad_norm": 0.9811287307797799, + "learning_rate": 1.6089066631547893e-05, + "loss": 0.9897, + "step": 1845 + }, + { + "epoch": 0.3130139889783807, + "grad_norm": 0.9840862006734133, + "learning_rate": 1.608470899509877e-05, + "loss": 1.0017, + "step": 1846 + }, + { + "epoch": 0.3131835523526918, + "grad_norm": 0.9832207842358262, + "learning_rate": 1.608034952317881e-05, + "loss": 0.9903, + "step": 1847 + }, + { + "epoch": 0.313353115727003, + "grad_norm": 0.9857571817599534, + "learning_rate": 1.607598821710306e-05, + "loss": 0.979, + "step": 1848 + }, + { + "epoch": 0.3135226791013141, + "grad_norm": 1.0268107585271726, + "learning_rate": 1.6071625078187113e-05, + "loss": 1.0196, + "step": 1849 + }, + { + "epoch": 0.31369224247562527, + "grad_norm": 0.9818237677757576, + "learning_rate": 1.6067260107747133e-05, + "loss": 0.9966, + "step": 1850 + }, + { + "epoch": 0.3138618058499364, + "grad_norm": 0.9846225689782029, + "learning_rate": 1.6062893307099817e-05, + "loss": 0.9906, + "step": 1851 + }, + { + "epoch": 0.31403136922424757, + "grad_norm": 0.9631073592769925, + "learning_rate": 1.6058524677562428e-05, + "loss": 0.974, + "step": 1852 + }, + { + "epoch": 0.3142009325985587, + "grad_norm": 0.9426263049950635, + "learning_rate": 1.6054154220452776e-05, + "loss": 0.9805, + "step": 1853 + }, + { + "epoch": 0.31437049597286987, + "grad_norm": 0.9771935047093394, + "learning_rate": 1.6049781937089227e-05, + "loss": 0.9867, + "step": 1854 + }, + { + "epoch": 0.314540059347181, + "grad_norm": 0.9439378442735374, + "learning_rate": 1.6045407828790686e-05, + "loss": 0.9521, + "step": 1855 + }, + { + "epoch": 0.31470962272149217, + "grad_norm": 0.9940738226724326, + "learning_rate": 1.604103189687662e-05, + "loss": 0.9803, + "step": 1856 + }, + { + "epoch": 0.3148791860958033, + "grad_norm": 0.981022097967025, + "learning_rate": 1.6036654142667043e-05, + "loss": 0.9884, + "step": 1857 + }, + { + "epoch": 0.31504874947011446, + "grad_norm": 0.9763891018640374, + "learning_rate": 1.6032274567482514e-05, + "loss": 1.0455, + "step": 1858 + }, + { + "epoch": 0.3152183128444256, + "grad_norm": 0.9580214076727748, + "learning_rate": 1.602789317264415e-05, + "loss": 1.0031, + "step": 1859 + }, + { + "epoch": 0.31538787621873676, + "grad_norm": 0.949485377285947, + "learning_rate": 1.6023509959473608e-05, + "loss": 1.0197, + "step": 1860 + }, + { + "epoch": 0.3155574395930479, + "grad_norm": 0.9677420266974983, + "learning_rate": 1.6019124929293097e-05, + "loss": 1.0507, + "step": 1861 + }, + { + "epoch": 0.31572700296735906, + "grad_norm": 0.998869087670947, + "learning_rate": 1.6014738083425378e-05, + "loss": 0.9775, + "step": 1862 + }, + { + "epoch": 0.3158965663416702, + "grad_norm": 0.995821233756967, + "learning_rate": 1.6010349423193753e-05, + "loss": 0.9776, + "step": 1863 + }, + { + "epoch": 0.31606612971598136, + "grad_norm": 0.9431541526561014, + "learning_rate": 1.6005958949922077e-05, + "loss": 0.975, + "step": 1864 + }, + { + "epoch": 0.3162356930902925, + "grad_norm": 0.9605142782302432, + "learning_rate": 1.600156666493475e-05, + "loss": 0.997, + "step": 1865 + }, + { + "epoch": 0.31640525646460366, + "grad_norm": 1.0356940923737057, + "learning_rate": 1.5997172569556717e-05, + "loss": 1.0193, + "step": 1866 + }, + { + "epoch": 0.3165748198389148, + "grad_norm": 0.9794872450451078, + "learning_rate": 1.599277666511347e-05, + "loss": 1.028, + "step": 1867 + }, + { + "epoch": 0.31674438321322596, + "grad_norm": 0.9978961700610897, + "learning_rate": 1.5988378952931047e-05, + "loss": 1.0426, + "step": 1868 + }, + { + "epoch": 0.3169139465875371, + "grad_norm": 0.9465061263038369, + "learning_rate": 1.598397943433603e-05, + "loss": 1.0075, + "step": 1869 + }, + { + "epoch": 0.31708350996184825, + "grad_norm": 0.6579223000191161, + "learning_rate": 1.597957811065555e-05, + "loss": 0.8428, + "step": 1870 + }, + { + "epoch": 0.3172530733361594, + "grad_norm": 0.9982834175268934, + "learning_rate": 1.5975174983217273e-05, + "loss": 1.0353, + "step": 1871 + }, + { + "epoch": 0.31742263671047055, + "grad_norm": 0.6079058639237402, + "learning_rate": 1.5970770053349426e-05, + "loss": 0.8767, + "step": 1872 + }, + { + "epoch": 0.3175922000847817, + "grad_norm": 1.0574585881261511, + "learning_rate": 1.596636332238076e-05, + "loss": 0.975, + "step": 1873 + }, + { + "epoch": 0.31776176345909285, + "grad_norm": 0.9655541394825543, + "learning_rate": 1.5961954791640582e-05, + "loss": 1.0106, + "step": 1874 + }, + { + "epoch": 0.31793132683340397, + "grad_norm": 1.0048781481777413, + "learning_rate": 1.595754446245874e-05, + "loss": 0.9786, + "step": 1875 + }, + { + "epoch": 0.31810089020771515, + "grad_norm": 1.0063905949494099, + "learning_rate": 1.595313233616562e-05, + "loss": 1.0143, + "step": 1876 + }, + { + "epoch": 0.31827045358202627, + "grad_norm": 1.0406411948150756, + "learning_rate": 1.5948718414092163e-05, + "loss": 1.0399, + "step": 1877 + }, + { + "epoch": 0.31844001695633745, + "grad_norm": 0.9967238215167251, + "learning_rate": 1.5944302697569828e-05, + "loss": 0.9909, + "step": 1878 + }, + { + "epoch": 0.31860958033064857, + "grad_norm": 1.0230635091406037, + "learning_rate": 1.5939885187930636e-05, + "loss": 1.0189, + "step": 1879 + }, + { + "epoch": 0.31877914370495974, + "grad_norm": 0.946193375131637, + "learning_rate": 1.5935465886507143e-05, + "loss": 0.9927, + "step": 1880 + }, + { + "epoch": 0.31894870707927087, + "grad_norm": 1.0308349922358215, + "learning_rate": 1.593104479463244e-05, + "loss": 1.0148, + "step": 1881 + }, + { + "epoch": 0.31911827045358204, + "grad_norm": 1.0002855721720632, + "learning_rate": 1.592662191364017e-05, + "loss": 0.983, + "step": 1882 + }, + { + "epoch": 0.31928783382789316, + "grad_norm": 0.9676552935956418, + "learning_rate": 1.5922197244864503e-05, + "loss": 1.0065, + "step": 1883 + }, + { + "epoch": 0.31945739720220434, + "grad_norm": 1.0656128508421252, + "learning_rate": 1.5917770789640153e-05, + "loss": 1.0124, + "step": 1884 + }, + { + "epoch": 0.31962696057651546, + "grad_norm": 0.9732578143627179, + "learning_rate": 1.5913342549302378e-05, + "loss": 0.9807, + "step": 1885 + }, + { + "epoch": 0.31979652395082664, + "grad_norm": 0.9766799054330302, + "learning_rate": 1.590891252518697e-05, + "loss": 1.015, + "step": 1886 + }, + { + "epoch": 0.31996608732513776, + "grad_norm": 0.9792386309551184, + "learning_rate": 1.5904480718630252e-05, + "loss": 0.9719, + "step": 1887 + }, + { + "epoch": 0.32013565069944894, + "grad_norm": 1.0783478509657463, + "learning_rate": 1.59000471309691e-05, + "loss": 1.0309, + "step": 1888 + }, + { + "epoch": 0.32030521407376006, + "grad_norm": 0.9787900448683673, + "learning_rate": 1.5895611763540914e-05, + "loss": 1.0468, + "step": 1889 + }, + { + "epoch": 0.32047477744807124, + "grad_norm": 1.0192124868613712, + "learning_rate": 1.5891174617683635e-05, + "loss": 0.9936, + "step": 1890 + }, + { + "epoch": 0.32064434082238236, + "grad_norm": 0.9636750402412061, + "learning_rate": 1.588673569473575e-05, + "loss": 1.0222, + "step": 1891 + }, + { + "epoch": 0.32081390419669353, + "grad_norm": 0.9789679652829065, + "learning_rate": 1.5882294996036264e-05, + "loss": 1.0284, + "step": 1892 + }, + { + "epoch": 0.32098346757100465, + "grad_norm": 0.9885958581632905, + "learning_rate": 1.5877852522924733e-05, + "loss": 1.0412, + "step": 1893 + }, + { + "epoch": 0.32115303094531583, + "grad_norm": 0.9269950081507043, + "learning_rate": 1.5873408276741237e-05, + "loss": 1.0066, + "step": 1894 + }, + { + "epoch": 0.32132259431962695, + "grad_norm": 1.011142141345352, + "learning_rate": 1.5868962258826407e-05, + "loss": 1.0217, + "step": 1895 + }, + { + "epoch": 0.32149215769393813, + "grad_norm": 1.0098994425770482, + "learning_rate": 1.5864514470521383e-05, + "loss": 1.0075, + "step": 1896 + }, + { + "epoch": 0.32166172106824925, + "grad_norm": 0.9645736430012207, + "learning_rate": 1.5860064913167863e-05, + "loss": 1.0338, + "step": 1897 + }, + { + "epoch": 0.32183128444256043, + "grad_norm": 0.9558375423679968, + "learning_rate": 1.5855613588108067e-05, + "loss": 0.9943, + "step": 1898 + }, + { + "epoch": 0.32200084781687155, + "grad_norm": 1.0303935285419652, + "learning_rate": 1.585116049668475e-05, + "loss": 1.0291, + "step": 1899 + }, + { + "epoch": 0.3221704111911827, + "grad_norm": 1.0246587726385061, + "learning_rate": 1.5846705640241206e-05, + "loss": 1.0297, + "step": 1900 + }, + { + "epoch": 0.32233997456549385, + "grad_norm": 0.9905973664457313, + "learning_rate": 1.584224902012125e-05, + "loss": 0.9942, + "step": 1901 + }, + { + "epoch": 0.322509537939805, + "grad_norm": 0.9821434420253609, + "learning_rate": 1.5837790637669237e-05, + "loss": 1.0012, + "step": 1902 + }, + { + "epoch": 0.32267910131411615, + "grad_norm": 0.9714831890867112, + "learning_rate": 1.583333049423005e-05, + "loss": 0.9968, + "step": 1903 + }, + { + "epoch": 0.3228486646884273, + "grad_norm": 1.0450880483602858, + "learning_rate": 1.5828868591149104e-05, + "loss": 1.0139, + "step": 1904 + }, + { + "epoch": 0.32301822806273844, + "grad_norm": 1.0362206270140941, + "learning_rate": 1.5824404929772347e-05, + "loss": 1.0109, + "step": 1905 + }, + { + "epoch": 0.3231877914370496, + "grad_norm": 1.0683406522143062, + "learning_rate": 1.581993951144626e-05, + "loss": 1.0451, + "step": 1906 + }, + { + "epoch": 0.32335735481136074, + "grad_norm": 0.9874279668704485, + "learning_rate": 1.5815472337517843e-05, + "loss": 0.9901, + "step": 1907 + }, + { + "epoch": 0.3235269181856719, + "grad_norm": 0.9420503168241291, + "learning_rate": 1.5811003409334635e-05, + "loss": 1.0025, + "step": 1908 + }, + { + "epoch": 0.32369648155998304, + "grad_norm": 0.9433224210660105, + "learning_rate": 1.5806532728244707e-05, + "loss": 0.9723, + "step": 1909 + }, + { + "epoch": 0.3238660449342942, + "grad_norm": 1.0637555733920365, + "learning_rate": 1.5802060295596643e-05, + "loss": 1.0224, + "step": 1910 + }, + { + "epoch": 0.32403560830860534, + "grad_norm": 0.9544715632346825, + "learning_rate": 1.5797586112739575e-05, + "loss": 0.9786, + "step": 1911 + }, + { + "epoch": 0.3242051716829165, + "grad_norm": 1.003272481054091, + "learning_rate": 1.579311018102315e-05, + "loss": 1.0051, + "step": 1912 + }, + { + "epoch": 0.32437473505722764, + "grad_norm": 1.012267656390877, + "learning_rate": 1.5788632501797545e-05, + "loss": 1.0374, + "step": 1913 + }, + { + "epoch": 0.3245442984315388, + "grad_norm": 0.9626360713826805, + "learning_rate": 1.578415307641347e-05, + "loss": 1.047, + "step": 1914 + }, + { + "epoch": 0.32471386180584993, + "grad_norm": 0.987019021841255, + "learning_rate": 1.577967190622215e-05, + "loss": 0.9992, + "step": 1915 + }, + { + "epoch": 0.3248834251801611, + "grad_norm": 1.0597031415155116, + "learning_rate": 1.577518899257535e-05, + "loss": 1.0092, + "step": 1916 + }, + { + "epoch": 0.32505298855447223, + "grad_norm": 1.0089623696987418, + "learning_rate": 1.5770704336825354e-05, + "loss": 1.0429, + "step": 1917 + }, + { + "epoch": 0.3252225519287834, + "grad_norm": 0.9276868410811404, + "learning_rate": 1.576621794032497e-05, + "loss": 0.9488, + "step": 1918 + }, + { + "epoch": 0.32539211530309453, + "grad_norm": 1.0549749916794338, + "learning_rate": 1.576172980442753e-05, + "loss": 1.0088, + "step": 1919 + }, + { + "epoch": 0.3255616786774057, + "grad_norm": 0.9748917422847581, + "learning_rate": 1.57572399304869e-05, + "loss": 1.0183, + "step": 1920 + }, + { + "epoch": 0.32573124205171683, + "grad_norm": 1.0166603459142411, + "learning_rate": 1.575274831985746e-05, + "loss": 1.0074, + "step": 1921 + }, + { + "epoch": 0.32590080542602795, + "grad_norm": 1.0386849761493508, + "learning_rate": 1.574825497389412e-05, + "loss": 1.0136, + "step": 1922 + }, + { + "epoch": 0.3260703688003391, + "grad_norm": 1.0355082073267796, + "learning_rate": 1.5743759893952306e-05, + "loss": 1.0294, + "step": 1923 + }, + { + "epoch": 0.32623993217465025, + "grad_norm": 0.975107797332124, + "learning_rate": 1.5739263081387983e-05, + "loss": 1.0054, + "step": 1924 + }, + { + "epoch": 0.3264094955489614, + "grad_norm": 1.0073725273436516, + "learning_rate": 1.5734764537557617e-05, + "loss": 0.9822, + "step": 1925 + }, + { + "epoch": 0.32657905892327255, + "grad_norm": 1.0102409530630205, + "learning_rate": 1.5730264263818212e-05, + "loss": 1.022, + "step": 1926 + }, + { + "epoch": 0.3267486222975837, + "grad_norm": 1.0297794794168857, + "learning_rate": 1.5725762261527295e-05, + "loss": 0.9982, + "step": 1927 + }, + { + "epoch": 0.32691818567189485, + "grad_norm": 1.012319423695426, + "learning_rate": 1.57212585320429e-05, + "loss": 1.0259, + "step": 1928 + }, + { + "epoch": 0.327087749046206, + "grad_norm": 0.9621073037164247, + "learning_rate": 1.5716753076723594e-05, + "loss": 0.9813, + "step": 1929 + }, + { + "epoch": 0.32725731242051714, + "grad_norm": 1.0437756761144823, + "learning_rate": 1.5712245896928458e-05, + "loss": 1.0284, + "step": 1930 + }, + { + "epoch": 0.3274268757948283, + "grad_norm": 1.0027007083831454, + "learning_rate": 1.57077369940171e-05, + "loss": 1.0078, + "step": 1931 + }, + { + "epoch": 0.32759643916913944, + "grad_norm": 0.9622145131219082, + "learning_rate": 1.5703226369349642e-05, + "loss": 0.9976, + "step": 1932 + }, + { + "epoch": 0.3277660025434506, + "grad_norm": 0.9694541250884402, + "learning_rate": 1.5698714024286733e-05, + "loss": 1.0035, + "step": 1933 + }, + { + "epoch": 0.32793556591776174, + "grad_norm": 0.9567664175030857, + "learning_rate": 1.5694199960189526e-05, + "loss": 1.0125, + "step": 1934 + }, + { + "epoch": 0.3281051292920729, + "grad_norm": 0.9968959199031922, + "learning_rate": 1.568968417841971e-05, + "loss": 1.0129, + "step": 1935 + }, + { + "epoch": 0.32827469266638404, + "grad_norm": 0.9311772765969382, + "learning_rate": 1.5685166680339483e-05, + "loss": 0.9574, + "step": 1936 + }, + { + "epoch": 0.3284442560406952, + "grad_norm": 1.0216924103822678, + "learning_rate": 1.568064746731156e-05, + "loss": 0.998, + "step": 1937 + }, + { + "epoch": 0.32861381941500634, + "grad_norm": 1.024410162571907, + "learning_rate": 1.5676126540699174e-05, + "loss": 1.0365, + "step": 1938 + }, + { + "epoch": 0.3287833827893175, + "grad_norm": 1.0090352555077087, + "learning_rate": 1.5671603901866078e-05, + "loss": 0.9918, + "step": 1939 + }, + { + "epoch": 0.32895294616362863, + "grad_norm": 0.9674802350477768, + "learning_rate": 1.5667079552176544e-05, + "loss": 1.0133, + "step": 1940 + }, + { + "epoch": 0.3291225095379398, + "grad_norm": 0.9472013801043612, + "learning_rate": 1.5662553492995347e-05, + "loss": 0.9959, + "step": 1941 + }, + { + "epoch": 0.32929207291225093, + "grad_norm": 0.9957464801894853, + "learning_rate": 1.5658025725687793e-05, + "loss": 0.9866, + "step": 1942 + }, + { + "epoch": 0.3294616362865621, + "grad_norm": 1.0001120207477021, + "learning_rate": 1.5653496251619693e-05, + "loss": 0.9937, + "step": 1943 + }, + { + "epoch": 0.32963119966087323, + "grad_norm": 0.7111223088466704, + "learning_rate": 1.5648965072157378e-05, + "loss": 0.9294, + "step": 1944 + }, + { + "epoch": 0.3298007630351844, + "grad_norm": 0.993306192326719, + "learning_rate": 1.5644432188667695e-05, + "loss": 1.0249, + "step": 1945 + }, + { + "epoch": 0.32997032640949553, + "grad_norm": 1.0249390328546526, + "learning_rate": 1.5639897602517996e-05, + "loss": 1.0126, + "step": 1946 + }, + { + "epoch": 0.3301398897838067, + "grad_norm": 1.058227942444184, + "learning_rate": 1.5635361315076157e-05, + "loss": 1.0492, + "step": 1947 + }, + { + "epoch": 0.3303094531581178, + "grad_norm": 0.9888870900140873, + "learning_rate": 1.5630823327710558e-05, + "loss": 1.0094, + "step": 1948 + }, + { + "epoch": 0.330479016532429, + "grad_norm": 0.9357541722164157, + "learning_rate": 1.5626283641790105e-05, + "loss": 0.9903, + "step": 1949 + }, + { + "epoch": 0.3306485799067401, + "grad_norm": 0.6653962932999141, + "learning_rate": 1.56217422586842e-05, + "loss": 0.8853, + "step": 1950 + }, + { + "epoch": 0.3308181432810513, + "grad_norm": 0.9647512499555115, + "learning_rate": 1.5617199179762767e-05, + "loss": 1.0045, + "step": 1951 + }, + { + "epoch": 0.3309877066553624, + "grad_norm": 1.0420341363389451, + "learning_rate": 1.561265440639624e-05, + "loss": 1.0526, + "step": 1952 + }, + { + "epoch": 0.3311572700296736, + "grad_norm": 1.022327382221898, + "learning_rate": 1.5608107939955565e-05, + "loss": 0.9993, + "step": 1953 + }, + { + "epoch": 0.3313268334039847, + "grad_norm": 1.0567423133947502, + "learning_rate": 1.56035597818122e-05, + "loss": 1.0084, + "step": 1954 + }, + { + "epoch": 0.3314963967782959, + "grad_norm": 0.9987288255497635, + "learning_rate": 1.5599009933338102e-05, + "loss": 1.0072, + "step": 1955 + }, + { + "epoch": 0.331665960152607, + "grad_norm": 0.9767128024010041, + "learning_rate": 1.5594458395905754e-05, + "loss": 1.0091, + "step": 1956 + }, + { + "epoch": 0.3318355235269182, + "grad_norm": 1.0721425878700144, + "learning_rate": 1.5589905170888136e-05, + "loss": 1.0127, + "step": 1957 + }, + { + "epoch": 0.3320050869012293, + "grad_norm": 1.043154130609934, + "learning_rate": 1.5585350259658748e-05, + "loss": 1.0033, + "step": 1958 + }, + { + "epoch": 0.3321746502755405, + "grad_norm": 0.9883145360312716, + "learning_rate": 1.5580793663591583e-05, + "loss": 0.9702, + "step": 1959 + }, + { + "epoch": 0.3323442136498516, + "grad_norm": 1.0099266871840573, + "learning_rate": 1.5576235384061168e-05, + "loss": 0.9716, + "step": 1960 + }, + { + "epoch": 0.3325137770241628, + "grad_norm": 0.9699130377267756, + "learning_rate": 1.5571675422442504e-05, + "loss": 0.987, + "step": 1961 + }, + { + "epoch": 0.3326833403984739, + "grad_norm": 0.9592318073429286, + "learning_rate": 1.5567113780111128e-05, + "loss": 0.9793, + "step": 1962 + }, + { + "epoch": 0.3328529037727851, + "grad_norm": 0.9688505927390096, + "learning_rate": 1.556255045844307e-05, + "loss": 1.0072, + "step": 1963 + }, + { + "epoch": 0.3330224671470962, + "grad_norm": 1.0134694831162991, + "learning_rate": 1.555798545881487e-05, + "loss": 1.0319, + "step": 1964 + }, + { + "epoch": 0.3331920305214074, + "grad_norm": 0.9870003333639381, + "learning_rate": 1.5553418782603574e-05, + "loss": 1.0046, + "step": 1965 + }, + { + "epoch": 0.3333615938957185, + "grad_norm": 0.9966508367407503, + "learning_rate": 1.5548850431186732e-05, + "loss": 1.0116, + "step": 1966 + }, + { + "epoch": 0.3335311572700297, + "grad_norm": 1.1014023281728036, + "learning_rate": 1.554428040594241e-05, + "loss": 0.9934, + "step": 1967 + }, + { + "epoch": 0.3337007206443408, + "grad_norm": 0.9642444724824197, + "learning_rate": 1.5539708708249155e-05, + "loss": 1.0289, + "step": 1968 + }, + { + "epoch": 0.333870284018652, + "grad_norm": 0.9664566647450222, + "learning_rate": 1.5535135339486044e-05, + "loss": 0.9852, + "step": 1969 + }, + { + "epoch": 0.3340398473929631, + "grad_norm": 1.0395932237438719, + "learning_rate": 1.5530560301032644e-05, + "loss": 1.0025, + "step": 1970 + }, + { + "epoch": 0.3342094107672743, + "grad_norm": 1.0041883169749564, + "learning_rate": 1.5525983594269026e-05, + "loss": 0.9601, + "step": 1971 + }, + { + "epoch": 0.3343789741415854, + "grad_norm": 0.9948895018405378, + "learning_rate": 1.552140522057578e-05, + "loss": 1.0054, + "step": 1972 + }, + { + "epoch": 0.3345485375158966, + "grad_norm": 1.0085855924292748, + "learning_rate": 1.5516825181333967e-05, + "loss": 1.0138, + "step": 1973 + }, + { + "epoch": 0.3347181008902077, + "grad_norm": 0.9969111700914716, + "learning_rate": 1.551224347792519e-05, + "loss": 0.9863, + "step": 1974 + }, + { + "epoch": 0.3348876642645189, + "grad_norm": 1.0078960859634392, + "learning_rate": 1.5507660111731514e-05, + "loss": 0.9961, + "step": 1975 + }, + { + "epoch": 0.33505722763883, + "grad_norm": 1.012636963165273, + "learning_rate": 1.550307508413554e-05, + "loss": 1.0047, + "step": 1976 + }, + { + "epoch": 0.3352267910131412, + "grad_norm": 1.064933940454274, + "learning_rate": 1.549848839652035e-05, + "loss": 1.0522, + "step": 1977 + }, + { + "epoch": 0.3353963543874523, + "grad_norm": 0.9858142553685583, + "learning_rate": 1.5493900050269536e-05, + "loss": 0.9676, + "step": 1978 + }, + { + "epoch": 0.3355659177617635, + "grad_norm": 0.9695329457844212, + "learning_rate": 1.5489310046767178e-05, + "loss": 0.9619, + "step": 1979 + }, + { + "epoch": 0.3357354811360746, + "grad_norm": 0.9898193815478287, + "learning_rate": 1.548471838739787e-05, + "loss": 1.0095, + "step": 1980 + }, + { + "epoch": 0.3359050445103858, + "grad_norm": 0.9616954499602726, + "learning_rate": 1.5480125073546705e-05, + "loss": 1.0091, + "step": 1981 + }, + { + "epoch": 0.3360746078846969, + "grad_norm": 0.9923947777580014, + "learning_rate": 1.5475530106599256e-05, + "loss": 0.985, + "step": 1982 + }, + { + "epoch": 0.3362441712590081, + "grad_norm": 1.041335852187222, + "learning_rate": 1.547093348794162e-05, + "loss": 1.0108, + "step": 1983 + }, + { + "epoch": 0.3364137346333192, + "grad_norm": 0.9669472264361062, + "learning_rate": 1.546633521896038e-05, + "loss": 0.9785, + "step": 1984 + }, + { + "epoch": 0.33658329800763037, + "grad_norm": 0.9881989601125073, + "learning_rate": 1.5461735301042615e-05, + "loss": 1.0171, + "step": 1985 + }, + { + "epoch": 0.3367528613819415, + "grad_norm": 0.9854037263011841, + "learning_rate": 1.5457133735575905e-05, + "loss": 1.0127, + "step": 1986 + }, + { + "epoch": 0.33692242475625267, + "grad_norm": 0.9742649102909576, + "learning_rate": 1.5452530523948323e-05, + "loss": 1.019, + "step": 1987 + }, + { + "epoch": 0.3370919881305638, + "grad_norm": 1.0264095263409634, + "learning_rate": 1.5447925667548448e-05, + "loss": 1.0525, + "step": 1988 + }, + { + "epoch": 0.33726155150487497, + "grad_norm": 0.9536302093637906, + "learning_rate": 1.5443319167765345e-05, + "loss": 0.9977, + "step": 1989 + }, + { + "epoch": 0.3374311148791861, + "grad_norm": 0.9882925682687181, + "learning_rate": 1.543871102598858e-05, + "loss": 1.006, + "step": 1990 + }, + { + "epoch": 0.33760067825349727, + "grad_norm": 0.9982911558620327, + "learning_rate": 1.5434101243608205e-05, + "loss": 1.0101, + "step": 1991 + }, + { + "epoch": 0.3377702416278084, + "grad_norm": 0.9984375035046159, + "learning_rate": 1.542948982201479e-05, + "loss": 1.0205, + "step": 1992 + }, + { + "epoch": 0.33793980500211956, + "grad_norm": 0.9901329379999525, + "learning_rate": 1.542487676259937e-05, + "loss": 0.9798, + "step": 1993 + }, + { + "epoch": 0.3381093683764307, + "grad_norm": 0.9204799605729618, + "learning_rate": 1.54202620667535e-05, + "loss": 0.9613, + "step": 1994 + }, + { + "epoch": 0.33827893175074186, + "grad_norm": 0.9564591902100879, + "learning_rate": 1.5415645735869206e-05, + "loss": 0.9798, + "step": 1995 + }, + { + "epoch": 0.338448495125053, + "grad_norm": 0.9855650280510012, + "learning_rate": 1.5411027771339023e-05, + "loss": 0.9648, + "step": 1996 + }, + { + "epoch": 0.33861805849936416, + "grad_norm": 0.9783289004141156, + "learning_rate": 1.5406408174555978e-05, + "loss": 0.9819, + "step": 1997 + }, + { + "epoch": 0.3387876218736753, + "grad_norm": 1.0178840449502158, + "learning_rate": 1.540178694691358e-05, + "loss": 0.9668, + "step": 1998 + }, + { + "epoch": 0.33895718524798646, + "grad_norm": 0.91657579096201, + "learning_rate": 1.5397164089805842e-05, + "loss": 0.9902, + "step": 1999 + }, + { + "epoch": 0.3391267486222976, + "grad_norm": 0.9461462072822882, + "learning_rate": 1.5392539604627255e-05, + "loss": 0.9868, + "step": 2000 + }, + { + "epoch": 0.33929631199660876, + "grad_norm": 0.9688217491830641, + "learning_rate": 1.5387913492772816e-05, + "loss": 0.9971, + "step": 2001 + }, + { + "epoch": 0.3394658753709199, + "grad_norm": 0.9558164219939712, + "learning_rate": 1.5383285755638002e-05, + "loss": 1.016, + "step": 2002 + }, + { + "epoch": 0.33963543874523106, + "grad_norm": 0.9623546503858995, + "learning_rate": 1.5378656394618788e-05, + "loss": 0.9913, + "step": 2003 + }, + { + "epoch": 0.3398050021195422, + "grad_norm": 1.050059224730583, + "learning_rate": 1.537402541111163e-05, + "loss": 1.0203, + "step": 2004 + }, + { + "epoch": 0.33997456549385335, + "grad_norm": 0.9992401293860915, + "learning_rate": 1.536939280651348e-05, + "loss": 0.9996, + "step": 2005 + }, + { + "epoch": 0.3401441288681645, + "grad_norm": 0.9882078907203777, + "learning_rate": 1.5364758582221775e-05, + "loss": 1.0001, + "step": 2006 + }, + { + "epoch": 0.34031369224247565, + "grad_norm": 0.9250831840923877, + "learning_rate": 1.5360122739634444e-05, + "loss": 0.9646, + "step": 2007 + }, + { + "epoch": 0.3404832556167868, + "grad_norm": 1.005415331249681, + "learning_rate": 1.5355485280149908e-05, + "loss": 1.0214, + "step": 2008 + }, + { + "epoch": 0.34065281899109795, + "grad_norm": 0.9172486724632182, + "learning_rate": 1.5350846205167065e-05, + "loss": 0.9444, + "step": 2009 + }, + { + "epoch": 0.34082238236540907, + "grad_norm": 0.9613513738656416, + "learning_rate": 1.5346205516085305e-05, + "loss": 1.0225, + "step": 2010 + }, + { + "epoch": 0.34099194573972025, + "grad_norm": 1.044286141829047, + "learning_rate": 1.534156321430451e-05, + "loss": 1.0439, + "step": 2011 + }, + { + "epoch": 0.34116150911403137, + "grad_norm": 0.9825547205433247, + "learning_rate": 1.5336919301225042e-05, + "loss": 0.9979, + "step": 2012 + }, + { + "epoch": 0.3413310724883425, + "grad_norm": 0.9699608596113349, + "learning_rate": 1.533227377824775e-05, + "loss": 1.0053, + "step": 2013 + }, + { + "epoch": 0.34150063586265367, + "grad_norm": 1.0081056256837027, + "learning_rate": 1.5327626646773975e-05, + "loss": 1.0248, + "step": 2014 + }, + { + "epoch": 0.3416701992369648, + "grad_norm": 0.9841112820981301, + "learning_rate": 1.5322977908205537e-05, + "loss": 0.9888, + "step": 2015 + }, + { + "epoch": 0.34183976261127597, + "grad_norm": 0.9427539140869523, + "learning_rate": 1.531832756394474e-05, + "loss": 1.0081, + "step": 2016 + }, + { + "epoch": 0.3420093259855871, + "grad_norm": 0.9850126420134957, + "learning_rate": 1.5313675615394373e-05, + "loss": 1.006, + "step": 2017 + }, + { + "epoch": 0.34217888935989826, + "grad_norm": 0.9432197837408135, + "learning_rate": 1.5309022063957712e-05, + "loss": 1.0102, + "step": 2018 + }, + { + "epoch": 0.3423484527342094, + "grad_norm": 0.9933170323915388, + "learning_rate": 1.5304366911038517e-05, + "loss": 1.0343, + "step": 2019 + }, + { + "epoch": 0.34251801610852056, + "grad_norm": 1.0039903386596305, + "learning_rate": 1.5299710158041023e-05, + "loss": 1.0212, + "step": 2020 + }, + { + "epoch": 0.3426875794828317, + "grad_norm": 0.9870840459336154, + "learning_rate": 1.529505180636996e-05, + "loss": 1.0015, + "step": 2021 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 1.0071131021616893, + "learning_rate": 1.5290391857430528e-05, + "loss": 1.0019, + "step": 2022 + }, + { + "epoch": 0.343026706231454, + "grad_norm": 0.9786450318999963, + "learning_rate": 1.528573031262842e-05, + "loss": 1.0149, + "step": 2023 + }, + { + "epoch": 0.34319626960576516, + "grad_norm": 0.974870653942427, + "learning_rate": 1.5281067173369804e-05, + "loss": 0.9823, + "step": 2024 + }, + { + "epoch": 0.3433658329800763, + "grad_norm": 0.9942748834805935, + "learning_rate": 1.527640244106133e-05, + "loss": 0.9934, + "step": 2025 + }, + { + "epoch": 0.34353539635438746, + "grad_norm": 0.9764450285933127, + "learning_rate": 1.5271736117110125e-05, + "loss": 1.0121, + "step": 2026 + }, + { + "epoch": 0.3437049597286986, + "grad_norm": 0.9956403755146589, + "learning_rate": 1.5267068202923802e-05, + "loss": 1.0196, + "step": 2027 + }, + { + "epoch": 0.34387452310300975, + "grad_norm": 0.9646787279914002, + "learning_rate": 1.5262398699910456e-05, + "loss": 1.0145, + "step": 2028 + }, + { + "epoch": 0.3440440864773209, + "grad_norm": 0.9476877958294668, + "learning_rate": 1.5257727609478647e-05, + "loss": 1.015, + "step": 2029 + }, + { + "epoch": 0.34421364985163205, + "grad_norm": 0.9371017637390845, + "learning_rate": 1.5253054933037433e-05, + "loss": 0.9834, + "step": 2030 + }, + { + "epoch": 0.3443832132259432, + "grad_norm": 1.0200467530193926, + "learning_rate": 1.5248380671996333e-05, + "loss": 1.0087, + "step": 2031 + }, + { + "epoch": 0.34455277660025435, + "grad_norm": 0.9478642311363885, + "learning_rate": 1.5243704827765355e-05, + "loss": 0.9645, + "step": 2032 + }, + { + "epoch": 0.3447223399745655, + "grad_norm": 0.9849038729822642, + "learning_rate": 1.5239027401754987e-05, + "loss": 1.0299, + "step": 2033 + }, + { + "epoch": 0.34489190334887665, + "grad_norm": 1.008066785172618, + "learning_rate": 1.5234348395376181e-05, + "loss": 0.9889, + "step": 2034 + }, + { + "epoch": 0.34506146672318777, + "grad_norm": 0.9890629169110476, + "learning_rate": 1.522966781004038e-05, + "loss": 1.006, + "step": 2035 + }, + { + "epoch": 0.34523103009749895, + "grad_norm": 0.9892380352512955, + "learning_rate": 1.5224985647159489e-05, + "loss": 0.9774, + "step": 2036 + }, + { + "epoch": 0.34540059347181007, + "grad_norm": 0.934807228595409, + "learning_rate": 1.5220301908145905e-05, + "loss": 0.9482, + "step": 2037 + }, + { + "epoch": 0.34557015684612125, + "grad_norm": 0.9958996659542763, + "learning_rate": 1.5215616594412483e-05, + "loss": 1.0081, + "step": 2038 + }, + { + "epoch": 0.34573972022043237, + "grad_norm": 0.9489549153343333, + "learning_rate": 1.521092970737257e-05, + "loss": 0.9583, + "step": 2039 + }, + { + "epoch": 0.34590928359474354, + "grad_norm": 0.9745619440493434, + "learning_rate": 1.5206241248439977e-05, + "loss": 0.9813, + "step": 2040 + }, + { + "epoch": 0.34607884696905467, + "grad_norm": 0.9588406776363732, + "learning_rate": 1.5201551219028988e-05, + "loss": 0.9711, + "step": 2041 + }, + { + "epoch": 0.34624841034336584, + "grad_norm": 1.0720060950984562, + "learning_rate": 1.519685962055437e-05, + "loss": 1.0221, + "step": 2042 + }, + { + "epoch": 0.34641797371767696, + "grad_norm": 0.9601982126000166, + "learning_rate": 1.5192166454431357e-05, + "loss": 0.9851, + "step": 2043 + }, + { + "epoch": 0.34658753709198814, + "grad_norm": 0.9778824941821028, + "learning_rate": 1.518747172207565e-05, + "loss": 1.0103, + "step": 2044 + }, + { + "epoch": 0.34675710046629926, + "grad_norm": 0.9599923893417894, + "learning_rate": 1.5182775424903437e-05, + "loss": 0.9766, + "step": 2045 + }, + { + "epoch": 0.34692666384061044, + "grad_norm": 0.9582586657866646, + "learning_rate": 1.5178077564331371e-05, + "loss": 1.0008, + "step": 2046 + }, + { + "epoch": 0.34709622721492156, + "grad_norm": 0.9797476428419623, + "learning_rate": 1.5173378141776569e-05, + "loss": 1.0216, + "step": 2047 + }, + { + "epoch": 0.34726579058923274, + "grad_norm": 0.9814825111743928, + "learning_rate": 1.5168677158656633e-05, + "loss": 0.9631, + "step": 2048 + }, + { + "epoch": 0.34743535396354386, + "grad_norm": 0.975072724699753, + "learning_rate": 1.5163974616389621e-05, + "loss": 1.0008, + "step": 2049 + }, + { + "epoch": 0.34760491733785503, + "grad_norm": 0.9931577930713837, + "learning_rate": 1.5159270516394074e-05, + "loss": 0.9714, + "step": 2050 + }, + { + "epoch": 0.34777448071216616, + "grad_norm": 0.9687591244382638, + "learning_rate": 1.5154564860088998e-05, + "loss": 1.0194, + "step": 2051 + }, + { + "epoch": 0.34794404408647733, + "grad_norm": 0.9486807395019651, + "learning_rate": 1.5149857648893871e-05, + "loss": 1.0082, + "step": 2052 + }, + { + "epoch": 0.34811360746078845, + "grad_norm": 0.9599413755492713, + "learning_rate": 1.514514888422863e-05, + "loss": 0.9723, + "step": 2053 + }, + { + "epoch": 0.34828317083509963, + "grad_norm": 1.0240880542834017, + "learning_rate": 1.5140438567513695e-05, + "loss": 1.0041, + "step": 2054 + }, + { + "epoch": 0.34845273420941075, + "grad_norm": 0.9789204918194641, + "learning_rate": 1.5135726700169944e-05, + "loss": 1.0003, + "step": 2055 + }, + { + "epoch": 0.34862229758372193, + "grad_norm": 0.9653717275377449, + "learning_rate": 1.5131013283618725e-05, + "loss": 0.9981, + "step": 2056 + }, + { + "epoch": 0.34879186095803305, + "grad_norm": 0.9728103891927375, + "learning_rate": 1.5126298319281859e-05, + "loss": 0.9858, + "step": 2057 + }, + { + "epoch": 0.3489614243323442, + "grad_norm": 1.01168269180722, + "learning_rate": 1.5121581808581623e-05, + "loss": 0.9705, + "step": 2058 + }, + { + "epoch": 0.34913098770665535, + "grad_norm": 0.9295484813073963, + "learning_rate": 1.511686375294077e-05, + "loss": 1.0018, + "step": 2059 + }, + { + "epoch": 0.3493005510809665, + "grad_norm": 1.010828751609662, + "learning_rate": 1.5112144153782517e-05, + "loss": 1.0231, + "step": 2060 + }, + { + "epoch": 0.34947011445527765, + "grad_norm": 0.9385574049809434, + "learning_rate": 1.5107423012530546e-05, + "loss": 0.9963, + "step": 2061 + }, + { + "epoch": 0.3496396778295888, + "grad_norm": 0.9958651528186389, + "learning_rate": 1.5102700330609e-05, + "loss": 0.9962, + "step": 2062 + }, + { + "epoch": 0.34980924120389995, + "grad_norm": 0.9615275481888363, + "learning_rate": 1.5097976109442489e-05, + "loss": 0.9946, + "step": 2063 + }, + { + "epoch": 0.3499788045782111, + "grad_norm": 0.9947243347240171, + "learning_rate": 1.5093250350456092e-05, + "loss": 1.0012, + "step": 2064 + }, + { + "epoch": 0.35014836795252224, + "grad_norm": 0.9827765460141319, + "learning_rate": 1.508852305507535e-05, + "loss": 0.9935, + "step": 2065 + }, + { + "epoch": 0.3503179313268334, + "grad_norm": 0.9645647840904965, + "learning_rate": 1.5083794224726262e-05, + "loss": 0.9634, + "step": 2066 + }, + { + "epoch": 0.35048749470114454, + "grad_norm": 0.9492096519486234, + "learning_rate": 1.5079063860835295e-05, + "loss": 0.9779, + "step": 2067 + }, + { + "epoch": 0.3506570580754557, + "grad_norm": 0.9734174689268947, + "learning_rate": 1.507433196482938e-05, + "loss": 0.9702, + "step": 2068 + }, + { + "epoch": 0.35082662144976684, + "grad_norm": 0.9264518521913037, + "learning_rate": 1.5069598538135905e-05, + "loss": 1.0106, + "step": 2069 + }, + { + "epoch": 0.350996184824078, + "grad_norm": 0.9623859365502476, + "learning_rate": 1.5064863582182725e-05, + "loss": 0.9733, + "step": 2070 + }, + { + "epoch": 0.35116574819838914, + "grad_norm": 0.9905612849049856, + "learning_rate": 1.5060127098398151e-05, + "loss": 1.0115, + "step": 2071 + }, + { + "epoch": 0.3513353115727003, + "grad_norm": 0.9934109799673223, + "learning_rate": 1.505538908821096e-05, + "loss": 0.9464, + "step": 2072 + }, + { + "epoch": 0.35150487494701144, + "grad_norm": 0.9538267705770282, + "learning_rate": 1.5050649553050383e-05, + "loss": 0.9873, + "step": 2073 + }, + { + "epoch": 0.3516744383213226, + "grad_norm": 0.9340991033226421, + "learning_rate": 1.5045908494346117e-05, + "loss": 0.9821, + "step": 2074 + }, + { + "epoch": 0.35184400169563373, + "grad_norm": 0.9300807794171914, + "learning_rate": 1.504116591352832e-05, + "loss": 0.9601, + "step": 2075 + }, + { + "epoch": 0.3520135650699449, + "grad_norm": 0.9796421579581944, + "learning_rate": 1.5036421812027604e-05, + "loss": 0.955, + "step": 2076 + }, + { + "epoch": 0.35218312844425603, + "grad_norm": 0.9963431326293527, + "learning_rate": 1.503167619127504e-05, + "loss": 1.0051, + "step": 2077 + }, + { + "epoch": 0.3523526918185672, + "grad_norm": 1.022777336166867, + "learning_rate": 1.5026929052702159e-05, + "loss": 1.0482, + "step": 2078 + }, + { + "epoch": 0.35252225519287833, + "grad_norm": 1.003562436958779, + "learning_rate": 1.5022180397740953e-05, + "loss": 0.9912, + "step": 2079 + }, + { + "epoch": 0.3526918185671895, + "grad_norm": 0.9941493414383453, + "learning_rate": 1.5017430227823867e-05, + "loss": 1.0117, + "step": 2080 + }, + { + "epoch": 0.35286138194150063, + "grad_norm": 0.9738283524302663, + "learning_rate": 1.5012678544383797e-05, + "loss": 0.9876, + "step": 2081 + }, + { + "epoch": 0.3530309453158118, + "grad_norm": 0.6291859971830636, + "learning_rate": 1.5007925348854112e-05, + "loss": 0.7942, + "step": 2082 + }, + { + "epoch": 0.3532005086901229, + "grad_norm": 0.9709256770650636, + "learning_rate": 1.500317064266862e-05, + "loss": 0.9697, + "step": 2083 + }, + { + "epoch": 0.3533700720644341, + "grad_norm": 0.9826143611178059, + "learning_rate": 1.4998414427261601e-05, + "loss": 0.9948, + "step": 2084 + }, + { + "epoch": 0.3535396354387452, + "grad_norm": 1.0725279959117793, + "learning_rate": 1.4993656704067777e-05, + "loss": 0.9961, + "step": 2085 + }, + { + "epoch": 0.3537091988130564, + "grad_norm": 0.9810808774283746, + "learning_rate": 1.4988897474522328e-05, + "loss": 0.9949, + "step": 2086 + }, + { + "epoch": 0.3538787621873675, + "grad_norm": 0.9780484660843558, + "learning_rate": 1.4984136740060893e-05, + "loss": 0.9921, + "step": 2087 + }, + { + "epoch": 0.3540483255616787, + "grad_norm": 1.0205286753424472, + "learning_rate": 1.497937450211956e-05, + "loss": 1.027, + "step": 2088 + }, + { + "epoch": 0.3542178889359898, + "grad_norm": 0.9531606724233822, + "learning_rate": 1.4974610762134875e-05, + "loss": 0.9724, + "step": 2089 + }, + { + "epoch": 0.354387452310301, + "grad_norm": 1.0281141393128763, + "learning_rate": 1.496984552154383e-05, + "loss": 1.0319, + "step": 2090 + }, + { + "epoch": 0.3545570156846121, + "grad_norm": 0.9656993127942579, + "learning_rate": 1.4965078781783882e-05, + "loss": 0.9542, + "step": 2091 + }, + { + "epoch": 0.3547265790589233, + "grad_norm": 0.9888888120020639, + "learning_rate": 1.4960310544292922e-05, + "loss": 0.9819, + "step": 2092 + }, + { + "epoch": 0.3548961424332344, + "grad_norm": 0.9439362596378621, + "learning_rate": 1.495554081050931e-05, + "loss": 0.9828, + "step": 2093 + }, + { + "epoch": 0.3550657058075456, + "grad_norm": 1.027081446998009, + "learning_rate": 1.495076958187185e-05, + "loss": 0.9997, + "step": 2094 + }, + { + "epoch": 0.3552352691818567, + "grad_norm": 0.9904497045976364, + "learning_rate": 1.4945996859819799e-05, + "loss": 1.0001, + "step": 2095 + }, + { + "epoch": 0.3554048325561679, + "grad_norm": 0.966081720739253, + "learning_rate": 1.4941222645792856e-05, + "loss": 0.9337, + "step": 2096 + }, + { + "epoch": 0.355574395930479, + "grad_norm": 0.9744187863278753, + "learning_rate": 1.4936446941231186e-05, + "loss": 1.0045, + "step": 2097 + }, + { + "epoch": 0.3557439593047902, + "grad_norm": 1.0051121518130586, + "learning_rate": 1.4931669747575389e-05, + "loss": 0.9733, + "step": 2098 + }, + { + "epoch": 0.3559135226791013, + "grad_norm": 1.0501871535414808, + "learning_rate": 1.4926891066266523e-05, + "loss": 1.0377, + "step": 2099 + }, + { + "epoch": 0.3560830860534125, + "grad_norm": 0.9446024027913572, + "learning_rate": 1.4922110898746089e-05, + "loss": 0.98, + "step": 2100 + }, + { + "epoch": 0.3562526494277236, + "grad_norm": 0.9866377395826637, + "learning_rate": 1.491732924645604e-05, + "loss": 0.9894, + "step": 2101 + }, + { + "epoch": 0.3564222128020348, + "grad_norm": 0.9352987069621439, + "learning_rate": 1.4912546110838775e-05, + "loss": 0.9674, + "step": 2102 + }, + { + "epoch": 0.3565917761763459, + "grad_norm": 0.7018484045669574, + "learning_rate": 1.4907761493337143e-05, + "loss": 0.8398, + "step": 2103 + }, + { + "epoch": 0.35676133955065703, + "grad_norm": 0.9479973345877905, + "learning_rate": 1.4902975395394437e-05, + "loss": 0.9942, + "step": 2104 + }, + { + "epoch": 0.3569309029249682, + "grad_norm": 0.9931903657381124, + "learning_rate": 1.4898187818454401e-05, + "loss": 1.0144, + "step": 2105 + }, + { + "epoch": 0.35710046629927933, + "grad_norm": 1.0693892511901195, + "learning_rate": 1.4893398763961217e-05, + "loss": 1.0329, + "step": 2106 + }, + { + "epoch": 0.3572700296735905, + "grad_norm": 1.014188407844201, + "learning_rate": 1.488860823335952e-05, + "loss": 1.0117, + "step": 2107 + }, + { + "epoch": 0.3574395930479016, + "grad_norm": 0.9465493571813999, + "learning_rate": 1.4883816228094392e-05, + "loss": 1.0071, + "step": 2108 + }, + { + "epoch": 0.3576091564222128, + "grad_norm": 1.0095925274863478, + "learning_rate": 1.4879022749611349e-05, + "loss": 1.0345, + "step": 2109 + }, + { + "epoch": 0.3577787197965239, + "grad_norm": 0.9526886562301604, + "learning_rate": 1.4874227799356362e-05, + "loss": 0.995, + "step": 2110 + }, + { + "epoch": 0.3579482831708351, + "grad_norm": 0.8995092600181381, + "learning_rate": 1.4869431378775841e-05, + "loss": 0.9326, + "step": 2111 + }, + { + "epoch": 0.3581178465451462, + "grad_norm": 1.003368076243649, + "learning_rate": 1.4864633489316643e-05, + "loss": 1.0139, + "step": 2112 + }, + { + "epoch": 0.3582874099194574, + "grad_norm": 0.9324686182940445, + "learning_rate": 1.485983413242606e-05, + "loss": 0.9916, + "step": 2113 + }, + { + "epoch": 0.3584569732937685, + "grad_norm": 0.9617532631228993, + "learning_rate": 1.4855033309551842e-05, + "loss": 0.9359, + "step": 2114 + }, + { + "epoch": 0.3586265366680797, + "grad_norm": 1.0416784044123946, + "learning_rate": 1.4850231022142163e-05, + "loss": 0.993, + "step": 2115 + }, + { + "epoch": 0.3587961000423908, + "grad_norm": 0.9785218695171009, + "learning_rate": 1.4845427271645654e-05, + "loss": 0.9974, + "step": 2116 + }, + { + "epoch": 0.358965663416702, + "grad_norm": 0.9863400278879627, + "learning_rate": 1.4840622059511376e-05, + "loss": 1.0045, + "step": 2117 + }, + { + "epoch": 0.3591352267910131, + "grad_norm": 1.0519859642889278, + "learning_rate": 1.4835815387188839e-05, + "loss": 1.0044, + "step": 2118 + }, + { + "epoch": 0.3593047901653243, + "grad_norm": 1.0234829669208403, + "learning_rate": 1.4831007256127988e-05, + "loss": 1.0243, + "step": 2119 + }, + { + "epoch": 0.3594743535396354, + "grad_norm": 0.9257391581874472, + "learning_rate": 1.4826197667779207e-05, + "loss": 0.9806, + "step": 2120 + }, + { + "epoch": 0.3596439169139466, + "grad_norm": 0.9551537262342021, + "learning_rate": 1.4821386623593332e-05, + "loss": 0.9819, + "step": 2121 + }, + { + "epoch": 0.3598134802882577, + "grad_norm": 0.6665992334118088, + "learning_rate": 1.4816574125021621e-05, + "loss": 0.8599, + "step": 2122 + }, + { + "epoch": 0.3599830436625689, + "grad_norm": 0.6332320711776434, + "learning_rate": 1.4811760173515783e-05, + "loss": 0.831, + "step": 2123 + }, + { + "epoch": 0.36015260703688, + "grad_norm": 0.6674833253166653, + "learning_rate": 1.4806944770527958e-05, + "loss": 0.8643, + "step": 2124 + }, + { + "epoch": 0.3603221704111912, + "grad_norm": 1.1268055197988103, + "learning_rate": 1.4802127917510731e-05, + "loss": 0.9642, + "step": 2125 + }, + { + "epoch": 0.3604917337855023, + "grad_norm": 0.9515325932740403, + "learning_rate": 1.4797309615917117e-05, + "loss": 0.9983, + "step": 2126 + }, + { + "epoch": 0.3606612971598135, + "grad_norm": 0.960957507632623, + "learning_rate": 1.479248986720057e-05, + "loss": 1.0074, + "step": 2127 + }, + { + "epoch": 0.3608308605341246, + "grad_norm": 1.0134484052530388, + "learning_rate": 1.4787668672814985e-05, + "loss": 0.981, + "step": 2128 + }, + { + "epoch": 0.3610004239084358, + "grad_norm": 1.027105251388156, + "learning_rate": 1.4782846034214689e-05, + "loss": 1.0068, + "step": 2129 + }, + { + "epoch": 0.3611699872827469, + "grad_norm": 0.9975149036700401, + "learning_rate": 1.4778021952854444e-05, + "loss": 1.039, + "step": 2130 + }, + { + "epoch": 0.3613395506570581, + "grad_norm": 1.019092810143808, + "learning_rate": 1.4773196430189448e-05, + "loss": 1.0305, + "step": 2131 + }, + { + "epoch": 0.3615091140313692, + "grad_norm": 1.0211526406156899, + "learning_rate": 1.4768369467675338e-05, + "loss": 0.9966, + "step": 2132 + }, + { + "epoch": 0.3616786774056804, + "grad_norm": 0.9664773308663944, + "learning_rate": 1.476354106676818e-05, + "loss": 1.04, + "step": 2133 + }, + { + "epoch": 0.3618482407799915, + "grad_norm": 0.9636581321467599, + "learning_rate": 1.4758711228924477e-05, + "loss": 1.0166, + "step": 2134 + }, + { + "epoch": 0.3620178041543027, + "grad_norm": 0.9524091775344606, + "learning_rate": 1.4753879955601162e-05, + "loss": 0.9968, + "step": 2135 + }, + { + "epoch": 0.3621873675286138, + "grad_norm": 0.9743728498015906, + "learning_rate": 1.4749047248255605e-05, + "loss": 1.0145, + "step": 2136 + }, + { + "epoch": 0.362356930902925, + "grad_norm": 0.995747233178651, + "learning_rate": 1.4744213108345605e-05, + "loss": 0.9934, + "step": 2137 + }, + { + "epoch": 0.3625264942772361, + "grad_norm": 0.9537191035741277, + "learning_rate": 1.4739377537329396e-05, + "loss": 0.9844, + "step": 2138 + }, + { + "epoch": 0.3626960576515473, + "grad_norm": 0.9956132533069475, + "learning_rate": 1.4734540536665642e-05, + "loss": 0.9991, + "step": 2139 + }, + { + "epoch": 0.3628656210258584, + "grad_norm": 1.0296608724303693, + "learning_rate": 1.4729702107813438e-05, + "loss": 1.0151, + "step": 2140 + }, + { + "epoch": 0.3630351844001696, + "grad_norm": 0.7715803235785113, + "learning_rate": 1.4724862252232313e-05, + "loss": 0.8861, + "step": 2141 + }, + { + "epoch": 0.3632047477744807, + "grad_norm": 0.9720662706062622, + "learning_rate": 1.4720020971382223e-05, + "loss": 0.975, + "step": 2142 + }, + { + "epoch": 0.3633743111487919, + "grad_norm": 1.0156789445922831, + "learning_rate": 1.4715178266723556e-05, + "loss": 0.9931, + "step": 2143 + }, + { + "epoch": 0.363543874523103, + "grad_norm": 0.9772187033008063, + "learning_rate": 1.4710334139717126e-05, + "loss": 0.9775, + "step": 2144 + }, + { + "epoch": 0.36371343789741417, + "grad_norm": 0.985320433902782, + "learning_rate": 1.4705488591824182e-05, + "loss": 0.986, + "step": 2145 + }, + { + "epoch": 0.3638830012717253, + "grad_norm": 0.970568010359772, + "learning_rate": 1.4700641624506392e-05, + "loss": 0.9918, + "step": 2146 + }, + { + "epoch": 0.36405256464603647, + "grad_norm": 1.0871718240769177, + "learning_rate": 1.4695793239225864e-05, + "loss": 1.0057, + "step": 2147 + }, + { + "epoch": 0.3642221280203476, + "grad_norm": 1.0504833631976895, + "learning_rate": 1.4690943437445129e-05, + "loss": 0.9908, + "step": 2148 + }, + { + "epoch": 0.36439169139465877, + "grad_norm": 0.9584611661693286, + "learning_rate": 1.4686092220627139e-05, + "loss": 0.9627, + "step": 2149 + }, + { + "epoch": 0.3645612547689699, + "grad_norm": 0.6324928696992854, + "learning_rate": 1.4681239590235281e-05, + "loss": 0.8321, + "step": 2150 + }, + { + "epoch": 0.36473081814328107, + "grad_norm": 0.9331345808869536, + "learning_rate": 1.4676385547733367e-05, + "loss": 0.972, + "step": 2151 + }, + { + "epoch": 0.3649003815175922, + "grad_norm": 1.0394706859932694, + "learning_rate": 1.467153009458563e-05, + "loss": 0.9955, + "step": 2152 + }, + { + "epoch": 0.36506994489190336, + "grad_norm": 1.0252552588863841, + "learning_rate": 1.4666673232256738e-05, + "loss": 1.0176, + "step": 2153 + }, + { + "epoch": 0.3652395082662145, + "grad_norm": 0.9856233335350563, + "learning_rate": 1.4661814962211775e-05, + "loss": 1.0223, + "step": 2154 + }, + { + "epoch": 0.36540907164052566, + "grad_norm": 1.0176826755835535, + "learning_rate": 1.465695528591625e-05, + "loss": 1.0119, + "step": 2155 + }, + { + "epoch": 0.3655786350148368, + "grad_norm": 1.0322572206424578, + "learning_rate": 1.46520942048361e-05, + "loss": 1.0347, + "step": 2156 + }, + { + "epoch": 0.36574819838914796, + "grad_norm": 0.9532447028554529, + "learning_rate": 1.4647231720437687e-05, + "loss": 1.0126, + "step": 2157 + }, + { + "epoch": 0.3659177617634591, + "grad_norm": 0.9840195638383302, + "learning_rate": 1.4642367834187795e-05, + "loss": 1.034, + "step": 2158 + }, + { + "epoch": 0.36608732513777026, + "grad_norm": 1.0221261421189956, + "learning_rate": 1.4637502547553626e-05, + "loss": 1.0173, + "step": 2159 + }, + { + "epoch": 0.3662568885120814, + "grad_norm": 0.9274807913395638, + "learning_rate": 1.4632635862002811e-05, + "loss": 0.9736, + "step": 2160 + }, + { + "epoch": 0.36642645188639256, + "grad_norm": 0.9784510702615073, + "learning_rate": 1.4627767779003402e-05, + "loss": 0.9457, + "step": 2161 + }, + { + "epoch": 0.3665960152607037, + "grad_norm": 0.9469595398248766, + "learning_rate": 1.4622898300023867e-05, + "loss": 0.9603, + "step": 2162 + }, + { + "epoch": 0.36676557863501486, + "grad_norm": 0.9882823599670864, + "learning_rate": 1.4618027426533102e-05, + "loss": 0.9984, + "step": 2163 + }, + { + "epoch": 0.366935142009326, + "grad_norm": 1.0177975127886503, + "learning_rate": 1.4613155160000419e-05, + "loss": 0.9815, + "step": 2164 + }, + { + "epoch": 0.36710470538363715, + "grad_norm": 0.9443049685903396, + "learning_rate": 1.4608281501895551e-05, + "loss": 0.9402, + "step": 2165 + }, + { + "epoch": 0.3672742687579483, + "grad_norm": 0.9798713176768605, + "learning_rate": 1.4603406453688656e-05, + "loss": 1.0198, + "step": 2166 + }, + { + "epoch": 0.36744383213225945, + "grad_norm": 0.8925247576119087, + "learning_rate": 1.4598530016850302e-05, + "loss": 0.9402, + "step": 2167 + }, + { + "epoch": 0.3676133955065706, + "grad_norm": 0.9693608811871331, + "learning_rate": 1.4593652192851487e-05, + "loss": 0.9633, + "step": 2168 + }, + { + "epoch": 0.36778295888088175, + "grad_norm": 1.0082727665263245, + "learning_rate": 1.4588772983163612e-05, + "loss": 0.9751, + "step": 2169 + }, + { + "epoch": 0.36795252225519287, + "grad_norm": 0.9981943649764854, + "learning_rate": 1.4583892389258517e-05, + "loss": 1.0129, + "step": 2170 + }, + { + "epoch": 0.36812208562950405, + "grad_norm": 0.6188167302342693, + "learning_rate": 1.4579010412608439e-05, + "loss": 0.8494, + "step": 2171 + }, + { + "epoch": 0.36829164900381517, + "grad_norm": 1.043450576241928, + "learning_rate": 1.4574127054686042e-05, + "loss": 0.9476, + "step": 2172 + }, + { + "epoch": 0.36846121237812635, + "grad_norm": 0.9690414053579273, + "learning_rate": 1.4569242316964409e-05, + "loss": 1.002, + "step": 2173 + }, + { + "epoch": 0.36863077575243747, + "grad_norm": 0.9719526425967001, + "learning_rate": 1.4564356200917034e-05, + "loss": 0.9956, + "step": 2174 + }, + { + "epoch": 0.36880033912674864, + "grad_norm": 1.0251037372383711, + "learning_rate": 1.455946870801783e-05, + "loss": 0.953, + "step": 2175 + }, + { + "epoch": 0.36896990250105977, + "grad_norm": 0.9591895258561606, + "learning_rate": 1.455457983974112e-05, + "loss": 0.9768, + "step": 2176 + }, + { + "epoch": 0.36913946587537094, + "grad_norm": 0.9540812839945211, + "learning_rate": 1.4549689597561652e-05, + "loss": 0.9849, + "step": 2177 + }, + { + "epoch": 0.36930902924968206, + "grad_norm": 1.0099051775751366, + "learning_rate": 1.4544797982954578e-05, + "loss": 0.9813, + "step": 2178 + }, + { + "epoch": 0.36947859262399324, + "grad_norm": 1.0169533762607017, + "learning_rate": 1.4539904997395468e-05, + "loss": 1.0313, + "step": 2179 + }, + { + "epoch": 0.36964815599830436, + "grad_norm": 1.0086390668397849, + "learning_rate": 1.453501064236031e-05, + "loss": 1.0469, + "step": 2180 + }, + { + "epoch": 0.36981771937261554, + "grad_norm": 0.9480250100958773, + "learning_rate": 1.4530114919325498e-05, + "loss": 0.9424, + "step": 2181 + }, + { + "epoch": 0.36998728274692666, + "grad_norm": 1.0788100566379721, + "learning_rate": 1.4525217829767842e-05, + "loss": 1.03, + "step": 2182 + }, + { + "epoch": 0.37015684612123784, + "grad_norm": 0.9975613179969579, + "learning_rate": 1.452031937516456e-05, + "loss": 1.0289, + "step": 2183 + }, + { + "epoch": 0.37032640949554896, + "grad_norm": 0.976343456804483, + "learning_rate": 1.4515419556993293e-05, + "loss": 1.0006, + "step": 2184 + }, + { + "epoch": 0.37049597286986014, + "grad_norm": 1.017704153055686, + "learning_rate": 1.4510518376732081e-05, + "loss": 1.0047, + "step": 2185 + }, + { + "epoch": 0.37066553624417126, + "grad_norm": 0.9727329659497488, + "learning_rate": 1.4505615835859383e-05, + "loss": 1.0374, + "step": 2186 + }, + { + "epoch": 0.37083509961848243, + "grad_norm": 0.944014283399242, + "learning_rate": 1.4500711935854062e-05, + "loss": 0.9936, + "step": 2187 + }, + { + "epoch": 0.37100466299279355, + "grad_norm": 0.9063748235013017, + "learning_rate": 1.4495806678195394e-05, + "loss": 0.9684, + "step": 2188 + }, + { + "epoch": 0.37117422636710473, + "grad_norm": 1.0239544505780156, + "learning_rate": 1.449090006436307e-05, + "loss": 0.9892, + "step": 2189 + }, + { + "epoch": 0.37134378974141585, + "grad_norm": 0.6997375076476163, + "learning_rate": 1.4485992095837178e-05, + "loss": 0.8763, + "step": 2190 + }, + { + "epoch": 0.37151335311572703, + "grad_norm": 0.6403112627956185, + "learning_rate": 1.4481082774098227e-05, + "loss": 0.8054, + "step": 2191 + }, + { + "epoch": 0.37168291649003815, + "grad_norm": 0.9876180927042654, + "learning_rate": 1.4476172100627127e-05, + "loss": 1.0073, + "step": 2192 + }, + { + "epoch": 0.37185247986434933, + "grad_norm": 0.6198122269866572, + "learning_rate": 1.4471260076905198e-05, + "loss": 0.8235, + "step": 2193 + }, + { + "epoch": 0.37202204323866045, + "grad_norm": 1.0510590508083053, + "learning_rate": 1.4466346704414163e-05, + "loss": 1.0033, + "step": 2194 + }, + { + "epoch": 0.37219160661297157, + "grad_norm": 1.0165482454816739, + "learning_rate": 1.4461431984636158e-05, + "loss": 1.0139, + "step": 2195 + }, + { + "epoch": 0.37236116998728275, + "grad_norm": 1.0162435144249273, + "learning_rate": 1.4456515919053727e-05, + "loss": 0.9811, + "step": 2196 + }, + { + "epoch": 0.37253073336159387, + "grad_norm": 0.9846996795965636, + "learning_rate": 1.4451598509149809e-05, + "loss": 0.9821, + "step": 2197 + }, + { + "epoch": 0.37270029673590505, + "grad_norm": 0.9996938616630523, + "learning_rate": 1.444667975640776e-05, + "loss": 1.0183, + "step": 2198 + }, + { + "epoch": 0.37286986011021617, + "grad_norm": 0.9467980762814778, + "learning_rate": 1.4441759662311339e-05, + "loss": 0.9919, + "step": 2199 + }, + { + "epoch": 0.37303942348452734, + "grad_norm": 0.9737035329328998, + "learning_rate": 1.44368382283447e-05, + "loss": 0.98, + "step": 2200 + }, + { + "epoch": 0.37320898685883847, + "grad_norm": 1.0077057457004774, + "learning_rate": 1.4431915455992416e-05, + "loss": 1.0321, + "step": 2201 + }, + { + "epoch": 0.37337855023314964, + "grad_norm": 0.9505348512435962, + "learning_rate": 1.442699134673945e-05, + "loss": 0.9831, + "step": 2202 + }, + { + "epoch": 0.37354811360746076, + "grad_norm": 1.006625401724298, + "learning_rate": 1.4422065902071176e-05, + "loss": 0.9995, + "step": 2203 + }, + { + "epoch": 0.37371767698177194, + "grad_norm": 0.9094176063421371, + "learning_rate": 1.4417139123473373e-05, + "loss": 0.9731, + "step": 2204 + }, + { + "epoch": 0.37388724035608306, + "grad_norm": 0.9801924957415139, + "learning_rate": 1.4412211012432213e-05, + "loss": 1.0118, + "step": 2205 + }, + { + "epoch": 0.37405680373039424, + "grad_norm": 1.0180156669913525, + "learning_rate": 1.4407281570434282e-05, + "loss": 0.996, + "step": 2206 + }, + { + "epoch": 0.37422636710470536, + "grad_norm": 0.9510224613961906, + "learning_rate": 1.4402350798966556e-05, + "loss": 1.0211, + "step": 2207 + }, + { + "epoch": 0.37439593047901654, + "grad_norm": 0.9800709714823563, + "learning_rate": 1.4397418699516416e-05, + "loss": 0.9938, + "step": 2208 + }, + { + "epoch": 0.37456549385332766, + "grad_norm": 0.9262473822170325, + "learning_rate": 1.4392485273571652e-05, + "loss": 0.9429, + "step": 2209 + }, + { + "epoch": 0.37473505722763883, + "grad_norm": 0.9631492791663153, + "learning_rate": 1.4387550522620439e-05, + "loss": 0.9878, + "step": 2210 + }, + { + "epoch": 0.37490462060194996, + "grad_norm": 0.9454810573784534, + "learning_rate": 1.4382614448151365e-05, + "loss": 1.0237, + "step": 2211 + }, + { + "epoch": 0.37507418397626113, + "grad_norm": 0.9094834558068051, + "learning_rate": 1.4377677051653404e-05, + "loss": 0.992, + "step": 2212 + }, + { + "epoch": 0.37524374735057225, + "grad_norm": 0.9687674689041978, + "learning_rate": 1.4372738334615947e-05, + "loss": 1.0184, + "step": 2213 + }, + { + "epoch": 0.37541331072488343, + "grad_norm": 1.0144332635558633, + "learning_rate": 1.4367798298528767e-05, + "loss": 0.9686, + "step": 2214 + }, + { + "epoch": 0.37558287409919455, + "grad_norm": 0.9797829910202636, + "learning_rate": 1.4362856944882041e-05, + "loss": 0.9767, + "step": 2215 + }, + { + "epoch": 0.37575243747350573, + "grad_norm": 0.9784401979887182, + "learning_rate": 1.4357914275166344e-05, + "loss": 1.0161, + "step": 2216 + }, + { + "epoch": 0.37592200084781685, + "grad_norm": 0.9715303267227695, + "learning_rate": 1.435297029087265e-05, + "loss": 0.9703, + "step": 2217 + }, + { + "epoch": 0.376091564222128, + "grad_norm": 0.8117783404440864, + "learning_rate": 1.4348024993492323e-05, + "loss": 0.8807, + "step": 2218 + }, + { + "epoch": 0.37626112759643915, + "grad_norm": 0.9897415152435739, + "learning_rate": 1.4343078384517123e-05, + "loss": 1.0423, + "step": 2219 + }, + { + "epoch": 0.3764306909707503, + "grad_norm": 1.0569223100472929, + "learning_rate": 1.433813046543922e-05, + "loss": 1.0141, + "step": 2220 + }, + { + "epoch": 0.37660025434506145, + "grad_norm": 1.012977729852736, + "learning_rate": 1.4333181237751159e-05, + "loss": 0.9955, + "step": 2221 + }, + { + "epoch": 0.3767698177193726, + "grad_norm": 0.9575457575174791, + "learning_rate": 1.4328230702945897e-05, + "loss": 0.9711, + "step": 2222 + }, + { + "epoch": 0.37693938109368375, + "grad_norm": 0.9489275664556899, + "learning_rate": 1.4323278862516774e-05, + "loss": 0.9776, + "step": 2223 + }, + { + "epoch": 0.3771089444679949, + "grad_norm": 1.029675244642999, + "learning_rate": 1.4318325717957526e-05, + "loss": 0.9897, + "step": 2224 + }, + { + "epoch": 0.37727850784230604, + "grad_norm": 1.0343116868914228, + "learning_rate": 1.431337127076229e-05, + "loss": 1.0606, + "step": 2225 + }, + { + "epoch": 0.3774480712166172, + "grad_norm": 0.9397410886584606, + "learning_rate": 1.4308415522425581e-05, + "loss": 0.9878, + "step": 2226 + }, + { + "epoch": 0.37761763459092834, + "grad_norm": 0.9581605756963152, + "learning_rate": 1.4303458474442325e-05, + "loss": 0.9813, + "step": 2227 + }, + { + "epoch": 0.3777871979652395, + "grad_norm": 0.9692264001530689, + "learning_rate": 1.4298500128307821e-05, + "loss": 0.9848, + "step": 2228 + }, + { + "epoch": 0.37795676133955064, + "grad_norm": 1.0172621260051655, + "learning_rate": 1.4293540485517778e-05, + "loss": 1.0044, + "step": 2229 + }, + { + "epoch": 0.3781263247138618, + "grad_norm": 1.0416929080213642, + "learning_rate": 1.4288579547568279e-05, + "loss": 0.9983, + "step": 2230 + }, + { + "epoch": 0.37829588808817294, + "grad_norm": 0.9241512464027279, + "learning_rate": 1.4283617315955815e-05, + "loss": 0.9642, + "step": 2231 + }, + { + "epoch": 0.3784654514624841, + "grad_norm": 1.03148467610165, + "learning_rate": 1.4278653792177251e-05, + "loss": 1.0145, + "step": 2232 + }, + { + "epoch": 0.37863501483679524, + "grad_norm": 0.9313765166250695, + "learning_rate": 1.4273688977729852e-05, + "loss": 0.9589, + "step": 2233 + }, + { + "epoch": 0.3788045782111064, + "grad_norm": 0.9819203662463901, + "learning_rate": 1.4268722874111265e-05, + "loss": 0.9873, + "step": 2234 + }, + { + "epoch": 0.37897414158541753, + "grad_norm": 0.9574039197325604, + "learning_rate": 1.426375548281954e-05, + "loss": 1.0223, + "step": 2235 + }, + { + "epoch": 0.3791437049597287, + "grad_norm": 1.0218278931507878, + "learning_rate": 1.4258786805353095e-05, + "loss": 1.0137, + "step": 2236 + }, + { + "epoch": 0.37931326833403983, + "grad_norm": 1.016045779585104, + "learning_rate": 1.4253816843210751e-05, + "loss": 0.9828, + "step": 2237 + }, + { + "epoch": 0.379482831708351, + "grad_norm": 0.9735847159586615, + "learning_rate": 1.4248845597891712e-05, + "loss": 1.0052, + "step": 2238 + }, + { + "epoch": 0.37965239508266213, + "grad_norm": 0.9431836042880376, + "learning_rate": 1.4243873070895569e-05, + "loss": 0.9726, + "step": 2239 + }, + { + "epoch": 0.3798219584569733, + "grad_norm": 0.9845608542832943, + "learning_rate": 1.4238899263722301e-05, + "loss": 1.0038, + "step": 2240 + }, + { + "epoch": 0.37999152183128443, + "grad_norm": 1.0362904448017518, + "learning_rate": 1.4233924177872269e-05, + "loss": 1.0141, + "step": 2241 + }, + { + "epoch": 0.3801610852055956, + "grad_norm": 0.984863735245617, + "learning_rate": 1.4228947814846226e-05, + "loss": 0.99, + "step": 2242 + }, + { + "epoch": 0.3803306485799067, + "grad_norm": 0.9934261084736491, + "learning_rate": 1.4223970176145303e-05, + "loss": 1.0171, + "step": 2243 + }, + { + "epoch": 0.3805002119542179, + "grad_norm": 0.923604353066388, + "learning_rate": 1.4218991263271024e-05, + "loss": 0.9671, + "step": 2244 + }, + { + "epoch": 0.380669775328529, + "grad_norm": 0.9422331404867692, + "learning_rate": 1.4214011077725293e-05, + "loss": 0.9995, + "step": 2245 + }, + { + "epoch": 0.3808393387028402, + "grad_norm": 0.9844237079257445, + "learning_rate": 1.4209029621010393e-05, + "loss": 1.018, + "step": 2246 + }, + { + "epoch": 0.3810089020771513, + "grad_norm": 0.9877117202318388, + "learning_rate": 1.4204046894629002e-05, + "loss": 0.9497, + "step": 2247 + }, + { + "epoch": 0.3811784654514625, + "grad_norm": 0.9540860994891954, + "learning_rate": 1.4199062900084168e-05, + "loss": 1.0166, + "step": 2248 + }, + { + "epoch": 0.3813480288257736, + "grad_norm": 0.9882145090140241, + "learning_rate": 1.4194077638879333e-05, + "loss": 0.9702, + "step": 2249 + }, + { + "epoch": 0.3815175922000848, + "grad_norm": 0.9348298828640021, + "learning_rate": 1.4189091112518311e-05, + "loss": 0.9683, + "step": 2250 + }, + { + "epoch": 0.3816871555743959, + "grad_norm": 0.9961979049149148, + "learning_rate": 1.4184103322505311e-05, + "loss": 1.0026, + "step": 2251 + }, + { + "epoch": 0.3818567189487071, + "grad_norm": 1.0033325432211073, + "learning_rate": 1.4179114270344907e-05, + "loss": 1.0179, + "step": 2252 + }, + { + "epoch": 0.3820262823230182, + "grad_norm": 0.9892249197024652, + "learning_rate": 1.4174123957542065e-05, + "loss": 0.9916, + "step": 2253 + }, + { + "epoch": 0.3821958456973294, + "grad_norm": 0.9983865401910622, + "learning_rate": 1.4169132385602129e-05, + "loss": 1.0073, + "step": 2254 + }, + { + "epoch": 0.3823654090716405, + "grad_norm": 0.9590033385672899, + "learning_rate": 1.4164139556030818e-05, + "loss": 0.9712, + "step": 2255 + }, + { + "epoch": 0.3825349724459517, + "grad_norm": 0.9695565529420129, + "learning_rate": 1.4159145470334237e-05, + "loss": 0.9751, + "step": 2256 + }, + { + "epoch": 0.3827045358202628, + "grad_norm": 0.9910308237044819, + "learning_rate": 1.4154150130018867e-05, + "loss": 0.9774, + "step": 2257 + }, + { + "epoch": 0.382874099194574, + "grad_norm": 0.9557961258187446, + "learning_rate": 1.4149153536591565e-05, + "loss": 0.9551, + "step": 2258 + }, + { + "epoch": 0.3830436625688851, + "grad_norm": 0.9970827490185687, + "learning_rate": 1.4144155691559571e-05, + "loss": 1.0184, + "step": 2259 + }, + { + "epoch": 0.3832132259431963, + "grad_norm": 0.9891074259573283, + "learning_rate": 1.4139156596430501e-05, + "loss": 0.9608, + "step": 2260 + }, + { + "epoch": 0.3833827893175074, + "grad_norm": 1.0594408406629534, + "learning_rate": 1.4134156252712343e-05, + "loss": 0.9889, + "step": 2261 + }, + { + "epoch": 0.3835523526918186, + "grad_norm": 1.1090230365809288, + "learning_rate": 1.412915466191347e-05, + "loss": 0.9771, + "step": 2262 + }, + { + "epoch": 0.3837219160661297, + "grad_norm": 1.0212738242529205, + "learning_rate": 1.4124151825542627e-05, + "loss": 1.0285, + "step": 2263 + }, + { + "epoch": 0.3838914794404409, + "grad_norm": 1.004476177615105, + "learning_rate": 1.411914774510893e-05, + "loss": 1.0025, + "step": 2264 + }, + { + "epoch": 0.384061042814752, + "grad_norm": 0.9617465797965565, + "learning_rate": 1.4114142422121879e-05, + "loss": 1.0129, + "step": 2265 + }, + { + "epoch": 0.3842306061890632, + "grad_norm": 0.9467418934625155, + "learning_rate": 1.4109135858091344e-05, + "loss": 1.0114, + "step": 2266 + }, + { + "epoch": 0.3844001695633743, + "grad_norm": 0.9635988950957718, + "learning_rate": 1.410412805452757e-05, + "loss": 0.9708, + "step": 2267 + }, + { + "epoch": 0.3845697329376855, + "grad_norm": 0.9503973525471091, + "learning_rate": 1.4099119012941173e-05, + "loss": 0.9677, + "step": 2268 + }, + { + "epoch": 0.3847392963119966, + "grad_norm": 1.0411407504075734, + "learning_rate": 1.4094108734843155e-05, + "loss": 1.0432, + "step": 2269 + }, + { + "epoch": 0.3849088596863078, + "grad_norm": 1.0373513596907145, + "learning_rate": 1.408909722174487e-05, + "loss": 0.9908, + "step": 2270 + }, + { + "epoch": 0.3850784230606189, + "grad_norm": 0.9424784639599362, + "learning_rate": 1.4084084475158062e-05, + "loss": 1.0313, + "step": 2271 + }, + { + "epoch": 0.3852479864349301, + "grad_norm": 1.0223250393083914, + "learning_rate": 1.407907049659484e-05, + "loss": 0.9826, + "step": 2272 + }, + { + "epoch": 0.3854175498092412, + "grad_norm": 1.0539912645044094, + "learning_rate": 1.4074055287567685e-05, + "loss": 0.975, + "step": 2273 + }, + { + "epoch": 0.3855871131835524, + "grad_norm": 0.9603510715083279, + "learning_rate": 1.4069038849589456e-05, + "loss": 0.9522, + "step": 2274 + }, + { + "epoch": 0.3857566765578635, + "grad_norm": 1.0206219997121104, + "learning_rate": 1.4064021184173364e-05, + "loss": 0.9982, + "step": 2275 + }, + { + "epoch": 0.3859262399321747, + "grad_norm": 0.9640249416543981, + "learning_rate": 1.4059002292833018e-05, + "loss": 0.9958, + "step": 2276 + }, + { + "epoch": 0.3860958033064858, + "grad_norm": 1.0024036919620214, + "learning_rate": 1.4053982177082369e-05, + "loss": 0.9935, + "step": 2277 + }, + { + "epoch": 0.386265366680797, + "grad_norm": 1.0034321220658353, + "learning_rate": 1.4048960838435755e-05, + "loss": 1.0057, + "step": 2278 + }, + { + "epoch": 0.3864349300551081, + "grad_norm": 0.9827904519728128, + "learning_rate": 1.404393827840788e-05, + "loss": 1.0133, + "step": 2279 + }, + { + "epoch": 0.38660449342941927, + "grad_norm": 0.9676678752987752, + "learning_rate": 1.4038914498513813e-05, + "loss": 0.9639, + "step": 2280 + }, + { + "epoch": 0.3867740568037304, + "grad_norm": 0.9131314769165926, + "learning_rate": 1.4033889500268991e-05, + "loss": 0.9869, + "step": 2281 + }, + { + "epoch": 0.38694362017804157, + "grad_norm": 0.9614336932564229, + "learning_rate": 1.4028863285189225e-05, + "loss": 1.0046, + "step": 2282 + }, + { + "epoch": 0.3871131835523527, + "grad_norm": 0.9841388849049107, + "learning_rate": 1.4023835854790682e-05, + "loss": 1.0021, + "step": 2283 + }, + { + "epoch": 0.38728274692666387, + "grad_norm": 1.0003572069132323, + "learning_rate": 1.40188072105899e-05, + "loss": 1.0084, + "step": 2284 + }, + { + "epoch": 0.387452310300975, + "grad_norm": 1.0110159730401558, + "learning_rate": 1.401377735410379e-05, + "loss": 1.0084, + "step": 2285 + }, + { + "epoch": 0.3876218736752861, + "grad_norm": 0.9927037926213683, + "learning_rate": 1.4008746286849621e-05, + "loss": 0.9857, + "step": 2286 + }, + { + "epoch": 0.3877914370495973, + "grad_norm": 1.0249747117143282, + "learning_rate": 1.4003714010345031e-05, + "loss": 0.9924, + "step": 2287 + }, + { + "epoch": 0.3879610004239084, + "grad_norm": 0.9410591169179235, + "learning_rate": 1.3998680526108022e-05, + "loss": 0.9969, + "step": 2288 + }, + { + "epoch": 0.3881305637982196, + "grad_norm": 0.9738227364855141, + "learning_rate": 1.3993645835656955e-05, + "loss": 0.9908, + "step": 2289 + }, + { + "epoch": 0.3883001271725307, + "grad_norm": 1.0674994018153947, + "learning_rate": 1.3988609940510566e-05, + "loss": 0.9943, + "step": 2290 + }, + { + "epoch": 0.3884696905468419, + "grad_norm": 0.963425959620499, + "learning_rate": 1.3983572842187945e-05, + "loss": 1.0015, + "step": 2291 + }, + { + "epoch": 0.388639253921153, + "grad_norm": 0.9330659557033109, + "learning_rate": 1.3978534542208549e-05, + "loss": 0.9458, + "step": 2292 + }, + { + "epoch": 0.3888088172954642, + "grad_norm": 1.000908880595011, + "learning_rate": 1.3973495042092192e-05, + "loss": 1.0249, + "step": 2293 + }, + { + "epoch": 0.3889783806697753, + "grad_norm": 1.0122951520039227, + "learning_rate": 1.3968454343359057e-05, + "loss": 0.9754, + "step": 2294 + }, + { + "epoch": 0.3891479440440865, + "grad_norm": 0.9546805852584749, + "learning_rate": 1.3963412447529687e-05, + "loss": 0.9701, + "step": 2295 + }, + { + "epoch": 0.3893175074183976, + "grad_norm": 0.9300815730903562, + "learning_rate": 1.3958369356124986e-05, + "loss": 0.9654, + "step": 2296 + }, + { + "epoch": 0.3894870707927088, + "grad_norm": 1.0062499873061528, + "learning_rate": 1.3953325070666215e-05, + "loss": 1.0035, + "step": 2297 + }, + { + "epoch": 0.3896566341670199, + "grad_norm": 0.9767582937259379, + "learning_rate": 1.3948279592675e-05, + "loss": 0.9764, + "step": 2298 + }, + { + "epoch": 0.3898261975413311, + "grad_norm": 0.9901494524437389, + "learning_rate": 1.3943232923673327e-05, + "loss": 0.9757, + "step": 2299 + }, + { + "epoch": 0.3899957609156422, + "grad_norm": 0.950968865304944, + "learning_rate": 1.3938185065183534e-05, + "loss": 0.9618, + "step": 2300 + }, + { + "epoch": 0.3901653242899534, + "grad_norm": 0.9572536218868889, + "learning_rate": 1.3933136018728324e-05, + "loss": 0.9993, + "step": 2301 + }, + { + "epoch": 0.3903348876642645, + "grad_norm": 1.0497177292160054, + "learning_rate": 1.3928085785830758e-05, + "loss": 0.9822, + "step": 2302 + }, + { + "epoch": 0.3905044510385757, + "grad_norm": 0.9369346921670006, + "learning_rate": 1.3923034368014254e-05, + "loss": 0.9512, + "step": 2303 + }, + { + "epoch": 0.3906740144128868, + "grad_norm": 0.9690635693058144, + "learning_rate": 1.3917981766802585e-05, + "loss": 0.9786, + "step": 2304 + }, + { + "epoch": 0.39084357778719797, + "grad_norm": 0.9969530028584265, + "learning_rate": 1.3912927983719888e-05, + "loss": 0.994, + "step": 2305 + }, + { + "epoch": 0.3910131411615091, + "grad_norm": 0.9270820900469808, + "learning_rate": 1.3907873020290653e-05, + "loss": 0.9942, + "step": 2306 + }, + { + "epoch": 0.39118270453582027, + "grad_norm": 0.9129685147582283, + "learning_rate": 1.3902816878039715e-05, + "loss": 0.9585, + "step": 2307 + }, + { + "epoch": 0.3913522679101314, + "grad_norm": 1.0004351454247427, + "learning_rate": 1.3897759558492286e-05, + "loss": 0.9886, + "step": 2308 + }, + { + "epoch": 0.39152183128444257, + "grad_norm": 0.9440282904317703, + "learning_rate": 1.3892701063173917e-05, + "loss": 0.9917, + "step": 2309 + }, + { + "epoch": 0.3916913946587537, + "grad_norm": 0.9880527346461876, + "learning_rate": 1.3887641393610518e-05, + "loss": 1.0319, + "step": 2310 + }, + { + "epoch": 0.39186095803306487, + "grad_norm": 0.9337751572129022, + "learning_rate": 1.388258055132835e-05, + "loss": 1.0158, + "step": 2311 + }, + { + "epoch": 0.392030521407376, + "grad_norm": 1.020068420244944, + "learning_rate": 1.387751853785404e-05, + "loss": 1.0426, + "step": 2312 + }, + { + "epoch": 0.39220008478168716, + "grad_norm": 0.9463995407653231, + "learning_rate": 1.3872455354714552e-05, + "loss": 0.9785, + "step": 2313 + }, + { + "epoch": 0.3923696481559983, + "grad_norm": 0.7160668602145898, + "learning_rate": 1.3867391003437213e-05, + "loss": 0.8938, + "step": 2314 + }, + { + "epoch": 0.39253921153030946, + "grad_norm": 0.9751857309982119, + "learning_rate": 1.3862325485549702e-05, + "loss": 1.0085, + "step": 2315 + }, + { + "epoch": 0.3927087749046206, + "grad_norm": 0.9297358961999248, + "learning_rate": 1.3857258802580045e-05, + "loss": 0.9816, + "step": 2316 + }, + { + "epoch": 0.39287833827893176, + "grad_norm": 0.9910030691848873, + "learning_rate": 1.3852190956056623e-05, + "loss": 1.0043, + "step": 2317 + }, + { + "epoch": 0.3930479016532429, + "grad_norm": 0.6138639273144628, + "learning_rate": 1.384712194750817e-05, + "loss": 0.8091, + "step": 2318 + }, + { + "epoch": 0.39321746502755406, + "grad_norm": 0.9627117654545204, + "learning_rate": 1.3842051778463765e-05, + "loss": 1.004, + "step": 2319 + }, + { + "epoch": 0.3933870284018652, + "grad_norm": 0.9633553273816692, + "learning_rate": 1.3836980450452836e-05, + "loss": 1.0157, + "step": 2320 + }, + { + "epoch": 0.39355659177617636, + "grad_norm": 0.9717303316148921, + "learning_rate": 1.3831907965005173e-05, + "loss": 1.0218, + "step": 2321 + }, + { + "epoch": 0.3937261551504875, + "grad_norm": 0.9880904793965707, + "learning_rate": 1.3826834323650899e-05, + "loss": 1.0018, + "step": 2322 + }, + { + "epoch": 0.39389571852479865, + "grad_norm": 0.9787174398100366, + "learning_rate": 1.3821759527920496e-05, + "loss": 0.9629, + "step": 2323 + }, + { + "epoch": 0.3940652818991098, + "grad_norm": 0.965613579832241, + "learning_rate": 1.3816683579344794e-05, + "loss": 0.9996, + "step": 2324 + }, + { + "epoch": 0.39423484527342095, + "grad_norm": 0.9429723518525117, + "learning_rate": 1.3811606479454961e-05, + "loss": 0.9358, + "step": 2325 + }, + { + "epoch": 0.3944044086477321, + "grad_norm": 0.9385873522805684, + "learning_rate": 1.380652822978253e-05, + "loss": 0.9991, + "step": 2326 + }, + { + "epoch": 0.39457397202204325, + "grad_norm": 0.9761246085993821, + "learning_rate": 1.3801448831859363e-05, + "loss": 1.0179, + "step": 2327 + }, + { + "epoch": 0.3947435353963544, + "grad_norm": 0.9555085872264782, + "learning_rate": 1.3796368287217678e-05, + "loss": 0.9456, + "step": 2328 + }, + { + "epoch": 0.39491309877066555, + "grad_norm": 0.9521487829442625, + "learning_rate": 1.3791286597390035e-05, + "loss": 0.9577, + "step": 2329 + }, + { + "epoch": 0.39508266214497667, + "grad_norm": 0.9959551070660362, + "learning_rate": 1.3786203763909342e-05, + "loss": 1.0182, + "step": 2330 + }, + { + "epoch": 0.39525222551928785, + "grad_norm": 1.0391903131074671, + "learning_rate": 1.378111978830885e-05, + "loss": 1.0088, + "step": 2331 + }, + { + "epoch": 0.39542178889359897, + "grad_norm": 1.0557011150279045, + "learning_rate": 1.3776034672122158e-05, + "loss": 1.0137, + "step": 2332 + }, + { + "epoch": 0.39559135226791015, + "grad_norm": 0.7014278160263907, + "learning_rate": 1.3770948416883205e-05, + "loss": 0.8679, + "step": 2333 + }, + { + "epoch": 0.39576091564222127, + "grad_norm": 0.9675665903239292, + "learning_rate": 1.3765861024126275e-05, + "loss": 0.9765, + "step": 2334 + }, + { + "epoch": 0.39593047901653244, + "grad_norm": 0.9943169198576736, + "learning_rate": 1.3760772495385998e-05, + "loss": 1.0241, + "step": 2335 + }, + { + "epoch": 0.39610004239084357, + "grad_norm": 0.9699082319319825, + "learning_rate": 1.3755682832197343e-05, + "loss": 0.9749, + "step": 2336 + }, + { + "epoch": 0.39626960576515474, + "grad_norm": 0.6440441366656101, + "learning_rate": 1.375059203609562e-05, + "loss": 0.916, + "step": 2337 + }, + { + "epoch": 0.39643916913946586, + "grad_norm": 0.9688385963827819, + "learning_rate": 1.3745500108616482e-05, + "loss": 0.946, + "step": 2338 + }, + { + "epoch": 0.39660873251377704, + "grad_norm": 0.9552298362122215, + "learning_rate": 1.3740407051295931e-05, + "loss": 0.9592, + "step": 2339 + }, + { + "epoch": 0.39677829588808816, + "grad_norm": 0.9513457224844829, + "learning_rate": 1.3735312865670296e-05, + "loss": 0.9852, + "step": 2340 + }, + { + "epoch": 0.39694785926239934, + "grad_norm": 0.9298622550155837, + "learning_rate": 1.3730217553276257e-05, + "loss": 0.9872, + "step": 2341 + }, + { + "epoch": 0.39711742263671046, + "grad_norm": 0.93478486799767, + "learning_rate": 1.372512111565083e-05, + "loss": 1.0183, + "step": 2342 + }, + { + "epoch": 0.39728698601102164, + "grad_norm": 0.9326465792693526, + "learning_rate": 1.372002355433137e-05, + "loss": 0.9716, + "step": 2343 + }, + { + "epoch": 0.39745654938533276, + "grad_norm": 0.941124361783271, + "learning_rate": 1.3714924870855573e-05, + "loss": 0.9615, + "step": 2344 + }, + { + "epoch": 0.39762611275964393, + "grad_norm": 0.9596761856573279, + "learning_rate": 1.370982506676147e-05, + "loss": 1.0, + "step": 2345 + }, + { + "epoch": 0.39779567613395506, + "grad_norm": 0.9480477059089689, + "learning_rate": 1.3704724143587438e-05, + "loss": 0.9776, + "step": 2346 + }, + { + "epoch": 0.39796523950826623, + "grad_norm": 0.9673078521470359, + "learning_rate": 1.3699622102872177e-05, + "loss": 0.9883, + "step": 2347 + }, + { + "epoch": 0.39813480288257735, + "grad_norm": 0.9476460109969874, + "learning_rate": 1.369451894615474e-05, + "loss": 0.9951, + "step": 2348 + }, + { + "epoch": 0.39830436625688853, + "grad_norm": 0.9250379836292365, + "learning_rate": 1.3689414674974506e-05, + "loss": 1.027, + "step": 2349 + }, + { + "epoch": 0.39847392963119965, + "grad_norm": 0.9764037045224021, + "learning_rate": 1.3684309290871194e-05, + "loss": 0.9903, + "step": 2350 + }, + { + "epoch": 0.39864349300551083, + "grad_norm": 0.9227253413425622, + "learning_rate": 1.3679202795384862e-05, + "loss": 0.9733, + "step": 2351 + }, + { + "epoch": 0.39881305637982195, + "grad_norm": 1.0046961541746833, + "learning_rate": 1.3674095190055895e-05, + "loss": 0.9933, + "step": 2352 + }, + { + "epoch": 0.3989826197541331, + "grad_norm": 0.9835904674618511, + "learning_rate": 1.3668986476425024e-05, + "loss": 0.9771, + "step": 2353 + }, + { + "epoch": 0.39915218312844425, + "grad_norm": 0.9753961263363442, + "learning_rate": 1.3663876656033303e-05, + "loss": 0.9762, + "step": 2354 + }, + { + "epoch": 0.3993217465027554, + "grad_norm": 0.9703335688555745, + "learning_rate": 1.3658765730422126e-05, + "loss": 0.9805, + "step": 2355 + }, + { + "epoch": 0.39949130987706655, + "grad_norm": 0.9852702932838664, + "learning_rate": 1.3653653701133215e-05, + "loss": 0.9964, + "step": 2356 + }, + { + "epoch": 0.3996608732513777, + "grad_norm": 0.9437667158938094, + "learning_rate": 1.3648540569708637e-05, + "loss": 0.9824, + "step": 2357 + }, + { + "epoch": 0.39983043662568885, + "grad_norm": 0.9688775348783184, + "learning_rate": 1.3643426337690776e-05, + "loss": 0.9806, + "step": 2358 + }, + { + "epoch": 0.4, + "grad_norm": 0.6782503039549057, + "learning_rate": 1.3638311006622357e-05, + "loss": 0.8288, + "step": 2359 + }, + { + "epoch": 0.40016956337431114, + "grad_norm": 0.9725599257213288, + "learning_rate": 1.3633194578046443e-05, + "loss": 1.0057, + "step": 2360 + }, + { + "epoch": 0.4003391267486223, + "grad_norm": 0.9370621842535285, + "learning_rate": 1.362807705350641e-05, + "loss": 1.0252, + "step": 2361 + }, + { + "epoch": 0.40050869012293344, + "grad_norm": 0.9611046417496658, + "learning_rate": 1.3622958434545983e-05, + "loss": 1.0411, + "step": 2362 + }, + { + "epoch": 0.4006782534972446, + "grad_norm": 0.9589236347776622, + "learning_rate": 1.3617838722709203e-05, + "loss": 0.9837, + "step": 2363 + }, + { + "epoch": 0.40084781687155574, + "grad_norm": 1.0071145149316638, + "learning_rate": 1.3612717919540446e-05, + "loss": 0.9515, + "step": 2364 + }, + { + "epoch": 0.4010173802458669, + "grad_norm": 0.9278981149898994, + "learning_rate": 1.3607596026584423e-05, + "loss": 0.9549, + "step": 2365 + }, + { + "epoch": 0.40118694362017804, + "grad_norm": 0.9919482431933654, + "learning_rate": 1.3602473045386165e-05, + "loss": 1.0008, + "step": 2366 + }, + { + "epoch": 0.4013565069944892, + "grad_norm": 0.9473502431654154, + "learning_rate": 1.3597348977491031e-05, + "loss": 0.9615, + "step": 2367 + }, + { + "epoch": 0.40152607036880034, + "grad_norm": 0.9745121739828702, + "learning_rate": 1.3592223824444716e-05, + "loss": 0.9665, + "step": 2368 + }, + { + "epoch": 0.4016956337431115, + "grad_norm": 0.9669094116206499, + "learning_rate": 1.3587097587793243e-05, + "loss": 1.0231, + "step": 2369 + }, + { + "epoch": 0.40186519711742263, + "grad_norm": 0.9682205650910038, + "learning_rate": 1.3581970269082948e-05, + "loss": 0.9922, + "step": 2370 + }, + { + "epoch": 0.4020347604917338, + "grad_norm": 0.9530206159840949, + "learning_rate": 1.3576841869860506e-05, + "loss": 0.9706, + "step": 2371 + }, + { + "epoch": 0.40220432386604493, + "grad_norm": 0.9870412917112557, + "learning_rate": 1.3571712391672916e-05, + "loss": 1.0111, + "step": 2372 + }, + { + "epoch": 0.4023738872403561, + "grad_norm": 0.9267932252121701, + "learning_rate": 1.3566581836067495e-05, + "loss": 0.9459, + "step": 2373 + }, + { + "epoch": 0.40254345061466723, + "grad_norm": 0.9860442732113808, + "learning_rate": 1.3561450204591898e-05, + "loss": 0.9925, + "step": 2374 + }, + { + "epoch": 0.4027130139889784, + "grad_norm": 0.9869017207559949, + "learning_rate": 1.3556317498794086e-05, + "loss": 1.0021, + "step": 2375 + }, + { + "epoch": 0.40288257736328953, + "grad_norm": 0.9491023007688814, + "learning_rate": 1.355118372022237e-05, + "loss": 0.9882, + "step": 2376 + }, + { + "epoch": 0.4030521407376007, + "grad_norm": 1.0022711307777008, + "learning_rate": 1.3546048870425356e-05, + "loss": 0.9684, + "step": 2377 + }, + { + "epoch": 0.4032217041119118, + "grad_norm": 0.9599343764083719, + "learning_rate": 1.3540912950951998e-05, + "loss": 1.0161, + "step": 2378 + }, + { + "epoch": 0.40339126748622295, + "grad_norm": 0.9608963061237126, + "learning_rate": 1.3535775963351552e-05, + "loss": 0.9945, + "step": 2379 + }, + { + "epoch": 0.4035608308605341, + "grad_norm": 1.0126554227041162, + "learning_rate": 1.3530637909173614e-05, + "loss": 1.0129, + "step": 2380 + }, + { + "epoch": 0.40373039423484525, + "grad_norm": 0.9510465833450211, + "learning_rate": 1.3525498789968088e-05, + "loss": 1.0069, + "step": 2381 + }, + { + "epoch": 0.4038999576091564, + "grad_norm": 0.9662505541085203, + "learning_rate": 1.3520358607285208e-05, + "loss": 0.9786, + "step": 2382 + }, + { + "epoch": 0.40406952098346754, + "grad_norm": 0.9358741434520518, + "learning_rate": 1.3515217362675524e-05, + "loss": 0.9834, + "step": 2383 + }, + { + "epoch": 0.4042390843577787, + "grad_norm": 0.9447549474330913, + "learning_rate": 1.3510075057689906e-05, + "loss": 0.9826, + "step": 2384 + }, + { + "epoch": 0.40440864773208984, + "grad_norm": 0.9754158889648276, + "learning_rate": 1.3504931693879553e-05, + "loss": 0.9733, + "step": 2385 + }, + { + "epoch": 0.404578211106401, + "grad_norm": 0.9438735781819981, + "learning_rate": 1.3499787272795968e-05, + "loss": 0.9905, + "step": 2386 + }, + { + "epoch": 0.40474777448071214, + "grad_norm": 0.9827473939629118, + "learning_rate": 1.3494641795990986e-05, + "loss": 0.9655, + "step": 2387 + }, + { + "epoch": 0.4049173378550233, + "grad_norm": 0.9383232463399549, + "learning_rate": 1.3489495265016753e-05, + "loss": 1.006, + "step": 2388 + }, + { + "epoch": 0.40508690122933444, + "grad_norm": 0.9758535360802354, + "learning_rate": 1.3484347681425739e-05, + "loss": 0.9831, + "step": 2389 + }, + { + "epoch": 0.4052564646036456, + "grad_norm": 0.9825369535616311, + "learning_rate": 1.3479199046770722e-05, + "loss": 0.9735, + "step": 2390 + }, + { + "epoch": 0.40542602797795674, + "grad_norm": 0.9649485719338737, + "learning_rate": 1.3474049362604809e-05, + "loss": 1.001, + "step": 2391 + }, + { + "epoch": 0.4055955913522679, + "grad_norm": 1.0015840462951928, + "learning_rate": 1.3468898630481417e-05, + "loss": 1.0181, + "step": 2392 + }, + { + "epoch": 0.40576515472657904, + "grad_norm": 0.9838667856306663, + "learning_rate": 1.3463746851954275e-05, + "loss": 1.0095, + "step": 2393 + }, + { + "epoch": 0.4059347181008902, + "grad_norm": 0.9270255123873952, + "learning_rate": 1.3458594028577444e-05, + "loss": 0.955, + "step": 2394 + }, + { + "epoch": 0.40610428147520133, + "grad_norm": 0.9342560383409726, + "learning_rate": 1.3453440161905274e-05, + "loss": 0.9467, + "step": 2395 + }, + { + "epoch": 0.4062738448495125, + "grad_norm": 0.9815254745927546, + "learning_rate": 1.3448285253492455e-05, + "loss": 1.0145, + "step": 2396 + }, + { + "epoch": 0.40644340822382363, + "grad_norm": 0.9442279755898566, + "learning_rate": 1.3443129304893974e-05, + "loss": 0.9324, + "step": 2397 + }, + { + "epoch": 0.4066129715981348, + "grad_norm": 1.0353538609136759, + "learning_rate": 1.3437972317665144e-05, + "loss": 1.0014, + "step": 2398 + }, + { + "epoch": 0.40678253497244593, + "grad_norm": 0.9865215179030016, + "learning_rate": 1.3432814293361585e-05, + "loss": 0.9297, + "step": 2399 + }, + { + "epoch": 0.4069520983467571, + "grad_norm": 0.9825846108050071, + "learning_rate": 1.3427655233539227e-05, + "loss": 0.9928, + "step": 2400 + }, + { + "epoch": 0.40712166172106823, + "grad_norm": 0.9625004527519626, + "learning_rate": 1.342249513975432e-05, + "loss": 0.9887, + "step": 2401 + }, + { + "epoch": 0.4072912250953794, + "grad_norm": 0.9481900000656314, + "learning_rate": 1.3417334013563417e-05, + "loss": 0.9537, + "step": 2402 + }, + { + "epoch": 0.4074607884696905, + "grad_norm": 0.9762642004668662, + "learning_rate": 1.3412171856523393e-05, + "loss": 0.9828, + "step": 2403 + }, + { + "epoch": 0.4076303518440017, + "grad_norm": 0.9813760903086608, + "learning_rate": 1.3407008670191422e-05, + "loss": 1.0018, + "step": 2404 + }, + { + "epoch": 0.4077999152183128, + "grad_norm": 0.9411898952849121, + "learning_rate": 1.3401844456125002e-05, + "loss": 0.9346, + "step": 2405 + }, + { + "epoch": 0.407969478592624, + "grad_norm": 1.0237953771190902, + "learning_rate": 1.3396679215881924e-05, + "loss": 1.0019, + "step": 2406 + }, + { + "epoch": 0.4081390419669351, + "grad_norm": 0.6681021351925773, + "learning_rate": 1.339151295102031e-05, + "loss": 0.8287, + "step": 2407 + }, + { + "epoch": 0.4083086053412463, + "grad_norm": 0.9659366012451951, + "learning_rate": 1.3386345663098573e-05, + "loss": 0.9598, + "step": 2408 + }, + { + "epoch": 0.4084781687155574, + "grad_norm": 0.9986445473341954, + "learning_rate": 1.3381177353675441e-05, + "loss": 1.0035, + "step": 2409 + }, + { + "epoch": 0.4086477320898686, + "grad_norm": 0.9812635937885128, + "learning_rate": 1.337600802430995e-05, + "loss": 0.9823, + "step": 2410 + }, + { + "epoch": 0.4088172954641797, + "grad_norm": 0.9990223069801192, + "learning_rate": 1.3370837676561443e-05, + "loss": 0.9658, + "step": 2411 + }, + { + "epoch": 0.4089868588384909, + "grad_norm": 0.9546358726849846, + "learning_rate": 1.3365666311989579e-05, + "loss": 0.9634, + "step": 2412 + }, + { + "epoch": 0.409156422212802, + "grad_norm": 0.9867318766504926, + "learning_rate": 1.3360493932154301e-05, + "loss": 0.9645, + "step": 2413 + }, + { + "epoch": 0.4093259855871132, + "grad_norm": 0.9907507522666065, + "learning_rate": 1.3355320538615888e-05, + "loss": 1.0248, + "step": 2414 + }, + { + "epoch": 0.4094955489614243, + "grad_norm": 0.9541243441659322, + "learning_rate": 1.33501461329349e-05, + "loss": 0.9904, + "step": 2415 + }, + { + "epoch": 0.4096651123357355, + "grad_norm": 0.9868115088638221, + "learning_rate": 1.3344970716672217e-05, + "loss": 0.9936, + "step": 2416 + }, + { + "epoch": 0.4098346757100466, + "grad_norm": 0.9912650716851038, + "learning_rate": 1.3339794291389015e-05, + "loss": 0.9875, + "step": 2417 + }, + { + "epoch": 0.4100042390843578, + "grad_norm": 1.0052604483444878, + "learning_rate": 1.3334616858646783e-05, + "loss": 1.015, + "step": 2418 + }, + { + "epoch": 0.4101738024586689, + "grad_norm": 0.9746475443318042, + "learning_rate": 1.3329438420007306e-05, + "loss": 1.0029, + "step": 2419 + }, + { + "epoch": 0.4103433658329801, + "grad_norm": 1.024861447866936, + "learning_rate": 1.3324258977032673e-05, + "loss": 0.9888, + "step": 2420 + }, + { + "epoch": 0.4105129292072912, + "grad_norm": 0.9606946019788624, + "learning_rate": 1.3319078531285286e-05, + "loss": 1.0007, + "step": 2421 + }, + { + "epoch": 0.4106824925816024, + "grad_norm": 0.9567716672709103, + "learning_rate": 1.3313897084327835e-05, + "loss": 0.9795, + "step": 2422 + }, + { + "epoch": 0.4108520559559135, + "grad_norm": 0.9558345382828642, + "learning_rate": 1.3308714637723325e-05, + "loss": 0.996, + "step": 2423 + }, + { + "epoch": 0.4110216193302247, + "grad_norm": 0.9660114983788803, + "learning_rate": 1.3303531193035053e-05, + "loss": 0.965, + "step": 2424 + }, + { + "epoch": 0.4111911827045358, + "grad_norm": 0.9651912507080008, + "learning_rate": 1.3298346751826624e-05, + "loss": 0.9811, + "step": 2425 + }, + { + "epoch": 0.411360746078847, + "grad_norm": 0.9551552558269004, + "learning_rate": 1.3293161315661934e-05, + "loss": 0.968, + "step": 2426 + }, + { + "epoch": 0.4115303094531581, + "grad_norm": 0.9955117454894147, + "learning_rate": 1.328797488610519e-05, + "loss": 1.0007, + "step": 2427 + }, + { + "epoch": 0.4116998728274693, + "grad_norm": 0.9635515621972542, + "learning_rate": 1.3282787464720897e-05, + "loss": 0.96, + "step": 2428 + }, + { + "epoch": 0.4118694362017804, + "grad_norm": 0.99811931315809, + "learning_rate": 1.3277599053073848e-05, + "loss": 0.9719, + "step": 2429 + }, + { + "epoch": 0.4120389995760916, + "grad_norm": 0.9452590616724025, + "learning_rate": 1.3272409652729152e-05, + "loss": 0.9541, + "step": 2430 + }, + { + "epoch": 0.4122085629504027, + "grad_norm": 0.9680828373453652, + "learning_rate": 1.3267219265252202e-05, + "loss": 0.9933, + "step": 2431 + }, + { + "epoch": 0.4123781263247139, + "grad_norm": 0.7458563099691555, + "learning_rate": 1.3262027892208696e-05, + "loss": 0.9038, + "step": 2432 + }, + { + "epoch": 0.412547689699025, + "grad_norm": 0.9523866219627274, + "learning_rate": 1.3256835535164622e-05, + "loss": 0.9705, + "step": 2433 + }, + { + "epoch": 0.4127172530733362, + "grad_norm": 0.9678065981380277, + "learning_rate": 1.325164219568628e-05, + "loss": 0.9998, + "step": 2434 + }, + { + "epoch": 0.4128868164476473, + "grad_norm": 1.016278860103581, + "learning_rate": 1.3246447875340249e-05, + "loss": 0.9837, + "step": 2435 + }, + { + "epoch": 0.4130563798219585, + "grad_norm": 0.9459179710413834, + "learning_rate": 1.3241252575693417e-05, + "loss": 0.9539, + "step": 2436 + }, + { + "epoch": 0.4132259431962696, + "grad_norm": 0.9735299517478437, + "learning_rate": 1.3236056298312957e-05, + "loss": 0.9912, + "step": 2437 + }, + { + "epoch": 0.4133955065705808, + "grad_norm": 0.9963518744501685, + "learning_rate": 1.3230859044766342e-05, + "loss": 1.0197, + "step": 2438 + }, + { + "epoch": 0.4135650699448919, + "grad_norm": 0.9350838329085976, + "learning_rate": 1.3225660816621342e-05, + "loss": 0.9614, + "step": 2439 + }, + { + "epoch": 0.41373463331920307, + "grad_norm": 0.9336990141134371, + "learning_rate": 1.3220461615446015e-05, + "loss": 0.9454, + "step": 2440 + }, + { + "epoch": 0.4139041966935142, + "grad_norm": 0.6081181731888237, + "learning_rate": 1.3215261442808718e-05, + "loss": 0.839, + "step": 2441 + }, + { + "epoch": 0.41407376006782537, + "grad_norm": 0.950632744913241, + "learning_rate": 1.3210060300278097e-05, + "loss": 0.9796, + "step": 2442 + }, + { + "epoch": 0.4142433234421365, + "grad_norm": 0.949278231400373, + "learning_rate": 1.3204858189423097e-05, + "loss": 0.9995, + "step": 2443 + }, + { + "epoch": 0.41441288681644767, + "grad_norm": 0.9553885275517086, + "learning_rate": 1.3199655111812945e-05, + "loss": 0.9502, + "step": 2444 + }, + { + "epoch": 0.4145824501907588, + "grad_norm": 0.963076422095927, + "learning_rate": 1.319445106901717e-05, + "loss": 1.0298, + "step": 2445 + }, + { + "epoch": 0.41475201356506997, + "grad_norm": 1.0010508289634494, + "learning_rate": 1.3189246062605582e-05, + "loss": 1.0079, + "step": 2446 + }, + { + "epoch": 0.4149215769393811, + "grad_norm": 0.9907056753153181, + "learning_rate": 1.3184040094148289e-05, + "loss": 0.9929, + "step": 2447 + }, + { + "epoch": 0.41509114031369226, + "grad_norm": 1.0213781609836017, + "learning_rate": 1.3178833165215687e-05, + "loss": 1.005, + "step": 2448 + }, + { + "epoch": 0.4152607036880034, + "grad_norm": 1.0003971571997028, + "learning_rate": 1.3173625277378464e-05, + "loss": 0.9928, + "step": 2449 + }, + { + "epoch": 0.41543026706231456, + "grad_norm": 0.9770890721532138, + "learning_rate": 1.3168416432207594e-05, + "loss": 0.9589, + "step": 2450 + }, + { + "epoch": 0.4155998304366257, + "grad_norm": 1.024539082736634, + "learning_rate": 1.3163206631274337e-05, + "loss": 1.0335, + "step": 2451 + }, + { + "epoch": 0.41576939381093686, + "grad_norm": 0.9732741774989216, + "learning_rate": 1.3157995876150252e-05, + "loss": 0.9469, + "step": 2452 + }, + { + "epoch": 0.415938957185248, + "grad_norm": 0.9888428976384802, + "learning_rate": 1.315278416840717e-05, + "loss": 0.9646, + "step": 2453 + }, + { + "epoch": 0.41610852055955916, + "grad_norm": 0.9411013235231769, + "learning_rate": 1.314757150961723e-05, + "loss": 0.9785, + "step": 2454 + }, + { + "epoch": 0.4162780839338703, + "grad_norm": 0.9605222670781098, + "learning_rate": 1.3142357901352839e-05, + "loss": 1.0101, + "step": 2455 + }, + { + "epoch": 0.41644764730818146, + "grad_norm": 0.9640322397951829, + "learning_rate": 1.3137143345186696e-05, + "loss": 0.9961, + "step": 2456 + }, + { + "epoch": 0.4166172106824926, + "grad_norm": 0.9871735762765294, + "learning_rate": 1.3131927842691793e-05, + "loss": 0.9745, + "step": 2457 + }, + { + "epoch": 0.41678677405680375, + "grad_norm": 0.9702861095375536, + "learning_rate": 1.3126711395441396e-05, + "loss": 1.0003, + "step": 2458 + }, + { + "epoch": 0.4169563374311149, + "grad_norm": 0.9280097044443271, + "learning_rate": 1.3121494005009068e-05, + "loss": 1.012, + "step": 2459 + }, + { + "epoch": 0.41712590080542605, + "grad_norm": 0.9793618045601942, + "learning_rate": 1.3116275672968646e-05, + "loss": 1.054, + "step": 2460 + }, + { + "epoch": 0.4172954641797372, + "grad_norm": 1.0175611513200622, + "learning_rate": 1.311105640089426e-05, + "loss": 1.0218, + "step": 2461 + }, + { + "epoch": 0.41746502755404835, + "grad_norm": 0.9340913131437807, + "learning_rate": 1.3105836190360315e-05, + "loss": 0.9637, + "step": 2462 + }, + { + "epoch": 0.4176345909283595, + "grad_norm": 0.966778154770852, + "learning_rate": 1.3100615042941506e-05, + "loss": 1.0387, + "step": 2463 + }, + { + "epoch": 0.41780415430267065, + "grad_norm": 1.0002857021794538, + "learning_rate": 1.3095392960212807e-05, + "loss": 0.9729, + "step": 2464 + }, + { + "epoch": 0.41797371767698177, + "grad_norm": 0.8835986390395162, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.9784, + "step": 2465 + }, + { + "epoch": 0.41814328105129295, + "grad_norm": 0.9830337720650365, + "learning_rate": 1.308494599512705e-05, + "loss": 1.0189, + "step": 2466 + }, + { + "epoch": 0.41831284442560407, + "grad_norm": 0.9987921704690718, + "learning_rate": 1.3079721115921348e-05, + "loss": 0.9661, + "step": 2467 + }, + { + "epoch": 0.41848240779991525, + "grad_norm": 0.9430457469859373, + "learning_rate": 1.3074495307708475e-05, + "loss": 0.9474, + "step": 2468 + }, + { + "epoch": 0.41865197117422637, + "grad_norm": 0.9742792090624257, + "learning_rate": 1.3069268572064808e-05, + "loss": 0.9948, + "step": 2469 + }, + { + "epoch": 0.4188215345485375, + "grad_norm": 0.9257353706271327, + "learning_rate": 1.3064040910567008e-05, + "loss": 0.955, + "step": 2470 + }, + { + "epoch": 0.41899109792284867, + "grad_norm": 0.9883164111711251, + "learning_rate": 1.3058812324792014e-05, + "loss": 1.0006, + "step": 2471 + }, + { + "epoch": 0.4191606612971598, + "grad_norm": 0.9394364750290908, + "learning_rate": 1.305358281631705e-05, + "loss": 0.9776, + "step": 2472 + }, + { + "epoch": 0.41933022467147096, + "grad_norm": 0.9822555780458035, + "learning_rate": 1.3048352386719605e-05, + "loss": 1.0135, + "step": 2473 + }, + { + "epoch": 0.4194997880457821, + "grad_norm": 0.9941729915918995, + "learning_rate": 1.3043121037577463e-05, + "loss": 1.007, + "step": 2474 + }, + { + "epoch": 0.41966935142009326, + "grad_norm": 1.0148428321185825, + "learning_rate": 1.3037888770468667e-05, + "loss": 0.9833, + "step": 2475 + }, + { + "epoch": 0.4198389147944044, + "grad_norm": 0.95849866287929, + "learning_rate": 1.3032655586971552e-05, + "loss": 0.9584, + "step": 2476 + }, + { + "epoch": 0.42000847816871556, + "grad_norm": 1.0275166336153458, + "learning_rate": 1.3027421488664723e-05, + "loss": 0.9512, + "step": 2477 + }, + { + "epoch": 0.4201780415430267, + "grad_norm": 0.9791400657686129, + "learning_rate": 1.302218647712706e-05, + "loss": 0.9827, + "step": 2478 + }, + { + "epoch": 0.42034760491733786, + "grad_norm": 0.9339238893605596, + "learning_rate": 1.301695055393772e-05, + "loss": 0.9782, + "step": 2479 + }, + { + "epoch": 0.420517168291649, + "grad_norm": 1.0596822737710774, + "learning_rate": 1.3011713720676133e-05, + "loss": 0.9606, + "step": 2480 + }, + { + "epoch": 0.42068673166596016, + "grad_norm": 1.0550604314680185, + "learning_rate": 1.3006475978922013e-05, + "loss": 1.0004, + "step": 2481 + }, + { + "epoch": 0.4208562950402713, + "grad_norm": 0.9867194430901839, + "learning_rate": 1.3001237330255334e-05, + "loss": 1.0204, + "step": 2482 + }, + { + "epoch": 0.42102585841458245, + "grad_norm": 0.9209582580814403, + "learning_rate": 1.2995997776256352e-05, + "loss": 0.9282, + "step": 2483 + }, + { + "epoch": 0.4211954217888936, + "grad_norm": 0.9472748276621806, + "learning_rate": 1.2990757318505598e-05, + "loss": 0.9451, + "step": 2484 + }, + { + "epoch": 0.42136498516320475, + "grad_norm": 0.9682508048531342, + "learning_rate": 1.2985515958583865e-05, + "loss": 0.9606, + "step": 2485 + }, + { + "epoch": 0.4215345485375159, + "grad_norm": 1.0236539166142857, + "learning_rate": 1.2980273698072228e-05, + "loss": 0.9932, + "step": 2486 + }, + { + "epoch": 0.42170411191182705, + "grad_norm": 0.9776957575892024, + "learning_rate": 1.297503053855203e-05, + "loss": 0.977, + "step": 2487 + }, + { + "epoch": 0.4218736752861382, + "grad_norm": 0.9281053374057722, + "learning_rate": 1.2969786481604891e-05, + "loss": 0.9496, + "step": 2488 + }, + { + "epoch": 0.42204323866044935, + "grad_norm": 1.0197700263964224, + "learning_rate": 1.2964541528812689e-05, + "loss": 0.9932, + "step": 2489 + }, + { + "epoch": 0.42221280203476047, + "grad_norm": 0.9743752641167894, + "learning_rate": 1.2959295681757583e-05, + "loss": 0.9901, + "step": 2490 + }, + { + "epoch": 0.42238236540907165, + "grad_norm": 0.9436189553795822, + "learning_rate": 1.2954048942022002e-05, + "loss": 0.9315, + "step": 2491 + }, + { + "epoch": 0.42255192878338277, + "grad_norm": 0.9865779154314379, + "learning_rate": 1.2948801311188637e-05, + "loss": 0.9777, + "step": 2492 + }, + { + "epoch": 0.42272149215769395, + "grad_norm": 1.0094906192281845, + "learning_rate": 1.2943552790840452e-05, + "loss": 1.0358, + "step": 2493 + }, + { + "epoch": 0.42289105553200507, + "grad_norm": 0.8994432905089988, + "learning_rate": 1.293830338256068e-05, + "loss": 0.9449, + "step": 2494 + }, + { + "epoch": 0.42306061890631624, + "grad_norm": 0.9973310313175829, + "learning_rate": 1.2933053087932821e-05, + "loss": 1.0106, + "step": 2495 + }, + { + "epoch": 0.42323018228062737, + "grad_norm": 1.0172593455287209, + "learning_rate": 1.292780190854064e-05, + "loss": 0.9827, + "step": 2496 + }, + { + "epoch": 0.42339974565493854, + "grad_norm": 0.9995213856574289, + "learning_rate": 1.2922549845968174e-05, + "loss": 1.0188, + "step": 2497 + }, + { + "epoch": 0.42356930902924966, + "grad_norm": 0.9379400401170712, + "learning_rate": 1.291729690179972e-05, + "loss": 0.9731, + "step": 2498 + }, + { + "epoch": 0.42373887240356084, + "grad_norm": 0.9871638855286147, + "learning_rate": 1.291204307761985e-05, + "loss": 0.9695, + "step": 2499 + }, + { + "epoch": 0.42390843577787196, + "grad_norm": 0.9900282879531233, + "learning_rate": 1.2906788375013392e-05, + "loss": 1.0391, + "step": 2500 + }, + { + "epoch": 0.42407799915218314, + "grad_norm": 0.9419971756374754, + "learning_rate": 1.2901532795565444e-05, + "loss": 0.9292, + "step": 2501 + }, + { + "epoch": 0.42424756252649426, + "grad_norm": 0.9778386004005973, + "learning_rate": 1.2896276340861367e-05, + "loss": 0.9824, + "step": 2502 + }, + { + "epoch": 0.42441712590080544, + "grad_norm": 0.9818189876576768, + "learning_rate": 1.2891019012486785e-05, + "loss": 0.9702, + "step": 2503 + }, + { + "epoch": 0.42458668927511656, + "grad_norm": 0.9540525657742854, + "learning_rate": 1.288576081202759e-05, + "loss": 0.9513, + "step": 2504 + }, + { + "epoch": 0.42475625264942773, + "grad_norm": 0.9644918591182482, + "learning_rate": 1.2880501741069931e-05, + "loss": 1.0144, + "step": 2505 + }, + { + "epoch": 0.42492581602373886, + "grad_norm": 0.9802432592505497, + "learning_rate": 1.2875241801200224e-05, + "loss": 0.9853, + "step": 2506 + }, + { + "epoch": 0.42509537939805003, + "grad_norm": 0.9852407281876533, + "learning_rate": 1.2869980994005146e-05, + "loss": 0.9908, + "step": 2507 + }, + { + "epoch": 0.42526494277236115, + "grad_norm": 0.9316066047742487, + "learning_rate": 1.2864719321071638e-05, + "loss": 0.9316, + "step": 2508 + }, + { + "epoch": 0.42543450614667233, + "grad_norm": 0.9731369099438988, + "learning_rate": 1.2859456783986892e-05, + "loss": 1.0024, + "step": 2509 + }, + { + "epoch": 0.42560406952098345, + "grad_norm": 0.9481133291884946, + "learning_rate": 1.2854193384338378e-05, + "loss": 0.9773, + "step": 2510 + }, + { + "epoch": 0.42577363289529463, + "grad_norm": 0.6781756198370876, + "learning_rate": 1.2848929123713811e-05, + "loss": 0.8231, + "step": 2511 + }, + { + "epoch": 0.42594319626960575, + "grad_norm": 1.0004076598531444, + "learning_rate": 1.2843664003701168e-05, + "loss": 1.0189, + "step": 2512 + }, + { + "epoch": 0.4261127596439169, + "grad_norm": 1.0070122966388144, + "learning_rate": 1.2838398025888695e-05, + "loss": 1.0389, + "step": 2513 + }, + { + "epoch": 0.42628232301822805, + "grad_norm": 0.972491172104366, + "learning_rate": 1.2833131191864884e-05, + "loss": 0.9646, + "step": 2514 + }, + { + "epoch": 0.4264518863925392, + "grad_norm": 1.0230189239965226, + "learning_rate": 1.2827863503218496e-05, + "loss": 1.0017, + "step": 2515 + }, + { + "epoch": 0.42662144976685035, + "grad_norm": 0.9849900602309155, + "learning_rate": 1.2822594961538544e-05, + "loss": 0.9854, + "step": 2516 + }, + { + "epoch": 0.4267910131411615, + "grad_norm": 1.000557638063349, + "learning_rate": 1.2817325568414299e-05, + "loss": 0.9842, + "step": 2517 + }, + { + "epoch": 0.42696057651547265, + "grad_norm": 0.9751420613359931, + "learning_rate": 1.2812055325435289e-05, + "loss": 0.9917, + "step": 2518 + }, + { + "epoch": 0.4271301398897838, + "grad_norm": 0.6468111546759283, + "learning_rate": 1.2806784234191298e-05, + "loss": 0.8158, + "step": 2519 + }, + { + "epoch": 0.42729970326409494, + "grad_norm": 0.9672189353178875, + "learning_rate": 1.280151229627237e-05, + "loss": 0.9963, + "step": 2520 + }, + { + "epoch": 0.4274692666384061, + "grad_norm": 1.0032015491615498, + "learning_rate": 1.2796239513268796e-05, + "loss": 0.9927, + "step": 2521 + }, + { + "epoch": 0.42763883001271724, + "grad_norm": 1.0432935191736354, + "learning_rate": 1.2790965886771135e-05, + "loss": 1.0004, + "step": 2522 + }, + { + "epoch": 0.4278083933870284, + "grad_norm": 0.995297815162946, + "learning_rate": 1.2785691418370178e-05, + "loss": 0.9983, + "step": 2523 + }, + { + "epoch": 0.42797795676133954, + "grad_norm": 0.9997405957886893, + "learning_rate": 1.2780416109657001e-05, + "loss": 1.0097, + "step": 2524 + }, + { + "epoch": 0.4281475201356507, + "grad_norm": 0.9593389358873918, + "learning_rate": 1.2775139962222905e-05, + "loss": 1.0078, + "step": 2525 + }, + { + "epoch": 0.42831708350996184, + "grad_norm": 0.6399708630769928, + "learning_rate": 1.276986297765946e-05, + "loss": 0.8159, + "step": 2526 + }, + { + "epoch": 0.428486646884273, + "grad_norm": 1.009811719388693, + "learning_rate": 1.2764585157558486e-05, + "loss": 0.9422, + "step": 2527 + }, + { + "epoch": 0.42865621025858414, + "grad_norm": 1.004243959315471, + "learning_rate": 1.2759306503512052e-05, + "loss": 0.9962, + "step": 2528 + }, + { + "epoch": 0.4288257736328953, + "grad_norm": 0.9986222219718599, + "learning_rate": 1.275402701711248e-05, + "loss": 0.9843, + "step": 2529 + }, + { + "epoch": 0.42899533700720643, + "grad_norm": 1.0261847797792232, + "learning_rate": 1.2748746699952338e-05, + "loss": 0.9682, + "step": 2530 + }, + { + "epoch": 0.4291649003815176, + "grad_norm": 0.9554607464796744, + "learning_rate": 1.274346555362446e-05, + "loss": 0.9894, + "step": 2531 + }, + { + "epoch": 0.42933446375582873, + "grad_norm": 0.981234836072138, + "learning_rate": 1.273818357972191e-05, + "loss": 0.9707, + "step": 2532 + }, + { + "epoch": 0.4295040271301399, + "grad_norm": 0.9965098430352564, + "learning_rate": 1.2732900779838016e-05, + "loss": 1.0072, + "step": 2533 + }, + { + "epoch": 0.42967359050445103, + "grad_norm": 1.0277003286550126, + "learning_rate": 1.272761715556635e-05, + "loss": 0.9675, + "step": 2534 + }, + { + "epoch": 0.4298431538787622, + "grad_norm": 0.9461483265161289, + "learning_rate": 1.272233270850073e-05, + "loss": 0.9379, + "step": 2535 + }, + { + "epoch": 0.43001271725307333, + "grad_norm": 1.0126090451137943, + "learning_rate": 1.2717047440235234e-05, + "loss": 0.9867, + "step": 2536 + }, + { + "epoch": 0.4301822806273845, + "grad_norm": 0.9493238161506435, + "learning_rate": 1.2711761352364172e-05, + "loss": 0.9826, + "step": 2537 + }, + { + "epoch": 0.4303518440016956, + "grad_norm": 0.913075140400223, + "learning_rate": 1.2706474446482112e-05, + "loss": 0.9337, + "step": 2538 + }, + { + "epoch": 0.4305214073760068, + "grad_norm": 0.9647582411270827, + "learning_rate": 1.2701186724183855e-05, + "loss": 0.98, + "step": 2539 + }, + { + "epoch": 0.4306909707503179, + "grad_norm": 0.9763555649713409, + "learning_rate": 1.2695898187064475e-05, + "loss": 0.9875, + "step": 2540 + }, + { + "epoch": 0.4308605341246291, + "grad_norm": 0.9163409701928596, + "learning_rate": 1.2690608836719261e-05, + "loss": 0.9838, + "step": 2541 + }, + { + "epoch": 0.4310300974989402, + "grad_norm": 0.9645401106782511, + "learning_rate": 1.2685318674743769e-05, + "loss": 0.9552, + "step": 2542 + }, + { + "epoch": 0.4311996608732514, + "grad_norm": 0.9547156108507818, + "learning_rate": 1.2680027702733791e-05, + "loss": 0.969, + "step": 2543 + }, + { + "epoch": 0.4313692242475625, + "grad_norm": 0.9453415634420721, + "learning_rate": 1.2674735922285362e-05, + "loss": 1.017, + "step": 2544 + }, + { + "epoch": 0.4315387876218737, + "grad_norm": 0.9542884266710483, + "learning_rate": 1.2669443334994768e-05, + "loss": 1.0129, + "step": 2545 + }, + { + "epoch": 0.4317083509961848, + "grad_norm": 0.8902177338216921, + "learning_rate": 1.2664149942458533e-05, + "loss": 0.9082, + "step": 2546 + }, + { + "epoch": 0.431877914370496, + "grad_norm": 0.9592890897744861, + "learning_rate": 1.265885574627342e-05, + "loss": 1.0147, + "step": 2547 + }, + { + "epoch": 0.4320474777448071, + "grad_norm": 0.9369973307062465, + "learning_rate": 1.2653560748036443e-05, + "loss": 0.9926, + "step": 2548 + }, + { + "epoch": 0.4322170411191183, + "grad_norm": 0.9995794567502263, + "learning_rate": 1.2648264949344858e-05, + "loss": 1.0128, + "step": 2549 + }, + { + "epoch": 0.4323866044934294, + "grad_norm": 0.9577114777321886, + "learning_rate": 1.2642968351796153e-05, + "loss": 0.9889, + "step": 2550 + }, + { + "epoch": 0.4325561678677406, + "grad_norm": 0.972460664381493, + "learning_rate": 1.2637670956988062e-05, + "loss": 0.9637, + "step": 2551 + }, + { + "epoch": 0.4327257312420517, + "grad_norm": 0.9480056743666075, + "learning_rate": 1.2632372766518564e-05, + "loss": 0.9917, + "step": 2552 + }, + { + "epoch": 0.4328952946163629, + "grad_norm": 0.987207085084289, + "learning_rate": 1.262707378198587e-05, + "loss": 0.986, + "step": 2553 + }, + { + "epoch": 0.433064857990674, + "grad_norm": 0.9688003378177562, + "learning_rate": 1.2621774004988438e-05, + "loss": 0.9624, + "step": 2554 + }, + { + "epoch": 0.4332344213649852, + "grad_norm": 0.9736360650615529, + "learning_rate": 1.2616473437124962e-05, + "loss": 0.982, + "step": 2555 + }, + { + "epoch": 0.4334039847392963, + "grad_norm": 0.932054823363419, + "learning_rate": 1.2611172079994377e-05, + "loss": 0.9915, + "step": 2556 + }, + { + "epoch": 0.4335735481136075, + "grad_norm": 0.9628791706496478, + "learning_rate": 1.2605869935195844e-05, + "loss": 0.9658, + "step": 2557 + }, + { + "epoch": 0.4337431114879186, + "grad_norm": 0.9466869512734183, + "learning_rate": 1.2600567004328781e-05, + "loss": 0.982, + "step": 2558 + }, + { + "epoch": 0.4339126748622298, + "grad_norm": 0.9830992644340674, + "learning_rate": 1.2595263288992825e-05, + "loss": 1.0121, + "step": 2559 + }, + { + "epoch": 0.4340822382365409, + "grad_norm": 0.6399999733775902, + "learning_rate": 1.2589958790787864e-05, + "loss": 0.7935, + "step": 2560 + }, + { + "epoch": 0.43425180161085203, + "grad_norm": 0.9634089542685117, + "learning_rate": 1.2584653511314012e-05, + "loss": 1.0236, + "step": 2561 + }, + { + "epoch": 0.4344213649851632, + "grad_norm": 0.9101099869920799, + "learning_rate": 1.2579347452171624e-05, + "loss": 0.9844, + "step": 2562 + }, + { + "epoch": 0.4345909283594743, + "grad_norm": 0.9837374995386899, + "learning_rate": 1.257404061496129e-05, + "loss": 0.9849, + "step": 2563 + }, + { + "epoch": 0.4347604917337855, + "grad_norm": 0.9748203258991123, + "learning_rate": 1.2568733001283828e-05, + "loss": 0.8952, + "step": 2564 + }, + { + "epoch": 0.4349300551080966, + "grad_norm": 0.9797331751858684, + "learning_rate": 1.2563424612740307e-05, + "loss": 1.0237, + "step": 2565 + }, + { + "epoch": 0.4350996184824078, + "grad_norm": 0.9601441620849211, + "learning_rate": 1.2558115450932006e-05, + "loss": 0.9633, + "step": 2566 + }, + { + "epoch": 0.4352691818567189, + "grad_norm": 0.9580021719682639, + "learning_rate": 1.2552805517460457e-05, + "loss": 1.0009, + "step": 2567 + }, + { + "epoch": 0.4354387452310301, + "grad_norm": 0.9323725512934854, + "learning_rate": 1.2547494813927417e-05, + "loss": 0.9685, + "step": 2568 + }, + { + "epoch": 0.4356083086053412, + "grad_norm": 0.9844191223256091, + "learning_rate": 1.2542183341934873e-05, + "loss": 0.9776, + "step": 2569 + }, + { + "epoch": 0.4357778719796524, + "grad_norm": 0.9310079039827244, + "learning_rate": 1.2536871103085044e-05, + "loss": 0.965, + "step": 2570 + }, + { + "epoch": 0.4359474353539635, + "grad_norm": 1.0013468705229542, + "learning_rate": 1.253155809898039e-05, + "loss": 0.9973, + "step": 2571 + }, + { + "epoch": 0.4361169987282747, + "grad_norm": 0.9992608024643401, + "learning_rate": 1.2526244331223592e-05, + "loss": 1.0512, + "step": 2572 + }, + { + "epoch": 0.4362865621025858, + "grad_norm": 0.9585919912207673, + "learning_rate": 1.252092980141756e-05, + "loss": 0.9713, + "step": 2573 + }, + { + "epoch": 0.436456125476897, + "grad_norm": 0.9658001415385551, + "learning_rate": 1.2515614511165447e-05, + "loss": 0.9539, + "step": 2574 + }, + { + "epoch": 0.4366256888512081, + "grad_norm": 0.9542127820120717, + "learning_rate": 1.2510298462070619e-05, + "loss": 0.9708, + "step": 2575 + }, + { + "epoch": 0.4367952522255193, + "grad_norm": 0.9662864977905982, + "learning_rate": 1.250498165573668e-05, + "loss": 0.9317, + "step": 2576 + }, + { + "epoch": 0.4369648155998304, + "grad_norm": 0.9156334945671237, + "learning_rate": 1.2499664093767458e-05, + "loss": 0.9293, + "step": 2577 + }, + { + "epoch": 0.4371343789741416, + "grad_norm": 0.9710372354065148, + "learning_rate": 1.2494345777767016e-05, + "loss": 0.9919, + "step": 2578 + }, + { + "epoch": 0.4373039423484527, + "grad_norm": 0.6040330291387417, + "learning_rate": 1.2489026709339639e-05, + "loss": 0.8991, + "step": 2579 + }, + { + "epoch": 0.4374735057227639, + "grad_norm": 0.9034640212551738, + "learning_rate": 1.2483706890089838e-05, + "loss": 0.9296, + "step": 2580 + }, + { + "epoch": 0.437643069097075, + "grad_norm": 0.9465241878484302, + "learning_rate": 1.2478386321622356e-05, + "loss": 0.9864, + "step": 2581 + }, + { + "epoch": 0.4378126324713862, + "grad_norm": 0.9649682466568654, + "learning_rate": 1.2473065005542155e-05, + "loss": 0.977, + "step": 2582 + }, + { + "epoch": 0.4379821958456973, + "grad_norm": 0.9391603902914201, + "learning_rate": 1.246774294345443e-05, + "loss": 0.951, + "step": 2583 + }, + { + "epoch": 0.4381517592200085, + "grad_norm": 0.9718981572288251, + "learning_rate": 1.2462420136964595e-05, + "loss": 1.0163, + "step": 2584 + }, + { + "epoch": 0.4383213225943196, + "grad_norm": 0.9549219726982074, + "learning_rate": 1.245709658767829e-05, + "loss": 0.9852, + "step": 2585 + }, + { + "epoch": 0.4384908859686308, + "grad_norm": 0.9676843916961262, + "learning_rate": 1.2451772297201376e-05, + "loss": 0.9737, + "step": 2586 + }, + { + "epoch": 0.4386604493429419, + "grad_norm": 0.9482472010036822, + "learning_rate": 1.2446447267139948e-05, + "loss": 0.9535, + "step": 2587 + }, + { + "epoch": 0.4388300127172531, + "grad_norm": 0.5843635488579886, + "learning_rate": 1.2441121499100318e-05, + "loss": 0.8307, + "step": 2588 + }, + { + "epoch": 0.4389995760915642, + "grad_norm": 0.9511283693232185, + "learning_rate": 1.243579499468901e-05, + "loss": 0.9857, + "step": 2589 + }, + { + "epoch": 0.4391691394658754, + "grad_norm": 0.9815884173773686, + "learning_rate": 1.2430467755512794e-05, + "loss": 0.9869, + "step": 2590 + }, + { + "epoch": 0.4393387028401865, + "grad_norm": 1.005198212889262, + "learning_rate": 1.2425139783178634e-05, + "loss": 1.0149, + "step": 2591 + }, + { + "epoch": 0.4395082662144977, + "grad_norm": 0.9563482175108111, + "learning_rate": 1.2419811079293742e-05, + "loss": 0.9613, + "step": 2592 + }, + { + "epoch": 0.4396778295888088, + "grad_norm": 0.9167312685775793, + "learning_rate": 1.241448164546553e-05, + "loss": 0.9635, + "step": 2593 + }, + { + "epoch": 0.43984739296312, + "grad_norm": 0.9525244869856538, + "learning_rate": 1.240915148330164e-05, + "loss": 0.9872, + "step": 2594 + }, + { + "epoch": 0.4400169563374311, + "grad_norm": 0.9721471457636425, + "learning_rate": 1.2403820594409926e-05, + "loss": 0.9947, + "step": 2595 + }, + { + "epoch": 0.4401865197117423, + "grad_norm": 0.9785457440196526, + "learning_rate": 1.2398488980398473e-05, + "loss": 0.9681, + "step": 2596 + }, + { + "epoch": 0.4403560830860534, + "grad_norm": 0.9128257224099869, + "learning_rate": 1.2393156642875579e-05, + "loss": 0.9401, + "step": 2597 + }, + { + "epoch": 0.4405256464603646, + "grad_norm": 0.9427585576111754, + "learning_rate": 1.2387823583449757e-05, + "loss": 0.9853, + "step": 2598 + }, + { + "epoch": 0.4406952098346757, + "grad_norm": 0.9848384603759787, + "learning_rate": 1.238248980372974e-05, + "loss": 1.0284, + "step": 2599 + }, + { + "epoch": 0.44086477320898687, + "grad_norm": 0.9255812009225528, + "learning_rate": 1.237715530532448e-05, + "loss": 0.9253, + "step": 2600 + }, + { + "epoch": 0.441034336583298, + "grad_norm": 1.0086860783906586, + "learning_rate": 1.2371820089843145e-05, + "loss": 0.9788, + "step": 2601 + }, + { + "epoch": 0.44120389995760917, + "grad_norm": 1.0099654358376329, + "learning_rate": 1.2366484158895118e-05, + "loss": 0.9757, + "step": 2602 + }, + { + "epoch": 0.4413734633319203, + "grad_norm": 0.960186951386786, + "learning_rate": 1.236114751409e-05, + "loss": 0.9829, + "step": 2603 + }, + { + "epoch": 0.44154302670623147, + "grad_norm": 0.9204818062401579, + "learning_rate": 1.2355810157037601e-05, + "loss": 0.9858, + "step": 2604 + }, + { + "epoch": 0.4417125900805426, + "grad_norm": 0.9393047723784934, + "learning_rate": 1.2350472089347957e-05, + "loss": 0.9701, + "step": 2605 + }, + { + "epoch": 0.44188215345485377, + "grad_norm": 0.9031940585296269, + "learning_rate": 1.2345133312631313e-05, + "loss": 0.9427, + "step": 2606 + }, + { + "epoch": 0.4420517168291649, + "grad_norm": 0.9290010286306092, + "learning_rate": 1.2339793828498119e-05, + "loss": 0.9439, + "step": 2607 + }, + { + "epoch": 0.44222128020347606, + "grad_norm": 0.987211699776085, + "learning_rate": 1.2334453638559057e-05, + "loss": 0.9914, + "step": 2608 + }, + { + "epoch": 0.4423908435777872, + "grad_norm": 0.9936535892667405, + "learning_rate": 1.2329112744425e-05, + "loss": 0.9719, + "step": 2609 + }, + { + "epoch": 0.44256040695209836, + "grad_norm": 1.0104702389839972, + "learning_rate": 1.2323771147707055e-05, + "loss": 0.9706, + "step": 2610 + }, + { + "epoch": 0.4427299703264095, + "grad_norm": 0.9063456863797922, + "learning_rate": 1.2318428850016528e-05, + "loss": 0.9349, + "step": 2611 + }, + { + "epoch": 0.44289953370072066, + "grad_norm": 0.9808512627871208, + "learning_rate": 1.2313085852964937e-05, + "loss": 1.0006, + "step": 2612 + }, + { + "epoch": 0.4430690970750318, + "grad_norm": 0.9931097127581182, + "learning_rate": 1.2307742158164012e-05, + "loss": 0.9712, + "step": 2613 + }, + { + "epoch": 0.44323866044934296, + "grad_norm": 0.9303357095269098, + "learning_rate": 1.2302397767225696e-05, + "loss": 0.9388, + "step": 2614 + }, + { + "epoch": 0.4434082238236541, + "grad_norm": 0.9613931330119423, + "learning_rate": 1.2297052681762143e-05, + "loss": 0.9804, + "step": 2615 + }, + { + "epoch": 0.44357778719796526, + "grad_norm": 1.0379705267557564, + "learning_rate": 1.2291706903385711e-05, + "loss": 0.9998, + "step": 2616 + }, + { + "epoch": 0.4437473505722764, + "grad_norm": 0.9275558113510899, + "learning_rate": 1.2286360433708976e-05, + "loss": 0.9412, + "step": 2617 + }, + { + "epoch": 0.44391691394658755, + "grad_norm": 0.9436017202419685, + "learning_rate": 1.2281013274344709e-05, + "loss": 0.9518, + "step": 2618 + }, + { + "epoch": 0.4440864773208987, + "grad_norm": 1.0008706708887538, + "learning_rate": 1.22756654269059e-05, + "loss": 0.988, + "step": 2619 + }, + { + "epoch": 0.44425604069520985, + "grad_norm": 0.9556374460075673, + "learning_rate": 1.2270316893005747e-05, + "loss": 0.95, + "step": 2620 + }, + { + "epoch": 0.444425604069521, + "grad_norm": 0.9538381043541012, + "learning_rate": 1.2264967674257647e-05, + "loss": 0.9705, + "step": 2621 + }, + { + "epoch": 0.44459516744383215, + "grad_norm": 0.9252129251895725, + "learning_rate": 1.2259617772275207e-05, + "loss": 0.9576, + "step": 2622 + }, + { + "epoch": 0.4447647308181433, + "grad_norm": 0.9525795111034339, + "learning_rate": 1.2254267188672242e-05, + "loss": 0.9808, + "step": 2623 + }, + { + "epoch": 0.44493429419245445, + "grad_norm": 0.9936359890930845, + "learning_rate": 1.2248915925062776e-05, + "loss": 0.9604, + "step": 2624 + }, + { + "epoch": 0.44510385756676557, + "grad_norm": 0.9568178953615135, + "learning_rate": 1.2243563983061029e-05, + "loss": 0.9674, + "step": 2625 + }, + { + "epoch": 0.44527342094107675, + "grad_norm": 0.9520514073465959, + "learning_rate": 1.2238211364281434e-05, + "loss": 0.9372, + "step": 2626 + }, + { + "epoch": 0.44544298431538787, + "grad_norm": 1.0517533339308636, + "learning_rate": 1.2232858070338618e-05, + "loss": 0.9738, + "step": 2627 + }, + { + "epoch": 0.44561254768969905, + "grad_norm": 1.0584344019632212, + "learning_rate": 1.2227504102847427e-05, + "loss": 0.9895, + "step": 2628 + }, + { + "epoch": 0.44578211106401017, + "grad_norm": 0.9750600176878557, + "learning_rate": 1.2222149463422898e-05, + "loss": 0.9916, + "step": 2629 + }, + { + "epoch": 0.44595167443832134, + "grad_norm": 0.9147701636518633, + "learning_rate": 1.2216794153680274e-05, + "loss": 0.9329, + "step": 2630 + }, + { + "epoch": 0.44612123781263247, + "grad_norm": 0.6178372177595367, + "learning_rate": 1.2211438175234998e-05, + "loss": 0.8118, + "step": 2631 + }, + { + "epoch": 0.44629080118694364, + "grad_norm": 0.9735646432975459, + "learning_rate": 1.2206081529702717e-05, + "loss": 0.945, + "step": 2632 + }, + { + "epoch": 0.44646036456125476, + "grad_norm": 0.9943674416421485, + "learning_rate": 1.2200724218699284e-05, + "loss": 0.9639, + "step": 2633 + }, + { + "epoch": 0.44662992793556594, + "grad_norm": 0.995016939721035, + "learning_rate": 1.2195366243840745e-05, + "loss": 0.9676, + "step": 2634 + }, + { + "epoch": 0.44679949130987706, + "grad_norm": 0.9674588884725147, + "learning_rate": 1.219000760674335e-05, + "loss": 0.9772, + "step": 2635 + }, + { + "epoch": 0.44696905468418824, + "grad_norm": 0.9398902840377271, + "learning_rate": 1.2184648309023545e-05, + "loss": 0.9754, + "step": 2636 + }, + { + "epoch": 0.44713861805849936, + "grad_norm": 1.0079960388860927, + "learning_rate": 1.2179288352297985e-05, + "loss": 0.9858, + "step": 2637 + }, + { + "epoch": 0.44730818143281054, + "grad_norm": 1.0083448550513037, + "learning_rate": 1.217392773818351e-05, + "loss": 0.9771, + "step": 2638 + }, + { + "epoch": 0.44747774480712166, + "grad_norm": 1.0190554426314378, + "learning_rate": 1.2168566468297172e-05, + "loss": 0.9878, + "step": 2639 + }, + { + "epoch": 0.44764730818143283, + "grad_norm": 0.9794039179845009, + "learning_rate": 1.2163204544256209e-05, + "loss": 0.9631, + "step": 2640 + }, + { + "epoch": 0.44781687155574396, + "grad_norm": 0.9950696202258047, + "learning_rate": 1.2157841967678064e-05, + "loss": 0.9866, + "step": 2641 + }, + { + "epoch": 0.44798643493005513, + "grad_norm": 0.9689424486708467, + "learning_rate": 1.2152478740180374e-05, + "loss": 0.989, + "step": 2642 + }, + { + "epoch": 0.44815599830436625, + "grad_norm": 0.9883031878224542, + "learning_rate": 1.2147114863380969e-05, + "loss": 1.002, + "step": 2643 + }, + { + "epoch": 0.44832556167867743, + "grad_norm": 1.0042404715586786, + "learning_rate": 1.2141750338897887e-05, + "loss": 1.0032, + "step": 2644 + }, + { + "epoch": 0.44849512505298855, + "grad_norm": 0.9280754160907558, + "learning_rate": 1.2136385168349345e-05, + "loss": 0.9662, + "step": 2645 + }, + { + "epoch": 0.44866468842729973, + "grad_norm": 1.037477001251102, + "learning_rate": 1.2131019353353768e-05, + "loss": 1.0295, + "step": 2646 + }, + { + "epoch": 0.44883425180161085, + "grad_norm": 0.9737458468691693, + "learning_rate": 1.2125652895529766e-05, + "loss": 1.0159, + "step": 2647 + }, + { + "epoch": 0.449003815175922, + "grad_norm": 1.0301307125855186, + "learning_rate": 1.2120285796496153e-05, + "loss": 0.994, + "step": 2648 + }, + { + "epoch": 0.44917337855023315, + "grad_norm": 0.959385929367056, + "learning_rate": 1.2114918057871928e-05, + "loss": 0.9653, + "step": 2649 + }, + { + "epoch": 0.4493429419245443, + "grad_norm": 1.0705578878212247, + "learning_rate": 1.2109549681276281e-05, + "loss": 1.0292, + "step": 2650 + }, + { + "epoch": 0.44951250529885545, + "grad_norm": 0.6578480262499101, + "learning_rate": 1.2104180668328606e-05, + "loss": 0.8301, + "step": 2651 + }, + { + "epoch": 0.44968206867316657, + "grad_norm": 0.9090928796566937, + "learning_rate": 1.2098811020648475e-05, + "loss": 0.9375, + "step": 2652 + }, + { + "epoch": 0.44985163204747775, + "grad_norm": 0.974569345103213, + "learning_rate": 1.2093440739855669e-05, + "loss": 0.9744, + "step": 2653 + }, + { + "epoch": 0.45002119542178887, + "grad_norm": 0.9644635016241943, + "learning_rate": 1.2088069827570136e-05, + "loss": 0.9819, + "step": 2654 + }, + { + "epoch": 0.45019075879610004, + "grad_norm": 0.9645719831857741, + "learning_rate": 1.2082698285412037e-05, + "loss": 0.9778, + "step": 2655 + }, + { + "epoch": 0.45036032217041116, + "grad_norm": 0.9228009480152504, + "learning_rate": 1.207732611500171e-05, + "loss": 0.9023, + "step": 2656 + }, + { + "epoch": 0.45052988554472234, + "grad_norm": 0.6723316575432514, + "learning_rate": 1.2071953317959692e-05, + "loss": 0.8799, + "step": 2657 + }, + { + "epoch": 0.45069944891903346, + "grad_norm": 0.9350788993948473, + "learning_rate": 1.2066579895906699e-05, + "loss": 0.978, + "step": 2658 + }, + { + "epoch": 0.45086901229334464, + "grad_norm": 0.9794696241276973, + "learning_rate": 1.2061205850463635e-05, + "loss": 0.9451, + "step": 2659 + }, + { + "epoch": 0.45103857566765576, + "grad_norm": 0.9920009712159011, + "learning_rate": 1.2055831183251608e-05, + "loss": 0.9457, + "step": 2660 + }, + { + "epoch": 0.45120813904196694, + "grad_norm": 1.0303134553366446, + "learning_rate": 1.2050455895891893e-05, + "loss": 1.0489, + "step": 2661 + }, + { + "epoch": 0.45137770241627806, + "grad_norm": 0.9680785846350365, + "learning_rate": 1.2045079990005968e-05, + "loss": 1.0202, + "step": 2662 + }, + { + "epoch": 0.45154726579058924, + "grad_norm": 1.0188724474031694, + "learning_rate": 1.2039703467215489e-05, + "loss": 1.0019, + "step": 2663 + }, + { + "epoch": 0.45171682916490036, + "grad_norm": 0.9612586165429831, + "learning_rate": 1.20343263291423e-05, + "loss": 0.9838, + "step": 2664 + }, + { + "epoch": 0.45188639253921153, + "grad_norm": 0.9774052477580866, + "learning_rate": 1.202894857740843e-05, + "loss": 0.9451, + "step": 2665 + }, + { + "epoch": 0.45205595591352266, + "grad_norm": 0.9501449121195908, + "learning_rate": 1.2023570213636096e-05, + "loss": 0.9792, + "step": 2666 + }, + { + "epoch": 0.45222551928783383, + "grad_norm": 1.0146713244118473, + "learning_rate": 1.2018191239447698e-05, + "loss": 1.0078, + "step": 2667 + }, + { + "epoch": 0.45239508266214495, + "grad_norm": 0.9307121174618093, + "learning_rate": 1.2012811656465818e-05, + "loss": 1.0127, + "step": 2668 + }, + { + "epoch": 0.45256464603645613, + "grad_norm": 0.9269792577229103, + "learning_rate": 1.2007431466313224e-05, + "loss": 0.9774, + "step": 2669 + }, + { + "epoch": 0.45273420941076725, + "grad_norm": 0.9615505244915513, + "learning_rate": 1.2002050670612864e-05, + "loss": 0.9879, + "step": 2670 + }, + { + "epoch": 0.45290377278507843, + "grad_norm": 0.9523136343262971, + "learning_rate": 1.1996669270987878e-05, + "loss": 0.9652, + "step": 2671 + }, + { + "epoch": 0.45307333615938955, + "grad_norm": 0.9830594287381288, + "learning_rate": 1.1991287269061575e-05, + "loss": 0.9669, + "step": 2672 + }, + { + "epoch": 0.4532428995337007, + "grad_norm": 1.0010636198585519, + "learning_rate": 1.1985904666457455e-05, + "loss": 0.9936, + "step": 2673 + }, + { + "epoch": 0.45341246290801185, + "grad_norm": 0.981074246875863, + "learning_rate": 1.1980521464799197e-05, + "loss": 0.9674, + "step": 2674 + }, + { + "epoch": 0.453582026282323, + "grad_norm": 0.9763715055330665, + "learning_rate": 1.1975137665710659e-05, + "loss": 1.0242, + "step": 2675 + }, + { + "epoch": 0.45375158965663415, + "grad_norm": 1.0281671424663383, + "learning_rate": 1.1969753270815881e-05, + "loss": 0.9943, + "step": 2676 + }, + { + "epoch": 0.4539211530309453, + "grad_norm": 0.9883323699594073, + "learning_rate": 1.1964368281739078e-05, + "loss": 0.9713, + "step": 2677 + }, + { + "epoch": 0.45409071640525644, + "grad_norm": 0.9456052638683551, + "learning_rate": 1.1958982700104655e-05, + "loss": 0.97, + "step": 2678 + }, + { + "epoch": 0.4542602797795676, + "grad_norm": 0.9353048098356204, + "learning_rate": 1.1953596527537184e-05, + "loss": 0.9646, + "step": 2679 + }, + { + "epoch": 0.45442984315387874, + "grad_norm": 1.036147116714233, + "learning_rate": 1.1948209765661421e-05, + "loss": 0.9442, + "step": 2680 + }, + { + "epoch": 0.4545994065281899, + "grad_norm": 0.9992504658384659, + "learning_rate": 1.19428224161023e-05, + "loss": 0.975, + "step": 2681 + }, + { + "epoch": 0.45476896990250104, + "grad_norm": 0.9604222757124211, + "learning_rate": 1.1937434480484931e-05, + "loss": 0.9945, + "step": 2682 + }, + { + "epoch": 0.4549385332768122, + "grad_norm": 0.9317371911890258, + "learning_rate": 1.19320459604346e-05, + "loss": 0.9718, + "step": 2683 + }, + { + "epoch": 0.45510809665112334, + "grad_norm": 0.9785087040040867, + "learning_rate": 1.1926656857576773e-05, + "loss": 0.9764, + "step": 2684 + }, + { + "epoch": 0.4552776600254345, + "grad_norm": 0.9905911991290945, + "learning_rate": 1.1921267173537085e-05, + "loss": 0.9776, + "step": 2685 + }, + { + "epoch": 0.45544722339974564, + "grad_norm": 0.6049238113888165, + "learning_rate": 1.1915876909941356e-05, + "loss": 0.8157, + "step": 2686 + }, + { + "epoch": 0.4556167867740568, + "grad_norm": 1.0078670272491357, + "learning_rate": 1.191048606841557e-05, + "loss": 0.9482, + "step": 2687 + }, + { + "epoch": 0.45578635014836794, + "grad_norm": 1.0173490694790077, + "learning_rate": 1.190509465058589e-05, + "loss": 0.9599, + "step": 2688 + }, + { + "epoch": 0.4559559135226791, + "grad_norm": 0.9301070818226611, + "learning_rate": 1.1899702658078663e-05, + "loss": 0.9433, + "step": 2689 + }, + { + "epoch": 0.45612547689699023, + "grad_norm": 0.9884613714526366, + "learning_rate": 1.1894310092520387e-05, + "loss": 1.007, + "step": 2690 + }, + { + "epoch": 0.4562950402713014, + "grad_norm": 0.9771370890747765, + "learning_rate": 1.1888916955537755e-05, + "loss": 0.9754, + "step": 2691 + }, + { + "epoch": 0.45646460364561253, + "grad_norm": 0.992714320883066, + "learning_rate": 1.1883523248757619e-05, + "loss": 0.967, + "step": 2692 + }, + { + "epoch": 0.4566341670199237, + "grad_norm": 0.9590693795340549, + "learning_rate": 1.1878128973807005e-05, + "loss": 0.9833, + "step": 2693 + }, + { + "epoch": 0.45680373039423483, + "grad_norm": 0.951380598737075, + "learning_rate": 1.1872734132313121e-05, + "loss": 0.9804, + "step": 2694 + }, + { + "epoch": 0.456973293768546, + "grad_norm": 0.9591187610709947, + "learning_rate": 1.1867338725903326e-05, + "loss": 0.9467, + "step": 2695 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.9932358249053632, + "learning_rate": 1.186194275620517e-05, + "loss": 0.9951, + "step": 2696 + }, + { + "epoch": 0.4573124205171683, + "grad_norm": 0.9679722088923083, + "learning_rate": 1.1856546224846354e-05, + "loss": 0.9833, + "step": 2697 + }, + { + "epoch": 0.4574819838914794, + "grad_norm": 0.9765738436595983, + "learning_rate": 1.1851149133454769e-05, + "loss": 0.9741, + "step": 2698 + }, + { + "epoch": 0.4576515472657906, + "grad_norm": 0.9601067174873307, + "learning_rate": 1.1845751483658454e-05, + "loss": 0.9657, + "step": 2699 + }, + { + "epoch": 0.4578211106401017, + "grad_norm": 0.9584460504878817, + "learning_rate": 1.1840353277085635e-05, + "loss": 0.968, + "step": 2700 + }, + { + "epoch": 0.4579906740144129, + "grad_norm": 0.9268425539106802, + "learning_rate": 1.183495451536469e-05, + "loss": 0.9702, + "step": 2701 + }, + { + "epoch": 0.458160237388724, + "grad_norm": 0.940852000205561, + "learning_rate": 1.1829555200124176e-05, + "loss": 0.9507, + "step": 2702 + }, + { + "epoch": 0.4583298007630352, + "grad_norm": 0.604571277694446, + "learning_rate": 1.1824155332992812e-05, + "loss": 0.8064, + "step": 2703 + }, + { + "epoch": 0.4584993641373463, + "grad_norm": 0.9261476336642703, + "learning_rate": 1.1818754915599482e-05, + "loss": 0.9715, + "step": 2704 + }, + { + "epoch": 0.4586689275116575, + "grad_norm": 1.0145798687845644, + "learning_rate": 1.181335394957324e-05, + "loss": 0.963, + "step": 2705 + }, + { + "epoch": 0.4588384908859686, + "grad_norm": 1.0139823579522385, + "learning_rate": 1.1807952436543307e-05, + "loss": 1.0111, + "step": 2706 + }, + { + "epoch": 0.4590080542602798, + "grad_norm": 0.9814183347997688, + "learning_rate": 1.180255037813906e-05, + "loss": 0.9791, + "step": 2707 + }, + { + "epoch": 0.4591776176345909, + "grad_norm": 0.94513410122389, + "learning_rate": 1.1797147775990047e-05, + "loss": 0.9674, + "step": 2708 + }, + { + "epoch": 0.4593471810089021, + "grad_norm": 0.9709856232582966, + "learning_rate": 1.1791744631725983e-05, + "loss": 1.0338, + "step": 2709 + }, + { + "epoch": 0.4595167443832132, + "grad_norm": 0.6411880963375893, + "learning_rate": 1.178634094697674e-05, + "loss": 0.8622, + "step": 2710 + }, + { + "epoch": 0.4596863077575244, + "grad_norm": 0.9852955128573431, + "learning_rate": 1.1780936723372359e-05, + "loss": 0.9638, + "step": 2711 + }, + { + "epoch": 0.4598558711318355, + "grad_norm": 0.956806964981468, + "learning_rate": 1.1775531962543036e-05, + "loss": 0.9446, + "step": 2712 + }, + { + "epoch": 0.4600254345061467, + "grad_norm": 0.9294529749547876, + "learning_rate": 1.1770126666119133e-05, + "loss": 0.979, + "step": 2713 + }, + { + "epoch": 0.4601949978804578, + "grad_norm": 0.9216704746293797, + "learning_rate": 1.1764720835731179e-05, + "loss": 0.9257, + "step": 2714 + }, + { + "epoch": 0.460364561254769, + "grad_norm": 0.6127449748712405, + "learning_rate": 1.1759314473009855e-05, + "loss": 0.7867, + "step": 2715 + }, + { + "epoch": 0.4605341246290801, + "grad_norm": 0.9737663217724459, + "learning_rate": 1.175390757958601e-05, + "loss": 0.9901, + "step": 2716 + }, + { + "epoch": 0.4607036880033913, + "grad_norm": 0.9382196360529027, + "learning_rate": 1.1748500157090645e-05, + "loss": 0.9862, + "step": 2717 + }, + { + "epoch": 0.4608732513777024, + "grad_norm": 0.9331284747136757, + "learning_rate": 1.1743092207154929e-05, + "loss": 0.9538, + "step": 2718 + }, + { + "epoch": 0.4610428147520136, + "grad_norm": 1.0005362823871018, + "learning_rate": 1.1737683731410185e-05, + "loss": 0.9912, + "step": 2719 + }, + { + "epoch": 0.4612123781263247, + "grad_norm": 0.9413155920929438, + "learning_rate": 1.1732274731487899e-05, + "loss": 0.9798, + "step": 2720 + }, + { + "epoch": 0.4613819415006359, + "grad_norm": 0.9476531509194811, + "learning_rate": 1.1726865209019709e-05, + "loss": 0.9648, + "step": 2721 + }, + { + "epoch": 0.461551504874947, + "grad_norm": 0.9343942599978295, + "learning_rate": 1.1721455165637413e-05, + "loss": 0.9348, + "step": 2722 + }, + { + "epoch": 0.4617210682492582, + "grad_norm": 1.0366596084695492, + "learning_rate": 1.171604460297297e-05, + "loss": 1.013, + "step": 2723 + }, + { + "epoch": 0.4618906316235693, + "grad_norm": 0.939574373175376, + "learning_rate": 1.1710633522658488e-05, + "loss": 0.9629, + "step": 2724 + }, + { + "epoch": 0.4620601949978805, + "grad_norm": 0.9515881429800522, + "learning_rate": 1.170522192632624e-05, + "loss": 0.9613, + "step": 2725 + }, + { + "epoch": 0.4622297583721916, + "grad_norm": 0.9685835311923924, + "learning_rate": 1.1699809815608649e-05, + "loss": 0.9667, + "step": 2726 + }, + { + "epoch": 0.4623993217465028, + "grad_norm": 0.9842069987333234, + "learning_rate": 1.1694397192138295e-05, + "loss": 0.9944, + "step": 2727 + }, + { + "epoch": 0.4625688851208139, + "grad_norm": 0.9634828549725546, + "learning_rate": 1.168898405754791e-05, + "loss": 0.9901, + "step": 2728 + }, + { + "epoch": 0.4627384484951251, + "grad_norm": 0.9657462439909241, + "learning_rate": 1.1683570413470384e-05, + "loss": 1.0029, + "step": 2729 + }, + { + "epoch": 0.4629080118694362, + "grad_norm": 0.9618231854479252, + "learning_rate": 1.1678156261538762e-05, + "loss": 1.0057, + "step": 2730 + }, + { + "epoch": 0.4630775752437474, + "grad_norm": 0.9375298437319175, + "learning_rate": 1.1672741603386237e-05, + "loss": 0.9723, + "step": 2731 + }, + { + "epoch": 0.4632471386180585, + "grad_norm": 0.9323680348028757, + "learning_rate": 1.1667326440646157e-05, + "loss": 0.9541, + "step": 2732 + }, + { + "epoch": 0.4634167019923697, + "grad_norm": 0.996393404157614, + "learning_rate": 1.1661910774952019e-05, + "loss": 0.9385, + "step": 2733 + }, + { + "epoch": 0.4635862653666808, + "grad_norm": 0.7256235953469011, + "learning_rate": 1.1656494607937479e-05, + "loss": 0.8218, + "step": 2734 + }, + { + "epoch": 0.46375582874099197, + "grad_norm": 0.9632054333100473, + "learning_rate": 1.1651077941236338e-05, + "loss": 0.9491, + "step": 2735 + }, + { + "epoch": 0.4639253921153031, + "grad_norm": 0.9894093204787605, + "learning_rate": 1.164566077648255e-05, + "loss": 0.9363, + "step": 2736 + }, + { + "epoch": 0.46409495548961427, + "grad_norm": 0.6790453313878879, + "learning_rate": 1.1640243115310219e-05, + "loss": 0.8583, + "step": 2737 + }, + { + "epoch": 0.4642645188639254, + "grad_norm": 0.928207831536817, + "learning_rate": 1.1634824959353602e-05, + "loss": 0.9777, + "step": 2738 + }, + { + "epoch": 0.46443408223823657, + "grad_norm": 0.9714762841562918, + "learning_rate": 1.1629406310247098e-05, + "loss": 0.9345, + "step": 2739 + }, + { + "epoch": 0.4646036456125477, + "grad_norm": 0.9543841690265451, + "learning_rate": 1.1623987169625261e-05, + "loss": 0.9929, + "step": 2740 + }, + { + "epoch": 0.46477320898685887, + "grad_norm": 0.9496956797162672, + "learning_rate": 1.1618567539122794e-05, + "loss": 0.9785, + "step": 2741 + }, + { + "epoch": 0.46494277236117, + "grad_norm": 0.933082178641956, + "learning_rate": 1.1613147420374538e-05, + "loss": 0.9831, + "step": 2742 + }, + { + "epoch": 0.46511233573548116, + "grad_norm": 0.9904969803520491, + "learning_rate": 1.1607726815015492e-05, + "loss": 0.9887, + "step": 2743 + }, + { + "epoch": 0.4652818991097923, + "grad_norm": 0.9651584944558408, + "learning_rate": 1.1602305724680796e-05, + "loss": 1.0011, + "step": 2744 + }, + { + "epoch": 0.4654514624841034, + "grad_norm": 0.9471306037603745, + "learning_rate": 1.1596884151005743e-05, + "loss": 0.9918, + "step": 2745 + }, + { + "epoch": 0.4656210258584146, + "grad_norm": 0.990833630121136, + "learning_rate": 1.1591462095625763e-05, + "loss": 1.0523, + "step": 2746 + }, + { + "epoch": 0.4657905892327257, + "grad_norm": 0.9974612240540354, + "learning_rate": 1.1586039560176434e-05, + "loss": 0.9958, + "step": 2747 + }, + { + "epoch": 0.4659601526070369, + "grad_norm": 0.9985791810806031, + "learning_rate": 1.1580616546293485e-05, + "loss": 0.9823, + "step": 2748 + }, + { + "epoch": 0.466129715981348, + "grad_norm": 0.9574216040226441, + "learning_rate": 1.1575193055612785e-05, + "loss": 0.9768, + "step": 2749 + }, + { + "epoch": 0.4662992793556592, + "grad_norm": 0.9341144309487299, + "learning_rate": 1.1569769089770341e-05, + "loss": 0.9557, + "step": 2750 + }, + { + "epoch": 0.4664688427299703, + "grad_norm": 1.0096048343372679, + "learning_rate": 1.156434465040231e-05, + "loss": 0.9995, + "step": 2751 + }, + { + "epoch": 0.4666384061042815, + "grad_norm": 0.95710004168116, + "learning_rate": 1.1558919739144994e-05, + "loss": 0.9723, + "step": 2752 + }, + { + "epoch": 0.4668079694785926, + "grad_norm": 0.6853955382300533, + "learning_rate": 1.155349435763483e-05, + "loss": 0.7884, + "step": 2753 + }, + { + "epoch": 0.4669775328529038, + "grad_norm": 0.9961303304130427, + "learning_rate": 1.1548068507508403e-05, + "loss": 1.0039, + "step": 2754 + }, + { + "epoch": 0.4671470962272149, + "grad_norm": 0.9792270421832289, + "learning_rate": 1.1542642190402434e-05, + "loss": 0.9387, + "step": 2755 + }, + { + "epoch": 0.4673166596015261, + "grad_norm": 1.0078748568271807, + "learning_rate": 1.153721540795379e-05, + "loss": 0.9701, + "step": 2756 + }, + { + "epoch": 0.4674862229758372, + "grad_norm": 0.9350229791831816, + "learning_rate": 1.153178816179948e-05, + "loss": 0.9919, + "step": 2757 + }, + { + "epoch": 0.4676557863501484, + "grad_norm": 0.973724854481201, + "learning_rate": 1.1526360453576646e-05, + "loss": 0.9655, + "step": 2758 + }, + { + "epoch": 0.4678253497244595, + "grad_norm": 0.9272057766288919, + "learning_rate": 1.152093228492257e-05, + "loss": 0.9874, + "step": 2759 + }, + { + "epoch": 0.46799491309877067, + "grad_norm": 1.0398840623387369, + "learning_rate": 1.1515503657474678e-05, + "loss": 0.9932, + "step": 2760 + }, + { + "epoch": 0.4681644764730818, + "grad_norm": 0.946711302522535, + "learning_rate": 1.1510074572870533e-05, + "loss": 0.9671, + "step": 2761 + }, + { + "epoch": 0.46833403984739297, + "grad_norm": 0.9681468539350723, + "learning_rate": 1.1504645032747832e-05, + "loss": 1.0121, + "step": 2762 + }, + { + "epoch": 0.4685036032217041, + "grad_norm": 0.6902247782676347, + "learning_rate": 1.1499215038744413e-05, + "loss": 0.8304, + "step": 2763 + }, + { + "epoch": 0.46867316659601527, + "grad_norm": 0.9604054947185507, + "learning_rate": 1.1493784592498252e-05, + "loss": 0.9659, + "step": 2764 + }, + { + "epoch": 0.4688427299703264, + "grad_norm": 1.003480561636045, + "learning_rate": 1.1488353695647456e-05, + "loss": 0.9811, + "step": 2765 + }, + { + "epoch": 0.46901229334463757, + "grad_norm": 1.0106849453147624, + "learning_rate": 1.1482922349830279e-05, + "loss": 0.9916, + "step": 2766 + }, + { + "epoch": 0.4691818567189487, + "grad_norm": 0.9729822381769379, + "learning_rate": 1.1477490556685094e-05, + "loss": 0.9774, + "step": 2767 + }, + { + "epoch": 0.46935142009325986, + "grad_norm": 1.0088846241710518, + "learning_rate": 1.1472058317850423e-05, + "loss": 0.9927, + "step": 2768 + }, + { + "epoch": 0.469520983467571, + "grad_norm": 0.9593836706325065, + "learning_rate": 1.1466625634964911e-05, + "loss": 0.9577, + "step": 2769 + }, + { + "epoch": 0.46969054684188216, + "grad_norm": 0.988350591461139, + "learning_rate": 1.1461192509667354e-05, + "loss": 0.9923, + "step": 2770 + }, + { + "epoch": 0.4698601102161933, + "grad_norm": 0.9395954297498722, + "learning_rate": 1.145575894359666e-05, + "loss": 0.9444, + "step": 2771 + }, + { + "epoch": 0.47002967359050446, + "grad_norm": 1.0066152601115128, + "learning_rate": 1.1450324938391886e-05, + "loss": 0.9748, + "step": 2772 + }, + { + "epoch": 0.4701992369648156, + "grad_norm": 0.9831928545299302, + "learning_rate": 1.1444890495692214e-05, + "loss": 0.9345, + "step": 2773 + }, + { + "epoch": 0.47036880033912676, + "grad_norm": 0.6802422466498235, + "learning_rate": 1.1439455617136962e-05, + "loss": 0.7771, + "step": 2774 + }, + { + "epoch": 0.4705383637134379, + "grad_norm": 0.9669911260090596, + "learning_rate": 1.1434020304365578e-05, + "loss": 0.9566, + "step": 2775 + }, + { + "epoch": 0.47070792708774906, + "grad_norm": 0.9316237621690796, + "learning_rate": 1.142858455901764e-05, + "loss": 0.9789, + "step": 2776 + }, + { + "epoch": 0.4708774904620602, + "grad_norm": 0.9460866473889332, + "learning_rate": 1.1423148382732854e-05, + "loss": 0.9879, + "step": 2777 + }, + { + "epoch": 0.47104705383637135, + "grad_norm": 0.9912339021145382, + "learning_rate": 1.1417711777151061e-05, + "loss": 1.0006, + "step": 2778 + }, + { + "epoch": 0.4712166172106825, + "grad_norm": 0.9305160951666769, + "learning_rate": 1.141227474391223e-05, + "loss": 0.9832, + "step": 2779 + }, + { + "epoch": 0.47138618058499365, + "grad_norm": 0.9740815736765068, + "learning_rate": 1.1406837284656457e-05, + "loss": 1.0069, + "step": 2780 + }, + { + "epoch": 0.4715557439593048, + "grad_norm": 0.9296049467468304, + "learning_rate": 1.1401399401023974e-05, + "loss": 0.9617, + "step": 2781 + }, + { + "epoch": 0.47172530733361595, + "grad_norm": 0.9320866836270234, + "learning_rate": 1.1395961094655123e-05, + "loss": 0.9843, + "step": 2782 + }, + { + "epoch": 0.47189487070792707, + "grad_norm": 0.9353170617524206, + "learning_rate": 1.1390522367190396e-05, + "loss": 0.9668, + "step": 2783 + }, + { + "epoch": 0.47206443408223825, + "grad_norm": 0.9077143992404432, + "learning_rate": 1.13850832202704e-05, + "loss": 0.9348, + "step": 2784 + }, + { + "epoch": 0.47223399745654937, + "grad_norm": 0.9461444932481833, + "learning_rate": 1.1379643655535869e-05, + "loss": 0.9253, + "step": 2785 + }, + { + "epoch": 0.47240356083086055, + "grad_norm": 0.9762728025009527, + "learning_rate": 1.1374203674627661e-05, + "loss": 0.9704, + "step": 2786 + }, + { + "epoch": 0.47257312420517167, + "grad_norm": 0.9496984448671917, + "learning_rate": 1.1368763279186765e-05, + "loss": 0.9477, + "step": 2787 + }, + { + "epoch": 0.47274268757948285, + "grad_norm": 0.9605721320318821, + "learning_rate": 1.1363322470854294e-05, + "loss": 0.999, + "step": 2788 + }, + { + "epoch": 0.47291225095379397, + "grad_norm": 1.0018562419653159, + "learning_rate": 1.1357881251271482e-05, + "loss": 0.9615, + "step": 2789 + }, + { + "epoch": 0.47308181432810514, + "grad_norm": 0.9337119597973723, + "learning_rate": 1.1352439622079689e-05, + "loss": 0.9737, + "step": 2790 + }, + { + "epoch": 0.47325137770241626, + "grad_norm": 0.946047007858419, + "learning_rate": 1.1346997584920404e-05, + "loss": 0.9629, + "step": 2791 + }, + { + "epoch": 0.47342094107672744, + "grad_norm": 0.9411868766962924, + "learning_rate": 1.1341555141435227e-05, + "loss": 0.9581, + "step": 2792 + }, + { + "epoch": 0.47359050445103856, + "grad_norm": 0.9953648116763884, + "learning_rate": 1.1336112293265896e-05, + "loss": 0.966, + "step": 2793 + }, + { + "epoch": 0.47376006782534974, + "grad_norm": 0.9760578251261003, + "learning_rate": 1.1330669042054252e-05, + "loss": 0.9902, + "step": 2794 + }, + { + "epoch": 0.47392963119966086, + "grad_norm": 0.9861733463258744, + "learning_rate": 1.1325225389442278e-05, + "loss": 0.9835, + "step": 2795 + }, + { + "epoch": 0.47409919457397204, + "grad_norm": 0.9691020975178761, + "learning_rate": 1.131978133707206e-05, + "loss": 0.9916, + "step": 2796 + }, + { + "epoch": 0.47426875794828316, + "grad_norm": 1.0083477359357926, + "learning_rate": 1.1314336886585818e-05, + "loss": 0.9901, + "step": 2797 + }, + { + "epoch": 0.47443832132259434, + "grad_norm": 1.0021452309895236, + "learning_rate": 1.1308892039625883e-05, + "loss": 1.0061, + "step": 2798 + }, + { + "epoch": 0.47460788469690546, + "grad_norm": 0.6771713362301471, + "learning_rate": 1.1303446797834714e-05, + "loss": 0.8476, + "step": 2799 + }, + { + "epoch": 0.47477744807121663, + "grad_norm": 0.9195331223936284, + "learning_rate": 1.1298001162854883e-05, + "loss": 0.9253, + "step": 2800 + }, + { + "epoch": 0.47494701144552776, + "grad_norm": 0.9799791421900205, + "learning_rate": 1.1292555136329082e-05, + "loss": 0.9605, + "step": 2801 + }, + { + "epoch": 0.47511657481983893, + "grad_norm": 0.972421064507152, + "learning_rate": 1.1287108719900121e-05, + "loss": 0.978, + "step": 2802 + }, + { + "epoch": 0.47528613819415005, + "grad_norm": 0.9443956964967446, + "learning_rate": 1.1281661915210931e-05, + "loss": 0.9389, + "step": 2803 + }, + { + "epoch": 0.47545570156846123, + "grad_norm": 0.974196531739681, + "learning_rate": 1.127621472390455e-05, + "loss": 0.9816, + "step": 2804 + }, + { + "epoch": 0.47562526494277235, + "grad_norm": 0.967100944436867, + "learning_rate": 1.1270767147624146e-05, + "loss": 0.9669, + "step": 2805 + }, + { + "epoch": 0.47579482831708353, + "grad_norm": 0.9352863318882042, + "learning_rate": 1.1265319188012995e-05, + "loss": 0.9986, + "step": 2806 + }, + { + "epoch": 0.47596439169139465, + "grad_norm": 0.9687682470496168, + "learning_rate": 1.1259870846714488e-05, + "loss": 0.992, + "step": 2807 + }, + { + "epoch": 0.4761339550657058, + "grad_norm": 1.0189588925982813, + "learning_rate": 1.1254422125372137e-05, + "loss": 1.0128, + "step": 2808 + }, + { + "epoch": 0.47630351844001695, + "grad_norm": 0.9559207886250073, + "learning_rate": 1.1248973025629567e-05, + "loss": 0.9723, + "step": 2809 + }, + { + "epoch": 0.4764730818143281, + "grad_norm": 0.9075739820136581, + "learning_rate": 1.1243523549130509e-05, + "loss": 0.9778, + "step": 2810 + }, + { + "epoch": 0.47664264518863925, + "grad_norm": 0.9650468441790101, + "learning_rate": 1.123807369751882e-05, + "loss": 0.9732, + "step": 2811 + }, + { + "epoch": 0.4768122085629504, + "grad_norm": 0.9496596416809825, + "learning_rate": 1.1232623472438462e-05, + "loss": 0.9642, + "step": 2812 + }, + { + "epoch": 0.47698177193726154, + "grad_norm": 0.9793943299927493, + "learning_rate": 1.122717287553351e-05, + "loss": 0.9791, + "step": 2813 + }, + { + "epoch": 0.4771513353115727, + "grad_norm": 0.9434484531633723, + "learning_rate": 1.1221721908448156e-05, + "loss": 0.9885, + "step": 2814 + }, + { + "epoch": 0.47732089868588384, + "grad_norm": 0.9517378107627297, + "learning_rate": 1.1216270572826697e-05, + "loss": 0.9644, + "step": 2815 + }, + { + "epoch": 0.477490462060195, + "grad_norm": 0.9447726285358997, + "learning_rate": 1.1210818870313548e-05, + "loss": 0.9838, + "step": 2816 + }, + { + "epoch": 0.47766002543450614, + "grad_norm": 0.9791740522922071, + "learning_rate": 1.1205366802553231e-05, + "loss": 0.9941, + "step": 2817 + }, + { + "epoch": 0.4778295888088173, + "grad_norm": 0.9010910112786696, + "learning_rate": 1.1199914371190379e-05, + "loss": 0.9751, + "step": 2818 + }, + { + "epoch": 0.47799915218312844, + "grad_norm": 0.9765388123043496, + "learning_rate": 1.1194461577869733e-05, + "loss": 0.9983, + "step": 2819 + }, + { + "epoch": 0.4781687155574396, + "grad_norm": 0.9825045537887909, + "learning_rate": 1.1189008424236148e-05, + "loss": 0.9557, + "step": 2820 + }, + { + "epoch": 0.47833827893175074, + "grad_norm": 0.9763148696662606, + "learning_rate": 1.1183554911934578e-05, + "loss": 0.95, + "step": 2821 + }, + { + "epoch": 0.4785078423060619, + "grad_norm": 0.9840187905158391, + "learning_rate": 1.1178101042610097e-05, + "loss": 0.9456, + "step": 2822 + }, + { + "epoch": 0.47867740568037304, + "grad_norm": 0.6720637902574569, + "learning_rate": 1.117264681790788e-05, + "loss": 0.8293, + "step": 2823 + }, + { + "epoch": 0.4788469690546842, + "grad_norm": 0.9912807047773171, + "learning_rate": 1.1167192239473211e-05, + "loss": 0.9666, + "step": 2824 + }, + { + "epoch": 0.47901653242899533, + "grad_norm": 1.083525729628238, + "learning_rate": 1.1161737308951473e-05, + "loss": 1.009, + "step": 2825 + }, + { + "epoch": 0.4791860958033065, + "grad_norm": 0.9108461908021732, + "learning_rate": 1.115628202798817e-05, + "loss": 0.971, + "step": 2826 + }, + { + "epoch": 0.47935565917761763, + "grad_norm": 0.9810903159716253, + "learning_rate": 1.1150826398228904e-05, + "loss": 0.9537, + "step": 2827 + }, + { + "epoch": 0.4795252225519288, + "grad_norm": 0.6493094241007337, + "learning_rate": 1.1145370421319377e-05, + "loss": 0.835, + "step": 2828 + }, + { + "epoch": 0.47969478592623993, + "grad_norm": 0.9631910051508563, + "learning_rate": 1.1139914098905406e-05, + "loss": 1.0077, + "step": 2829 + }, + { + "epoch": 0.4798643493005511, + "grad_norm": 0.9663254668587328, + "learning_rate": 1.1134457432632905e-05, + "loss": 0.9961, + "step": 2830 + }, + { + "epoch": 0.48003391267486223, + "grad_norm": 1.0147527220682437, + "learning_rate": 1.1129000424147896e-05, + "loss": 0.993, + "step": 2831 + }, + { + "epoch": 0.4802034760491734, + "grad_norm": 0.9740887378030193, + "learning_rate": 1.1123543075096498e-05, + "loss": 0.9742, + "step": 2832 + }, + { + "epoch": 0.4803730394234845, + "grad_norm": 0.9685721659669371, + "learning_rate": 1.111808538712494e-05, + "loss": 0.9679, + "step": 2833 + }, + { + "epoch": 0.4805426027977957, + "grad_norm": 0.9444620189900742, + "learning_rate": 1.1112627361879546e-05, + "loss": 0.9523, + "step": 2834 + }, + { + "epoch": 0.4807121661721068, + "grad_norm": 0.9669478340579718, + "learning_rate": 1.110716900100675e-05, + "loss": 1.0119, + "step": 2835 + }, + { + "epoch": 0.48088172954641795, + "grad_norm": 0.9666713557390871, + "learning_rate": 1.1101710306153084e-05, + "loss": 0.9597, + "step": 2836 + }, + { + "epoch": 0.4810512929207291, + "grad_norm": 0.973445031849053, + "learning_rate": 1.1096251278965173e-05, + "loss": 0.9729, + "step": 2837 + }, + { + "epoch": 0.48122085629504024, + "grad_norm": 0.99955165221053, + "learning_rate": 1.1090791921089759e-05, + "loss": 0.981, + "step": 2838 + }, + { + "epoch": 0.4813904196693514, + "grad_norm": 0.9893899567080178, + "learning_rate": 1.1085332234173664e-05, + "loss": 0.9802, + "step": 2839 + }, + { + "epoch": 0.48155998304366254, + "grad_norm": 0.9708006838502847, + "learning_rate": 1.1079872219863826e-05, + "loss": 0.9598, + "step": 2840 + }, + { + "epoch": 0.4817295464179737, + "grad_norm": 0.9813826122148571, + "learning_rate": 1.1074411879807271e-05, + "loss": 0.9692, + "step": 2841 + }, + { + "epoch": 0.48189910979228484, + "grad_norm": 0.9716444981902977, + "learning_rate": 1.1068951215651132e-05, + "loss": 0.9924, + "step": 2842 + }, + { + "epoch": 0.482068673166596, + "grad_norm": 0.9363142527072663, + "learning_rate": 1.1063490229042626e-05, + "loss": 0.9493, + "step": 2843 + }, + { + "epoch": 0.48223823654090714, + "grad_norm": 0.9660608671676179, + "learning_rate": 1.105802892162908e-05, + "loss": 0.9795, + "step": 2844 + }, + { + "epoch": 0.4824077999152183, + "grad_norm": 0.9737525049145669, + "learning_rate": 1.1052567295057921e-05, + "loss": 0.9487, + "step": 2845 + }, + { + "epoch": 0.48257736328952944, + "grad_norm": 1.0007023052805744, + "learning_rate": 1.1047105350976655e-05, + "loss": 0.9436, + "step": 2846 + }, + { + "epoch": 0.4827469266638406, + "grad_norm": 0.9313339199602495, + "learning_rate": 1.1041643091032901e-05, + "loss": 0.9658, + "step": 2847 + }, + { + "epoch": 0.48291649003815174, + "grad_norm": 0.9436495247658401, + "learning_rate": 1.103618051687436e-05, + "loss": 0.9771, + "step": 2848 + }, + { + "epoch": 0.4830860534124629, + "grad_norm": 0.9752547055391775, + "learning_rate": 1.1030717630148839e-05, + "loss": 0.9547, + "step": 2849 + }, + { + "epoch": 0.48325561678677403, + "grad_norm": 0.9942573743501482, + "learning_rate": 1.1025254432504234e-05, + "loss": 0.9999, + "step": 2850 + }, + { + "epoch": 0.4834251801610852, + "grad_norm": 0.9939091504771833, + "learning_rate": 1.1019790925588535e-05, + "loss": 0.9777, + "step": 2851 + }, + { + "epoch": 0.48359474353539633, + "grad_norm": 0.9905633325054976, + "learning_rate": 1.1014327111049819e-05, + "loss": 1.0093, + "step": 2852 + }, + { + "epoch": 0.4837643069097075, + "grad_norm": 0.9552233625500757, + "learning_rate": 1.1008862990536268e-05, + "loss": 0.9624, + "step": 2853 + }, + { + "epoch": 0.48393387028401863, + "grad_norm": 0.9781441331275021, + "learning_rate": 1.1003398565696153e-05, + "loss": 0.9922, + "step": 2854 + }, + { + "epoch": 0.4841034336583298, + "grad_norm": 0.9551795529916132, + "learning_rate": 1.0997933838177828e-05, + "loss": 0.947, + "step": 2855 + }, + { + "epoch": 0.48427299703264093, + "grad_norm": 0.9353122329094952, + "learning_rate": 1.0992468809629749e-05, + "loss": 0.9409, + "step": 2856 + }, + { + "epoch": 0.4844425604069521, + "grad_norm": 0.9800619040088573, + "learning_rate": 1.0987003481700456e-05, + "loss": 0.9717, + "step": 2857 + }, + { + "epoch": 0.4846121237812632, + "grad_norm": 0.9826245280042013, + "learning_rate": 1.0981537856038584e-05, + "loss": 0.9642, + "step": 2858 + }, + { + "epoch": 0.4847816871555744, + "grad_norm": 0.9468911680078637, + "learning_rate": 1.0976071934292854e-05, + "loss": 0.9127, + "step": 2859 + }, + { + "epoch": 0.4849512505298855, + "grad_norm": 0.979572897503583, + "learning_rate": 1.0970605718112078e-05, + "loss": 0.9969, + "step": 2860 + }, + { + "epoch": 0.4851208139041967, + "grad_norm": 0.9612814191018793, + "learning_rate": 1.0965139209145153e-05, + "loss": 0.9779, + "step": 2861 + }, + { + "epoch": 0.4852903772785078, + "grad_norm": 0.9745193314954537, + "learning_rate": 1.0959672409041073e-05, + "loss": 0.9389, + "step": 2862 + }, + { + "epoch": 0.485459940652819, + "grad_norm": 0.9464011194496259, + "learning_rate": 1.0954205319448914e-05, + "loss": 0.9738, + "step": 2863 + }, + { + "epoch": 0.4856295040271301, + "grad_norm": 0.999314092772429, + "learning_rate": 1.0948737942017838e-05, + "loss": 0.9955, + "step": 2864 + }, + { + "epoch": 0.4857990674014413, + "grad_norm": 0.929910362490679, + "learning_rate": 1.0943270278397097e-05, + "loss": 0.9672, + "step": 2865 + }, + { + "epoch": 0.4859686307757524, + "grad_norm": 0.9304272700292321, + "learning_rate": 1.093780233023603e-05, + "loss": 0.9364, + "step": 2866 + }, + { + "epoch": 0.4861381941500636, + "grad_norm": 0.9769774525299784, + "learning_rate": 1.0932334099184058e-05, + "loss": 0.9693, + "step": 2867 + }, + { + "epoch": 0.4863077575243747, + "grad_norm": 0.937351768846216, + "learning_rate": 1.0926865586890689e-05, + "loss": 0.9613, + "step": 2868 + }, + { + "epoch": 0.4864773208986859, + "grad_norm": 1.020892550525075, + "learning_rate": 1.0921396795005518e-05, + "loss": 0.9438, + "step": 2869 + }, + { + "epoch": 0.486646884272997, + "grad_norm": 0.9203304923703807, + "learning_rate": 1.0915927725178222e-05, + "loss": 0.933, + "step": 2870 + }, + { + "epoch": 0.4868164476473082, + "grad_norm": 1.0009506993998527, + "learning_rate": 1.0910458379058559e-05, + "loss": 0.9692, + "step": 2871 + }, + { + "epoch": 0.4869860110216193, + "grad_norm": 1.1315260145565647, + "learning_rate": 1.090498875829638e-05, + "loss": 0.9736, + "step": 2872 + }, + { + "epoch": 0.4871555743959305, + "grad_norm": 0.9414408573825512, + "learning_rate": 1.0899518864541607e-05, + "loss": 0.9798, + "step": 2873 + }, + { + "epoch": 0.4873251377702416, + "grad_norm": 1.0199135702472304, + "learning_rate": 1.0894048699444255e-05, + "loss": 0.9968, + "step": 2874 + }, + { + "epoch": 0.4874947011445528, + "grad_norm": 0.9900960349575106, + "learning_rate": 1.088857826465441e-05, + "loss": 0.9828, + "step": 2875 + }, + { + "epoch": 0.4876642645188639, + "grad_norm": 0.9424127123870729, + "learning_rate": 1.0883107561822253e-05, + "loss": 0.9436, + "step": 2876 + }, + { + "epoch": 0.4878338278931751, + "grad_norm": 0.9884269481190834, + "learning_rate": 1.087763659259803e-05, + "loss": 1.0084, + "step": 2877 + }, + { + "epoch": 0.4880033912674862, + "grad_norm": 0.9757803065886801, + "learning_rate": 1.0872165358632083e-05, + "loss": 0.9518, + "step": 2878 + }, + { + "epoch": 0.4881729546417974, + "grad_norm": 0.9247122174281511, + "learning_rate": 1.0866693861574817e-05, + "loss": 0.9634, + "step": 2879 + }, + { + "epoch": 0.4883425180161085, + "grad_norm": 0.9631759782867425, + "learning_rate": 1.0861222103076732e-05, + "loss": 0.97, + "step": 2880 + }, + { + "epoch": 0.4885120813904197, + "grad_norm": 0.9707089403060241, + "learning_rate": 1.08557500847884e-05, + "loss": 0.9758, + "step": 2881 + }, + { + "epoch": 0.4886816447647308, + "grad_norm": 1.0047634371169654, + "learning_rate": 1.0850277808360468e-05, + "loss": 1.0051, + "step": 2882 + }, + { + "epoch": 0.488851208139042, + "grad_norm": 0.9361267158586902, + "learning_rate": 1.0844805275443673e-05, + "loss": 0.9587, + "step": 2883 + }, + { + "epoch": 0.4890207715133531, + "grad_norm": 0.9633997905161528, + "learning_rate": 1.0839332487688812e-05, + "loss": 0.965, + "step": 2884 + }, + { + "epoch": 0.4891903348876643, + "grad_norm": 0.9195097840141965, + "learning_rate": 1.0833859446746773e-05, + "loss": 0.9397, + "step": 2885 + }, + { + "epoch": 0.4893598982619754, + "grad_norm": 0.9816701098115935, + "learning_rate": 1.0828386154268516e-05, + "loss": 0.9688, + "step": 2886 + }, + { + "epoch": 0.4895294616362866, + "grad_norm": 0.9636355363670693, + "learning_rate": 1.082291261190507e-05, + "loss": 0.9519, + "step": 2887 + }, + { + "epoch": 0.4896990250105977, + "grad_norm": 0.9796397963448259, + "learning_rate": 1.0817438821307554e-05, + "loss": 0.9784, + "step": 2888 + }, + { + "epoch": 0.4898685883849089, + "grad_norm": 0.9520858824977589, + "learning_rate": 1.0811964784127145e-05, + "loss": 0.955, + "step": 2889 + }, + { + "epoch": 0.49003815175922, + "grad_norm": 0.9953003622969355, + "learning_rate": 1.080649050201511e-05, + "loss": 0.9993, + "step": 2890 + }, + { + "epoch": 0.4902077151335312, + "grad_norm": 1.011621971328168, + "learning_rate": 1.0801015976622778e-05, + "loss": 1.0004, + "step": 2891 + }, + { + "epoch": 0.4903772785078423, + "grad_norm": 0.5966081013257017, + "learning_rate": 1.0795541209601561e-05, + "loss": 0.8133, + "step": 2892 + }, + { + "epoch": 0.4905468418821535, + "grad_norm": 1.035474794914511, + "learning_rate": 1.0790066202602931e-05, + "loss": 1.0254, + "step": 2893 + }, + { + "epoch": 0.4907164052564646, + "grad_norm": 0.9440224122938426, + "learning_rate": 1.0784590957278452e-05, + "loss": 0.9625, + "step": 2894 + }, + { + "epoch": 0.49088596863077577, + "grad_norm": 0.9204862833062184, + "learning_rate": 1.0779115475279737e-05, + "loss": 0.9646, + "step": 2895 + }, + { + "epoch": 0.4910555320050869, + "grad_norm": 0.9307037885604663, + "learning_rate": 1.0773639758258487e-05, + "loss": 0.9777, + "step": 2896 + }, + { + "epoch": 0.49122509537939807, + "grad_norm": 0.9771607096899959, + "learning_rate": 1.076816380786647e-05, + "loss": 0.9534, + "step": 2897 + }, + { + "epoch": 0.4913946587537092, + "grad_norm": 0.9773072368583425, + "learning_rate": 1.076268762575552e-05, + "loss": 0.9517, + "step": 2898 + }, + { + "epoch": 0.49156422212802037, + "grad_norm": 1.0006326363280977, + "learning_rate": 1.0757211213577543e-05, + "loss": 0.9766, + "step": 2899 + }, + { + "epoch": 0.4917337855023315, + "grad_norm": 1.0170669684583467, + "learning_rate": 1.0751734572984518e-05, + "loss": 1.0346, + "step": 2900 + }, + { + "epoch": 0.49190334887664267, + "grad_norm": 0.956575148792613, + "learning_rate": 1.0746257705628491e-05, + "loss": 1.0059, + "step": 2901 + }, + { + "epoch": 0.4920729122509538, + "grad_norm": 0.9652347200437185, + "learning_rate": 1.074078061316157e-05, + "loss": 0.9608, + "step": 2902 + }, + { + "epoch": 0.49224247562526496, + "grad_norm": 0.9676806835934749, + "learning_rate": 1.0735303297235944e-05, + "loss": 0.9694, + "step": 2903 + }, + { + "epoch": 0.4924120389995761, + "grad_norm": 1.0346639528568855, + "learning_rate": 1.0729825759503856e-05, + "loss": 0.9692, + "step": 2904 + }, + { + "epoch": 0.49258160237388726, + "grad_norm": 0.9180378119773038, + "learning_rate": 1.0724348001617626e-05, + "loss": 0.9512, + "step": 2905 + }, + { + "epoch": 0.4927511657481984, + "grad_norm": 0.6080972743140745, + "learning_rate": 1.0718870025229633e-05, + "loss": 0.7682, + "step": 2906 + }, + { + "epoch": 0.49292072912250956, + "grad_norm": 0.9809803690911169, + "learning_rate": 1.0713391831992324e-05, + "loss": 0.9424, + "step": 2907 + }, + { + "epoch": 0.4930902924968207, + "grad_norm": 0.9930391794344574, + "learning_rate": 1.0707913423558219e-05, + "loss": 0.9723, + "step": 2908 + }, + { + "epoch": 0.49325985587113186, + "grad_norm": 0.9337079932901288, + "learning_rate": 1.0702434801579888e-05, + "loss": 0.9423, + "step": 2909 + }, + { + "epoch": 0.493429419245443, + "grad_norm": 0.9933996119126327, + "learning_rate": 1.0696955967709982e-05, + "loss": 0.9716, + "step": 2910 + }, + { + "epoch": 0.49359898261975416, + "grad_norm": 0.9714083348549709, + "learning_rate": 1.06914769236012e-05, + "loss": 1.0411, + "step": 2911 + }, + { + "epoch": 0.4937685459940653, + "grad_norm": 0.9274656418017742, + "learning_rate": 1.0685997670906318e-05, + "loss": 1.01, + "step": 2912 + }, + { + "epoch": 0.49393810936837645, + "grad_norm": 0.9546216973869839, + "learning_rate": 1.0680518211278169e-05, + "loss": 0.986, + "step": 2913 + }, + { + "epoch": 0.4941076727426876, + "grad_norm": 1.0016634291536237, + "learning_rate": 1.0675038546369645e-05, + "loss": 0.9731, + "step": 2914 + }, + { + "epoch": 0.49427723611699875, + "grad_norm": 1.0161099341494713, + "learning_rate": 1.0669558677833707e-05, + "loss": 0.9344, + "step": 2915 + }, + { + "epoch": 0.4944467994913099, + "grad_norm": 0.9748366163279191, + "learning_rate": 1.0664078607323367e-05, + "loss": 0.9771, + "step": 2916 + }, + { + "epoch": 0.49461636286562105, + "grad_norm": 0.9657419435484357, + "learning_rate": 1.0658598336491715e-05, + "loss": 0.9602, + "step": 2917 + }, + { + "epoch": 0.4947859262399322, + "grad_norm": 0.9817849633090451, + "learning_rate": 1.0653117866991884e-05, + "loss": 0.9747, + "step": 2918 + }, + { + "epoch": 0.49495548961424335, + "grad_norm": 1.0085780343771815, + "learning_rate": 1.0647637200477077e-05, + "loss": 0.9944, + "step": 2919 + }, + { + "epoch": 0.49512505298855447, + "grad_norm": 0.9481176895888058, + "learning_rate": 1.064215633860055e-05, + "loss": 0.94, + "step": 2920 + }, + { + "epoch": 0.49529461636286565, + "grad_norm": 0.9312999622172784, + "learning_rate": 1.063667528301563e-05, + "loss": 0.9441, + "step": 2921 + }, + { + "epoch": 0.49546417973717677, + "grad_norm": 0.9799887727456387, + "learning_rate": 1.0631194035375685e-05, + "loss": 0.9724, + "step": 2922 + }, + { + "epoch": 0.49563374311148795, + "grad_norm": 0.6105575030169635, + "learning_rate": 1.0625712597334155e-05, + "loss": 0.7755, + "step": 2923 + }, + { + "epoch": 0.49580330648579907, + "grad_norm": 0.9719192882833773, + "learning_rate": 1.062023097054453e-05, + "loss": 0.9647, + "step": 2924 + }, + { + "epoch": 0.49597286986011024, + "grad_norm": 0.9750845610038188, + "learning_rate": 1.0614749156660357e-05, + "loss": 0.9411, + "step": 2925 + }, + { + "epoch": 0.49614243323442137, + "grad_norm": 0.9644441767801027, + "learning_rate": 1.060926715733525e-05, + "loss": 0.9411, + "step": 2926 + }, + { + "epoch": 0.4963119966087325, + "grad_norm": 0.9472630444940732, + "learning_rate": 1.0603784974222862e-05, + "loss": 0.9626, + "step": 2927 + }, + { + "epoch": 0.49648155998304366, + "grad_norm": 0.9291509705610462, + "learning_rate": 1.0598302608976914e-05, + "loss": 0.9438, + "step": 2928 + }, + { + "epoch": 0.4966511233573548, + "grad_norm": 0.9714718917681049, + "learning_rate": 1.0592820063251177e-05, + "loss": 0.9468, + "step": 2929 + }, + { + "epoch": 0.49682068673166596, + "grad_norm": 0.9837006935637556, + "learning_rate": 1.0587337338699479e-05, + "loss": 0.948, + "step": 2930 + }, + { + "epoch": 0.4969902501059771, + "grad_norm": 0.9689110435583219, + "learning_rate": 1.0581854436975699e-05, + "loss": 0.9977, + "step": 2931 + }, + { + "epoch": 0.49715981348028826, + "grad_norm": 0.9195019094075366, + "learning_rate": 1.057637135973377e-05, + "loss": 0.9479, + "step": 2932 + }, + { + "epoch": 0.4973293768545994, + "grad_norm": 0.6093520159809416, + "learning_rate": 1.0570888108627682e-05, + "loss": 0.7789, + "step": 2933 + }, + { + "epoch": 0.49749894022891056, + "grad_norm": 1.0026689203143766, + "learning_rate": 1.056540468531147e-05, + "loss": 0.9967, + "step": 2934 + }, + { + "epoch": 0.4976685036032217, + "grad_norm": 0.9163752125603127, + "learning_rate": 1.0559921091439229e-05, + "loss": 0.956, + "step": 2935 + }, + { + "epoch": 0.49783806697753286, + "grad_norm": 0.6210266475224575, + "learning_rate": 1.0554437328665099e-05, + "loss": 0.787, + "step": 2936 + }, + { + "epoch": 0.498007630351844, + "grad_norm": 0.9654389042764755, + "learning_rate": 1.0548953398643276e-05, + "loss": 0.9437, + "step": 2937 + }, + { + "epoch": 0.49817719372615515, + "grad_norm": 0.9106810632404411, + "learning_rate": 1.0543469303028002e-05, + "loss": 0.9808, + "step": 2938 + }, + { + "epoch": 0.4983467571004663, + "grad_norm": 0.9461338095565945, + "learning_rate": 1.0537985043473573e-05, + "loss": 0.9899, + "step": 2939 + }, + { + "epoch": 0.49851632047477745, + "grad_norm": 0.9675449794912596, + "learning_rate": 1.0532500621634327e-05, + "loss": 0.9658, + "step": 2940 + }, + { + "epoch": 0.4986858838490886, + "grad_norm": 0.9784008593520793, + "learning_rate": 1.0527016039164665e-05, + "loss": 0.9966, + "step": 2941 + }, + { + "epoch": 0.49885544722339975, + "grad_norm": 0.9205398819597493, + "learning_rate": 1.0521531297719024e-05, + "loss": 0.9579, + "step": 2942 + }, + { + "epoch": 0.49902501059771087, + "grad_norm": 1.0040242007668332, + "learning_rate": 1.051604639895189e-05, + "loss": 0.9877, + "step": 2943 + }, + { + "epoch": 0.49919457397202205, + "grad_norm": 0.9670113771094724, + "learning_rate": 1.0510561344517802e-05, + "loss": 1.0154, + "step": 2944 + }, + { + "epoch": 0.49936413734633317, + "grad_norm": 0.927835688301253, + "learning_rate": 1.0505076136071342e-05, + "loss": 0.9557, + "step": 2945 + }, + { + "epoch": 0.49953370072064435, + "grad_norm": 0.9485398891948655, + "learning_rate": 1.0499590775267142e-05, + "loss": 0.9839, + "step": 2946 + }, + { + "epoch": 0.49970326409495547, + "grad_norm": 0.962753380642974, + "learning_rate": 1.0494105263759873e-05, + "loss": 0.9735, + "step": 2947 + }, + { + "epoch": 0.49987282746926665, + "grad_norm": 0.9304507409126845, + "learning_rate": 1.0488619603204263e-05, + "loss": 0.9189, + "step": 2948 + }, + { + "epoch": 0.5000423908435778, + "grad_norm": 0.9364607690003749, + "learning_rate": 1.0483133795255072e-05, + "loss": 0.9933, + "step": 2949 + }, + { + "epoch": 0.5002119542178889, + "grad_norm": 0.8737630997134399, + "learning_rate": 1.0477647841567113e-05, + "loss": 0.9274, + "step": 2950 + }, + { + "epoch": 0.5003815175922001, + "grad_norm": 0.9478657756581341, + "learning_rate": 1.0472161743795245e-05, + "loss": 0.974, + "step": 2951 + }, + { + "epoch": 0.5005510809665112, + "grad_norm": 0.9985040575291176, + "learning_rate": 1.0466675503594354e-05, + "loss": 0.9949, + "step": 2952 + }, + { + "epoch": 0.5007206443408224, + "grad_norm": 0.9818331672238509, + "learning_rate": 1.0461189122619394e-05, + "loss": 1.0089, + "step": 2953 + }, + { + "epoch": 0.5008902077151335, + "grad_norm": 0.9568448695807307, + "learning_rate": 1.0455702602525338e-05, + "loss": 0.9678, + "step": 2954 + }, + { + "epoch": 0.5010597710894447, + "grad_norm": 1.0665268812904696, + "learning_rate": 1.045021594496722e-05, + "loss": 0.943, + "step": 2955 + }, + { + "epoch": 0.5012293344637558, + "grad_norm": 0.9405145666754654, + "learning_rate": 1.04447291516001e-05, + "loss": 0.9429, + "step": 2956 + }, + { + "epoch": 0.501398897838067, + "grad_norm": 0.9219737585338114, + "learning_rate": 1.043924222407909e-05, + "loss": 0.937, + "step": 2957 + }, + { + "epoch": 0.5015684612123781, + "grad_norm": 0.9655345972241506, + "learning_rate": 1.0433755164059333e-05, + "loss": 0.96, + "step": 2958 + }, + { + "epoch": 0.5017380245866893, + "grad_norm": 0.9956097332699669, + "learning_rate": 1.0428267973196027e-05, + "loss": 0.9983, + "step": 2959 + }, + { + "epoch": 0.5019075879610004, + "grad_norm": 1.0435488594291606, + "learning_rate": 1.0422780653144392e-05, + "loss": 0.9839, + "step": 2960 + }, + { + "epoch": 0.5020771513353116, + "grad_norm": 1.0663192009309412, + "learning_rate": 1.0417293205559694e-05, + "loss": 0.9953, + "step": 2961 + }, + { + "epoch": 0.5022467147096227, + "grad_norm": 0.9761557822826532, + "learning_rate": 1.0411805632097242e-05, + "loss": 0.9361, + "step": 2962 + }, + { + "epoch": 0.5024162780839339, + "grad_norm": 0.9580962100611554, + "learning_rate": 1.0406317934412375e-05, + "loss": 0.9801, + "step": 2963 + }, + { + "epoch": 0.502585841458245, + "grad_norm": 0.9249506342575414, + "learning_rate": 1.040083011416048e-05, + "loss": 0.9378, + "step": 2964 + }, + { + "epoch": 0.5027554048325562, + "grad_norm": 0.9969984048374418, + "learning_rate": 1.0395342172996969e-05, + "loss": 1.0112, + "step": 2965 + }, + { + "epoch": 0.5029249682068673, + "grad_norm": 0.9965147107649625, + "learning_rate": 1.03898541125773e-05, + "loss": 0.9719, + "step": 2966 + }, + { + "epoch": 0.5030945315811785, + "grad_norm": 0.9677148166378071, + "learning_rate": 1.0384365934556958e-05, + "loss": 0.9577, + "step": 2967 + }, + { + "epoch": 0.5032640949554896, + "grad_norm": 0.9338204116639005, + "learning_rate": 1.0378877640591474e-05, + "loss": 0.9552, + "step": 2968 + }, + { + "epoch": 0.5034336583298008, + "grad_norm": 0.9300671492275149, + "learning_rate": 1.0373389232336404e-05, + "loss": 0.9503, + "step": 2969 + }, + { + "epoch": 0.5036032217041119, + "grad_norm": 0.9672889622426338, + "learning_rate": 1.0367900711447343e-05, + "loss": 0.9743, + "step": 2970 + }, + { + "epoch": 0.503772785078423, + "grad_norm": 0.9626616731845161, + "learning_rate": 1.0362412079579925e-05, + "loss": 0.9695, + "step": 2971 + }, + { + "epoch": 0.5039423484527342, + "grad_norm": 0.9519564028321347, + "learning_rate": 1.0356923338389807e-05, + "loss": 0.969, + "step": 2972 + }, + { + "epoch": 0.5041119118270454, + "grad_norm": 1.006600855018696, + "learning_rate": 1.0351434489532685e-05, + "loss": 0.9701, + "step": 2973 + }, + { + "epoch": 0.5042814752013565, + "grad_norm": 1.0055354085184096, + "learning_rate": 1.034594553466429e-05, + "loss": 0.997, + "step": 2974 + }, + { + "epoch": 0.5044510385756676, + "grad_norm": 0.922806722120795, + "learning_rate": 1.034045647544038e-05, + "loss": 0.9623, + "step": 2975 + }, + { + "epoch": 0.5046206019499788, + "grad_norm": 0.99415815482347, + "learning_rate": 1.0334967313516743e-05, + "loss": 0.97, + "step": 2976 + }, + { + "epoch": 0.50479016532429, + "grad_norm": 0.9314870741543382, + "learning_rate": 1.0329478050549208e-05, + "loss": 0.9412, + "step": 2977 + }, + { + "epoch": 0.5049597286986011, + "grad_norm": 0.9944239460890043, + "learning_rate": 1.0323988688193624e-05, + "loss": 0.9797, + "step": 2978 + }, + { + "epoch": 0.5051292920729122, + "grad_norm": 0.9552757776029114, + "learning_rate": 1.0318499228105875e-05, + "loss": 0.9669, + "step": 2979 + }, + { + "epoch": 0.5052988554472234, + "grad_norm": 0.9371255377083604, + "learning_rate": 1.031300967194187e-05, + "loss": 0.9825, + "step": 2980 + }, + { + "epoch": 0.5054684188215346, + "grad_norm": 0.9754954962516367, + "learning_rate": 1.0307520021357552e-05, + "loss": 0.9697, + "step": 2981 + }, + { + "epoch": 0.5056379821958457, + "grad_norm": 0.9679627443272669, + "learning_rate": 1.030203027800889e-05, + "loss": 0.9528, + "step": 2982 + }, + { + "epoch": 0.5058075455701568, + "grad_norm": 0.9701192527386252, + "learning_rate": 1.0296540443551884e-05, + "loss": 0.9409, + "step": 2983 + }, + { + "epoch": 0.505977108944468, + "grad_norm": 0.9761307665938758, + "learning_rate": 1.0291050519642559e-05, + "loss": 0.9688, + "step": 2984 + }, + { + "epoch": 0.5061466723187792, + "grad_norm": 1.0018002448076804, + "learning_rate": 1.0285560507936962e-05, + "loss": 0.9765, + "step": 2985 + }, + { + "epoch": 0.5063162356930903, + "grad_norm": 0.9741453527839871, + "learning_rate": 1.028007041009118e-05, + "loss": 0.9512, + "step": 2986 + }, + { + "epoch": 0.5064857990674014, + "grad_norm": 0.9822155127050368, + "learning_rate": 1.0274580227761313e-05, + "loss": 0.9944, + "step": 2987 + }, + { + "epoch": 0.5066553624417126, + "grad_norm": 0.9607327292511342, + "learning_rate": 1.026908996260349e-05, + "loss": 0.9784, + "step": 2988 + }, + { + "epoch": 0.5068249258160238, + "grad_norm": 0.9413785529013019, + "learning_rate": 1.0263599616273868e-05, + "loss": 0.9576, + "step": 2989 + }, + { + "epoch": 0.5069944891903349, + "grad_norm": 0.9882109514827956, + "learning_rate": 1.0258109190428623e-05, + "loss": 0.9583, + "step": 2990 + }, + { + "epoch": 0.507164052564646, + "grad_norm": 0.9505835701708236, + "learning_rate": 1.0252618686723963e-05, + "loss": 0.9955, + "step": 2991 + }, + { + "epoch": 0.5073336159389572, + "grad_norm": 0.9556454802454527, + "learning_rate": 1.0247128106816113e-05, + "loss": 0.9907, + "step": 2992 + }, + { + "epoch": 0.5075031793132684, + "grad_norm": 1.0174589943243582, + "learning_rate": 1.0241637452361323e-05, + "loss": 0.9463, + "step": 2993 + }, + { + "epoch": 0.5076727426875794, + "grad_norm": 0.9729391181219331, + "learning_rate": 1.0236146725015867e-05, + "loss": 0.9571, + "step": 2994 + }, + { + "epoch": 0.5078423060618906, + "grad_norm": 1.0148644473564856, + "learning_rate": 1.0230655926436037e-05, + "loss": 0.9568, + "step": 2995 + }, + { + "epoch": 0.5080118694362018, + "grad_norm": 0.6777862321081547, + "learning_rate": 1.0225165058278153e-05, + "loss": 0.829, + "step": 2996 + }, + { + "epoch": 0.508181432810513, + "grad_norm": 1.012201917788918, + "learning_rate": 1.0219674122198548e-05, + "loss": 0.9768, + "step": 2997 + }, + { + "epoch": 0.508350996184824, + "grad_norm": 1.0324866649739812, + "learning_rate": 1.0214183119853583e-05, + "loss": 0.9974, + "step": 2998 + }, + { + "epoch": 0.5085205595591352, + "grad_norm": 0.9824154571991718, + "learning_rate": 1.020869205289963e-05, + "loss": 0.9232, + "step": 2999 + }, + { + "epoch": 0.5086901229334464, + "grad_norm": 0.9511751284352469, + "learning_rate": 1.020320092299309e-05, + "loss": 0.9276, + "step": 3000 + }, + { + "epoch": 0.5088596863077576, + "grad_norm": 0.9705014870712267, + "learning_rate": 1.0197709731790375e-05, + "loss": 0.9753, + "step": 3001 + }, + { + "epoch": 0.5090292496820686, + "grad_norm": 1.0223408137316228, + "learning_rate": 1.0192218480947924e-05, + "loss": 0.9941, + "step": 3002 + }, + { + "epoch": 0.5091988130563798, + "grad_norm": 0.9958209748678288, + "learning_rate": 1.018672717212219e-05, + "loss": 0.9614, + "step": 3003 + }, + { + "epoch": 0.509368376430691, + "grad_norm": 1.0067427709667378, + "learning_rate": 1.018123580696964e-05, + "loss": 0.9625, + "step": 3004 + }, + { + "epoch": 0.5095379398050022, + "grad_norm": 1.0185727236792934, + "learning_rate": 1.0175744387146763e-05, + "loss": 0.9692, + "step": 3005 + }, + { + "epoch": 0.5097075031793132, + "grad_norm": 0.897905468371671, + "learning_rate": 1.0170252914310059e-05, + "loss": 0.8972, + "step": 3006 + }, + { + "epoch": 0.5098770665536244, + "grad_norm": 1.0066454521929236, + "learning_rate": 1.0164761390116052e-05, + "loss": 0.999, + "step": 3007 + }, + { + "epoch": 0.5100466299279356, + "grad_norm": 0.9780826297195727, + "learning_rate": 1.015926981622127e-05, + "loss": 0.9873, + "step": 3008 + }, + { + "epoch": 0.5102161933022468, + "grad_norm": 0.9340689904195036, + "learning_rate": 1.0153778194282269e-05, + "loss": 0.9558, + "step": 3009 + }, + { + "epoch": 0.5103857566765578, + "grad_norm": 0.9519678555378752, + "learning_rate": 1.014828652595561e-05, + "loss": 0.9472, + "step": 3010 + }, + { + "epoch": 0.510555320050869, + "grad_norm": 0.940992409153868, + "learning_rate": 1.0142794812897874e-05, + "loss": 0.9987, + "step": 3011 + }, + { + "epoch": 0.5107248834251802, + "grad_norm": 1.0130972467903798, + "learning_rate": 1.0137303056765651e-05, + "loss": 0.9819, + "step": 3012 + }, + { + "epoch": 0.5108944467994914, + "grad_norm": 0.9324760322202679, + "learning_rate": 1.0131811259215545e-05, + "loss": 0.9271, + "step": 3013 + }, + { + "epoch": 0.5110640101738024, + "grad_norm": 0.915884183897883, + "learning_rate": 1.0126319421904179e-05, + "loss": 0.9622, + "step": 3014 + }, + { + "epoch": 0.5112335735481136, + "grad_norm": 0.9391234432530088, + "learning_rate": 1.0120827546488175e-05, + "loss": 0.9447, + "step": 3015 + }, + { + "epoch": 0.5114031369224248, + "grad_norm": 0.985767603698776, + "learning_rate": 1.011533563462418e-05, + "loss": 0.9421, + "step": 3016 + }, + { + "epoch": 0.511572700296736, + "grad_norm": 0.9876067689540599, + "learning_rate": 1.0109843687968837e-05, + "loss": 0.9819, + "step": 3017 + }, + { + "epoch": 0.511742263671047, + "grad_norm": 0.9131684987717541, + "learning_rate": 1.0104351708178816e-05, + "loss": 0.9064, + "step": 3018 + }, + { + "epoch": 0.5119118270453582, + "grad_norm": 1.0683342866122005, + "learning_rate": 1.0098859696910788e-05, + "loss": 1.0151, + "step": 3019 + }, + { + "epoch": 0.5120813904196694, + "grad_norm": 0.9622501703699717, + "learning_rate": 1.0093367655821433e-05, + "loss": 0.9712, + "step": 3020 + }, + { + "epoch": 0.5122509537939804, + "grad_norm": 1.0675165805273066, + "learning_rate": 1.0087875586567443e-05, + "loss": 0.9321, + "step": 3021 + }, + { + "epoch": 0.5124205171682916, + "grad_norm": 0.9544672129882473, + "learning_rate": 1.0082383490805517e-05, + "loss": 0.9858, + "step": 3022 + }, + { + "epoch": 0.5125900805426028, + "grad_norm": 0.9210008923203861, + "learning_rate": 1.0076891370192364e-05, + "loss": 0.9839, + "step": 3023 + }, + { + "epoch": 0.512759643916914, + "grad_norm": 0.9236018487123309, + "learning_rate": 1.0071399226384695e-05, + "loss": 0.9536, + "step": 3024 + }, + { + "epoch": 0.512929207291225, + "grad_norm": 0.6141193255011235, + "learning_rate": 1.0065907061039234e-05, + "loss": 0.8117, + "step": 3025 + }, + { + "epoch": 0.5130987706655362, + "grad_norm": 0.9973432664803016, + "learning_rate": 1.0060414875812709e-05, + "loss": 0.9689, + "step": 3026 + }, + { + "epoch": 0.5132683340398474, + "grad_norm": 0.899647197533492, + "learning_rate": 1.0054922672361858e-05, + "loss": 0.9385, + "step": 3027 + }, + { + "epoch": 0.5134378974141586, + "grad_norm": 0.9680158077330153, + "learning_rate": 1.0049430452343412e-05, + "loss": 0.9833, + "step": 3028 + }, + { + "epoch": 0.5136074607884696, + "grad_norm": 0.9419715364760196, + "learning_rate": 1.004393821741412e-05, + "loss": 0.9832, + "step": 3029 + }, + { + "epoch": 0.5137770241627808, + "grad_norm": 1.0063487453379598, + "learning_rate": 1.0038445969230737e-05, + "loss": 0.985, + "step": 3030 + }, + { + "epoch": 0.513946587537092, + "grad_norm": 0.9910409081383873, + "learning_rate": 1.0032953709450006e-05, + "loss": 0.9566, + "step": 3031 + }, + { + "epoch": 0.5141161509114032, + "grad_norm": 0.9861444723050511, + "learning_rate": 1.0027461439728695e-05, + "loss": 0.987, + "step": 3032 + }, + { + "epoch": 0.5142857142857142, + "grad_norm": 0.9878644367724077, + "learning_rate": 1.0021969161723555e-05, + "loss": 0.9772, + "step": 3033 + }, + { + "epoch": 0.5144552776600254, + "grad_norm": 0.9995895925753583, + "learning_rate": 1.001647687709135e-05, + "loss": 0.9817, + "step": 3034 + }, + { + "epoch": 0.5146248410343366, + "grad_norm": 0.6285121486823686, + "learning_rate": 1.0010984587488846e-05, + "loss": 0.8288, + "step": 3035 + }, + { + "epoch": 0.5147944044086478, + "grad_norm": 0.6566730797848159, + "learning_rate": 1.0005492294572806e-05, + "loss": 0.8014, + "step": 3036 + }, + { + "epoch": 0.5149639677829588, + "grad_norm": 0.968587622505413, + "learning_rate": 1e-05, + "loss": 0.9096, + "step": 3037 + }, + { + "epoch": 0.51513353115727, + "grad_norm": 0.9887804687730919, + "learning_rate": 9.994507705427197e-06, + "loss": 1.0124, + "step": 3038 + }, + { + "epoch": 0.5153030945315812, + "grad_norm": 0.9916987462829414, + "learning_rate": 9.989015412511156e-06, + "loss": 0.9819, + "step": 3039 + }, + { + "epoch": 0.5154726579058924, + "grad_norm": 0.9093402823364884, + "learning_rate": 9.983523122908653e-06, + "loss": 0.974, + "step": 3040 + }, + { + "epoch": 0.5156422212802034, + "grad_norm": 0.9176245520247509, + "learning_rate": 9.97803083827645e-06, + "loss": 0.8978, + "step": 3041 + }, + { + "epoch": 0.5158117846545146, + "grad_norm": 0.9319954736686507, + "learning_rate": 9.97253856027131e-06, + "loss": 0.9518, + "step": 3042 + }, + { + "epoch": 0.5159813480288258, + "grad_norm": 0.9020564377447081, + "learning_rate": 9.967046290549994e-06, + "loss": 0.9325, + "step": 3043 + }, + { + "epoch": 0.516150911403137, + "grad_norm": 0.9482998731154629, + "learning_rate": 9.961554030769267e-06, + "loss": 0.9464, + "step": 3044 + }, + { + "epoch": 0.516320474777448, + "grad_norm": 1.0154247648085624, + "learning_rate": 9.956061782585882e-06, + "loss": 0.985, + "step": 3045 + }, + { + "epoch": 0.5164900381517592, + "grad_norm": 0.9212286029238748, + "learning_rate": 9.95056954765659e-06, + "loss": 0.96, + "step": 3046 + }, + { + "epoch": 0.5166596015260704, + "grad_norm": 1.0056753686145739, + "learning_rate": 9.945077327638146e-06, + "loss": 0.9511, + "step": 3047 + }, + { + "epoch": 0.5168291649003816, + "grad_norm": 0.9487740091666449, + "learning_rate": 9.939585124187293e-06, + "loss": 0.9773, + "step": 3048 + }, + { + "epoch": 0.5169987282746926, + "grad_norm": 0.9810533182193518, + "learning_rate": 9.934092938960769e-06, + "loss": 0.9597, + "step": 3049 + }, + { + "epoch": 0.5171682916490038, + "grad_norm": 0.917200931335745, + "learning_rate": 9.928600773615306e-06, + "loss": 0.9067, + "step": 3050 + }, + { + "epoch": 0.517337855023315, + "grad_norm": 1.0390656651226249, + "learning_rate": 9.92310862980764e-06, + "loss": 0.977, + "step": 3051 + }, + { + "epoch": 0.5175074183976262, + "grad_norm": 0.9366534736998925, + "learning_rate": 9.917616509194487e-06, + "loss": 0.9975, + "step": 3052 + }, + { + "epoch": 0.5176769817719372, + "grad_norm": 0.9627983455446224, + "learning_rate": 9.912124413432562e-06, + "loss": 0.9761, + "step": 3053 + }, + { + "epoch": 0.5178465451462484, + "grad_norm": 0.97130746211671, + "learning_rate": 9.906632344178569e-06, + "loss": 0.9461, + "step": 3054 + }, + { + "epoch": 0.5180161085205596, + "grad_norm": 1.0307864298374014, + "learning_rate": 9.901140303089216e-06, + "loss": 0.9703, + "step": 3055 + }, + { + "epoch": 0.5181856718948707, + "grad_norm": 0.9653993044907914, + "learning_rate": 9.895648291821187e-06, + "loss": 0.9681, + "step": 3056 + }, + { + "epoch": 0.5183552352691818, + "grad_norm": 0.9758498269771244, + "learning_rate": 9.890156312031165e-06, + "loss": 0.971, + "step": 3057 + }, + { + "epoch": 0.518524798643493, + "grad_norm": 0.9473653063921438, + "learning_rate": 9.884664365375824e-06, + "loss": 0.9607, + "step": 3058 + }, + { + "epoch": 0.5186943620178042, + "grad_norm": 1.0243254509080115, + "learning_rate": 9.879172453511827e-06, + "loss": 0.9841, + "step": 3059 + }, + { + "epoch": 0.5188639253921153, + "grad_norm": 0.984989518683435, + "learning_rate": 9.873680578095824e-06, + "loss": 0.9637, + "step": 3060 + }, + { + "epoch": 0.5190334887664264, + "grad_norm": 0.9573117848390525, + "learning_rate": 9.868188740784455e-06, + "loss": 0.9807, + "step": 3061 + }, + { + "epoch": 0.5192030521407376, + "grad_norm": 0.9059562497684082, + "learning_rate": 9.86269694323435e-06, + "loss": 0.929, + "step": 3062 + }, + { + "epoch": 0.5193726155150488, + "grad_norm": 0.9502470282554143, + "learning_rate": 9.85720518710213e-06, + "loss": 0.9968, + "step": 3063 + }, + { + "epoch": 0.5195421788893599, + "grad_norm": 0.9657851638421482, + "learning_rate": 9.851713474044391e-06, + "loss": 0.9568, + "step": 3064 + }, + { + "epoch": 0.519711742263671, + "grad_norm": 0.9672356013317819, + "learning_rate": 9.846221805717734e-06, + "loss": 0.9749, + "step": 3065 + }, + { + "epoch": 0.5198813056379822, + "grad_norm": 0.9340912466841508, + "learning_rate": 9.840730183778734e-06, + "loss": 0.9653, + "step": 3066 + }, + { + "epoch": 0.5200508690122934, + "grad_norm": 0.9572911428001519, + "learning_rate": 9.835238609883955e-06, + "loss": 0.9801, + "step": 3067 + }, + { + "epoch": 0.5202204323866045, + "grad_norm": 0.9462924610134427, + "learning_rate": 9.829747085689943e-06, + "loss": 0.9759, + "step": 3068 + }, + { + "epoch": 0.5203899957609156, + "grad_norm": 0.9172413013391196, + "learning_rate": 9.82425561285324e-06, + "loss": 0.938, + "step": 3069 + }, + { + "epoch": 0.5205595591352268, + "grad_norm": 0.9447327106113326, + "learning_rate": 9.818764193030364e-06, + "loss": 0.9517, + "step": 3070 + }, + { + "epoch": 0.520729122509538, + "grad_norm": 0.9852280378377472, + "learning_rate": 9.813272827877814e-06, + "loss": 0.9483, + "step": 3071 + }, + { + "epoch": 0.5208986858838491, + "grad_norm": 0.9251579287224695, + "learning_rate": 9.807781519052075e-06, + "loss": 0.9465, + "step": 3072 + }, + { + "epoch": 0.5210682492581602, + "grad_norm": 0.944118435301543, + "learning_rate": 9.802290268209627e-06, + "loss": 0.9269, + "step": 3073 + }, + { + "epoch": 0.5212378126324714, + "grad_norm": 0.9019616032939692, + "learning_rate": 9.796799077006916e-06, + "loss": 0.9369, + "step": 3074 + }, + { + "epoch": 0.5214073760067826, + "grad_norm": 0.9650637926858207, + "learning_rate": 9.79130794710037e-06, + "loss": 0.9641, + "step": 3075 + }, + { + "epoch": 0.5215769393810937, + "grad_norm": 0.653121613874621, + "learning_rate": 9.78581688014642e-06, + "loss": 0.7765, + "step": 3076 + }, + { + "epoch": 0.5217465027554048, + "grad_norm": 0.9175571756893963, + "learning_rate": 9.780325877801455e-06, + "loss": 0.9065, + "step": 3077 + }, + { + "epoch": 0.521916066129716, + "grad_norm": 0.9436051084979671, + "learning_rate": 9.77483494172185e-06, + "loss": 0.9962, + "step": 3078 + }, + { + "epoch": 0.5220856295040271, + "grad_norm": 0.9729981868750707, + "learning_rate": 9.769344073563963e-06, + "loss": 0.9223, + "step": 3079 + }, + { + "epoch": 0.5222551928783383, + "grad_norm": 1.0092633368268316, + "learning_rate": 9.763853274984135e-06, + "loss": 0.9581, + "step": 3080 + }, + { + "epoch": 0.5224247562526494, + "grad_norm": 0.9778963346359691, + "learning_rate": 9.75836254763868e-06, + "loss": 0.949, + "step": 3081 + }, + { + "epoch": 0.5225943196269606, + "grad_norm": 0.912624606806099, + "learning_rate": 9.752871893183886e-06, + "loss": 0.9711, + "step": 3082 + }, + { + "epoch": 0.5227638830012717, + "grad_norm": 0.92029296626889, + "learning_rate": 9.747381313276039e-06, + "loss": 0.9111, + "step": 3083 + }, + { + "epoch": 0.5229334463755829, + "grad_norm": 0.9320979373138509, + "learning_rate": 9.74189080957138e-06, + "loss": 0.9464, + "step": 3084 + }, + { + "epoch": 0.523103009749894, + "grad_norm": 0.9703105138491713, + "learning_rate": 9.736400383726138e-06, + "loss": 0.9342, + "step": 3085 + }, + { + "epoch": 0.5232725731242052, + "grad_norm": 0.9245762093095746, + "learning_rate": 9.730910037396514e-06, + "loss": 0.9452, + "step": 3086 + }, + { + "epoch": 0.5234421364985163, + "grad_norm": 1.1005781231672964, + "learning_rate": 9.725419772238688e-06, + "loss": 0.9697, + "step": 3087 + }, + { + "epoch": 0.5236116998728275, + "grad_norm": 0.9488140372655249, + "learning_rate": 9.719929589908823e-06, + "loss": 0.9531, + "step": 3088 + }, + { + "epoch": 0.5237812632471386, + "grad_norm": 0.929861906985759, + "learning_rate": 9.71443949206304e-06, + "loss": 0.961, + "step": 3089 + }, + { + "epoch": 0.5239508266214498, + "grad_norm": 0.9505780217453594, + "learning_rate": 9.708949480357445e-06, + "loss": 0.9577, + "step": 3090 + }, + { + "epoch": 0.5241203899957609, + "grad_norm": 0.977896863661659, + "learning_rate": 9.703459556448119e-06, + "loss": 0.9972, + "step": 3091 + }, + { + "epoch": 0.5242899533700721, + "grad_norm": 0.598807210167383, + "learning_rate": 9.697969721991114e-06, + "loss": 0.7694, + "step": 3092 + }, + { + "epoch": 0.5244595167443832, + "grad_norm": 0.9599633674631082, + "learning_rate": 9.69247997864245e-06, + "loss": 0.9581, + "step": 3093 + }, + { + "epoch": 0.5246290801186944, + "grad_norm": 0.9599831766324823, + "learning_rate": 9.686990328058134e-06, + "loss": 0.9313, + "step": 3094 + }, + { + "epoch": 0.5247986434930055, + "grad_norm": 0.9777655923261855, + "learning_rate": 9.68150077189413e-06, + "loss": 0.956, + "step": 3095 + }, + { + "epoch": 0.5249682068673167, + "grad_norm": 1.0112682590547484, + "learning_rate": 9.67601131180638e-06, + "loss": 0.9849, + "step": 3096 + }, + { + "epoch": 0.5251377702416278, + "grad_norm": 0.9246260336428133, + "learning_rate": 9.670521949450793e-06, + "loss": 0.9258, + "step": 3097 + }, + { + "epoch": 0.525307333615939, + "grad_norm": 0.9254524304974351, + "learning_rate": 9.665032686483259e-06, + "loss": 0.9731, + "step": 3098 + }, + { + "epoch": 0.5254768969902501, + "grad_norm": 0.9528109123354909, + "learning_rate": 9.659543524559626e-06, + "loss": 0.9428, + "step": 3099 + }, + { + "epoch": 0.5256464603645613, + "grad_norm": 0.9479190221075431, + "learning_rate": 9.654054465335712e-06, + "loss": 0.9291, + "step": 3100 + }, + { + "epoch": 0.5258160237388724, + "grad_norm": 0.9632111521628236, + "learning_rate": 9.648565510467316e-06, + "loss": 0.9795, + "step": 3101 + }, + { + "epoch": 0.5259855871131836, + "grad_norm": 0.9673157350605677, + "learning_rate": 9.643076661610197e-06, + "loss": 0.9901, + "step": 3102 + }, + { + "epoch": 0.5261551504874947, + "grad_norm": 0.9657160832553565, + "learning_rate": 9.63758792042008e-06, + "loss": 0.9428, + "step": 3103 + }, + { + "epoch": 0.5263247138618059, + "grad_norm": 0.9936139466607169, + "learning_rate": 9.632099288552657e-06, + "loss": 0.9935, + "step": 3104 + }, + { + "epoch": 0.526494277236117, + "grad_norm": 1.0295104051542312, + "learning_rate": 9.6266107676636e-06, + "loss": 0.9641, + "step": 3105 + }, + { + "epoch": 0.5266638406104281, + "grad_norm": 0.9720985075142662, + "learning_rate": 9.62112235940853e-06, + "loss": 0.9647, + "step": 3106 + }, + { + "epoch": 0.5268334039847393, + "grad_norm": 0.9397754196534936, + "learning_rate": 9.615634065443044e-06, + "loss": 0.9409, + "step": 3107 + }, + { + "epoch": 0.5270029673590505, + "grad_norm": 1.00418036818939, + "learning_rate": 9.610145887422703e-06, + "loss": 0.9507, + "step": 3108 + }, + { + "epoch": 0.5271725307333616, + "grad_norm": 0.9654893111080153, + "learning_rate": 9.604657827003033e-06, + "loss": 0.9517, + "step": 3109 + }, + { + "epoch": 0.5273420941076727, + "grad_norm": 0.9680503255523386, + "learning_rate": 9.599169885839523e-06, + "loss": 0.9578, + "step": 3110 + }, + { + "epoch": 0.5275116574819839, + "grad_norm": 0.9512270571636512, + "learning_rate": 9.593682065587625e-06, + "loss": 0.9701, + "step": 3111 + }, + { + "epoch": 0.527681220856295, + "grad_norm": 1.000108982315616, + "learning_rate": 9.588194367902761e-06, + "loss": 0.9728, + "step": 3112 + }, + { + "epoch": 0.5278507842306062, + "grad_norm": 0.9172975872203326, + "learning_rate": 9.58270679444031e-06, + "loss": 0.9514, + "step": 3113 + }, + { + "epoch": 0.5280203476049173, + "grad_norm": 0.9279876688315852, + "learning_rate": 9.577219346855613e-06, + "loss": 0.9692, + "step": 3114 + }, + { + "epoch": 0.5281899109792285, + "grad_norm": 0.9608482931857734, + "learning_rate": 9.571732026803978e-06, + "loss": 0.9724, + "step": 3115 + }, + { + "epoch": 0.5283594743535396, + "grad_norm": 1.030356602281444, + "learning_rate": 9.566244835940668e-06, + "loss": 0.9946, + "step": 3116 + }, + { + "epoch": 0.5285290377278508, + "grad_norm": 0.9386481852759996, + "learning_rate": 9.560757775920917e-06, + "loss": 0.9533, + "step": 3117 + }, + { + "epoch": 0.5286986011021619, + "grad_norm": 0.9592496648790197, + "learning_rate": 9.555270848399902e-06, + "loss": 0.9567, + "step": 3118 + }, + { + "epoch": 0.5288681644764731, + "grad_norm": 1.0126635642805253, + "learning_rate": 9.549784055032785e-06, + "loss": 0.9814, + "step": 3119 + }, + { + "epoch": 0.5290377278507842, + "grad_norm": 0.9587447459707307, + "learning_rate": 9.544297397474665e-06, + "loss": 0.9446, + "step": 3120 + }, + { + "epoch": 0.5292072912250954, + "grad_norm": 0.5734116702195434, + "learning_rate": 9.538810877380611e-06, + "loss": 0.766, + "step": 3121 + }, + { + "epoch": 0.5293768545994065, + "grad_norm": 0.9037505519348362, + "learning_rate": 9.533324496405647e-06, + "loss": 0.8945, + "step": 3122 + }, + { + "epoch": 0.5295464179737177, + "grad_norm": 0.9697862037813453, + "learning_rate": 9.52783825620476e-06, + "loss": 0.9534, + "step": 3123 + }, + { + "epoch": 0.5297159813480288, + "grad_norm": 0.9972142728693013, + "learning_rate": 9.522352158432889e-06, + "loss": 0.9655, + "step": 3124 + }, + { + "epoch": 0.52988554472234, + "grad_norm": 1.0022756206960943, + "learning_rate": 9.516866204744932e-06, + "loss": 0.9301, + "step": 3125 + }, + { + "epoch": 0.5300551080966511, + "grad_norm": 1.0229288603992286, + "learning_rate": 9.511380396795739e-06, + "loss": 1.0038, + "step": 3126 + }, + { + "epoch": 0.5302246714709623, + "grad_norm": 0.9491599002275521, + "learning_rate": 9.50589473624013e-06, + "loss": 0.939, + "step": 3127 + }, + { + "epoch": 0.5303942348452734, + "grad_norm": 0.9551154693589398, + "learning_rate": 9.500409224732863e-06, + "loss": 0.9466, + "step": 3128 + }, + { + "epoch": 0.5305637982195845, + "grad_norm": 0.9585231766092215, + "learning_rate": 9.49492386392866e-06, + "loss": 0.9607, + "step": 3129 + }, + { + "epoch": 0.5307333615938957, + "grad_norm": 1.0014943233391826, + "learning_rate": 9.489438655482201e-06, + "loss": 0.9353, + "step": 3130 + }, + { + "epoch": 0.5309029249682069, + "grad_norm": 0.9649535412939261, + "learning_rate": 9.483953601048116e-06, + "loss": 0.9572, + "step": 3131 + }, + { + "epoch": 0.531072488342518, + "grad_norm": 0.5954540657336224, + "learning_rate": 9.478468702280981e-06, + "loss": 0.8185, + "step": 3132 + }, + { + "epoch": 0.5312420517168291, + "grad_norm": 0.9948438984499713, + "learning_rate": 9.472983960835338e-06, + "loss": 0.9707, + "step": 3133 + }, + { + "epoch": 0.5314116150911403, + "grad_norm": 0.608261270781024, + "learning_rate": 9.467499378365675e-06, + "loss": 0.7614, + "step": 3134 + }, + { + "epoch": 0.5315811784654515, + "grad_norm": 0.9663545095139282, + "learning_rate": 9.462014956526433e-06, + "loss": 0.9818, + "step": 3135 + }, + { + "epoch": 0.5317507418397626, + "grad_norm": 0.979400893361934, + "learning_rate": 9.456530696972e-06, + "loss": 1.0009, + "step": 3136 + }, + { + "epoch": 0.5319203052140737, + "grad_norm": 0.9680298298339641, + "learning_rate": 9.451046601356725e-06, + "loss": 0.953, + "step": 3137 + }, + { + "epoch": 0.5320898685883849, + "grad_norm": 1.0595822137042479, + "learning_rate": 9.445562671334903e-06, + "loss": 1.0015, + "step": 3138 + }, + { + "epoch": 0.5322594319626961, + "grad_norm": 0.9868794204913204, + "learning_rate": 9.440078908560776e-06, + "loss": 0.9934, + "step": 3139 + }, + { + "epoch": 0.5324289953370072, + "grad_norm": 0.9686333648816564, + "learning_rate": 9.434595314688531e-06, + "loss": 0.9371, + "step": 3140 + }, + { + "epoch": 0.5325985587113183, + "grad_norm": 0.98517165692237, + "learning_rate": 9.42911189137232e-06, + "loss": 0.9592, + "step": 3141 + }, + { + "epoch": 0.5327681220856295, + "grad_norm": 0.9557894036770974, + "learning_rate": 9.423628640266232e-06, + "loss": 0.9609, + "step": 3142 + }, + { + "epoch": 0.5329376854599407, + "grad_norm": 0.9902892129234298, + "learning_rate": 9.418145563024303e-06, + "loss": 0.9895, + "step": 3143 + }, + { + "epoch": 0.5331072488342518, + "grad_norm": 1.014496521744886, + "learning_rate": 9.412662661300523e-06, + "loss": 0.9996, + "step": 3144 + }, + { + "epoch": 0.5332768122085629, + "grad_norm": 1.0140747696905492, + "learning_rate": 9.407179936748827e-06, + "loss": 1.0109, + "step": 3145 + }, + { + "epoch": 0.5334463755828741, + "grad_norm": 1.0113259951731863, + "learning_rate": 9.40169739102309e-06, + "loss": 0.9836, + "step": 3146 + }, + { + "epoch": 0.5336159389571853, + "grad_norm": 0.9764244353727426, + "learning_rate": 9.39621502577714e-06, + "loss": 0.8975, + "step": 3147 + }, + { + "epoch": 0.5337855023314964, + "grad_norm": 0.9332875412728099, + "learning_rate": 9.390732842664753e-06, + "loss": 0.9862, + "step": 3148 + }, + { + "epoch": 0.5339550657058075, + "grad_norm": 0.9488158876317377, + "learning_rate": 9.385250843339644e-06, + "loss": 0.9633, + "step": 3149 + }, + { + "epoch": 0.5341246290801187, + "grad_norm": 0.9394539013784423, + "learning_rate": 9.379769029455474e-06, + "loss": 0.9513, + "step": 3150 + }, + { + "epoch": 0.5342941924544299, + "grad_norm": 0.9978122925358326, + "learning_rate": 9.374287402665848e-06, + "loss": 0.9405, + "step": 3151 + }, + { + "epoch": 0.534463755828741, + "grad_norm": 0.9838054263417751, + "learning_rate": 9.368805964624318e-06, + "loss": 0.9563, + "step": 3152 + }, + { + "epoch": 0.5346333192030521, + "grad_norm": 0.9591081807956522, + "learning_rate": 9.363324716984375e-06, + "loss": 0.972, + "step": 3153 + }, + { + "epoch": 0.5348028825773633, + "grad_norm": 0.9606133111021694, + "learning_rate": 9.357843661399447e-06, + "loss": 0.9483, + "step": 3154 + }, + { + "epoch": 0.5349724459516745, + "grad_norm": 1.0056177630456957, + "learning_rate": 9.352362799522925e-06, + "loss": 0.956, + "step": 3155 + }, + { + "epoch": 0.5351420093259855, + "grad_norm": 0.9501690288889847, + "learning_rate": 9.34688213300812e-06, + "loss": 0.9346, + "step": 3156 + }, + { + "epoch": 0.5353115727002967, + "grad_norm": 1.001928322467254, + "learning_rate": 9.34140166350829e-06, + "loss": 0.9697, + "step": 3157 + }, + { + "epoch": 0.5354811360746079, + "grad_norm": 0.980340152009788, + "learning_rate": 9.335921392676633e-06, + "loss": 0.9822, + "step": 3158 + }, + { + "epoch": 0.5356506994489191, + "grad_norm": 0.9094156985985744, + "learning_rate": 9.330441322166297e-06, + "loss": 0.938, + "step": 3159 + }, + { + "epoch": 0.5358202628232301, + "grad_norm": 0.9587342032860251, + "learning_rate": 9.32496145363036e-06, + "loss": 0.9681, + "step": 3160 + }, + { + "epoch": 0.5359898261975413, + "grad_norm": 0.9424515384650101, + "learning_rate": 9.319481788721833e-06, + "loss": 0.9498, + "step": 3161 + }, + { + "epoch": 0.5361593895718525, + "grad_norm": 0.9408213362217274, + "learning_rate": 9.314002329093684e-06, + "loss": 0.961, + "step": 3162 + }, + { + "epoch": 0.5363289529461637, + "grad_norm": 0.9415935843340619, + "learning_rate": 9.308523076398803e-06, + "loss": 0.9821, + "step": 3163 + }, + { + "epoch": 0.5364985163204747, + "grad_norm": 0.9540494433492968, + "learning_rate": 9.303044032290023e-06, + "loss": 0.9603, + "step": 3164 + }, + { + "epoch": 0.5366680796947859, + "grad_norm": 0.953158803484815, + "learning_rate": 9.297565198420112e-06, + "loss": 0.9487, + "step": 3165 + }, + { + "epoch": 0.5368376430690971, + "grad_norm": 0.9656874069082125, + "learning_rate": 9.292086576441784e-06, + "loss": 0.9607, + "step": 3166 + }, + { + "epoch": 0.5370072064434083, + "grad_norm": 0.8936309077696528, + "learning_rate": 9.286608168007678e-06, + "loss": 0.9236, + "step": 3167 + }, + { + "epoch": 0.5371767698177193, + "grad_norm": 0.935170897507696, + "learning_rate": 9.281129974770372e-06, + "loss": 0.9407, + "step": 3168 + }, + { + "epoch": 0.5373463331920305, + "grad_norm": 0.9213500287711175, + "learning_rate": 9.275651998382377e-06, + "loss": 0.9207, + "step": 3169 + }, + { + "epoch": 0.5375158965663417, + "grad_norm": 0.95949053164204, + "learning_rate": 9.270174240496147e-06, + "loss": 0.9731, + "step": 3170 + }, + { + "epoch": 0.5376854599406529, + "grad_norm": 0.8932953520038451, + "learning_rate": 9.264696702764058e-06, + "loss": 0.9057, + "step": 3171 + }, + { + "epoch": 0.5378550233149639, + "grad_norm": 0.9555131490695726, + "learning_rate": 9.25921938683843e-06, + "loss": 0.972, + "step": 3172 + }, + { + "epoch": 0.5380245866892751, + "grad_norm": 0.9427691826625484, + "learning_rate": 9.253742294371512e-06, + "loss": 0.8964, + "step": 3173 + }, + { + "epoch": 0.5381941500635863, + "grad_norm": 0.9624938890211117, + "learning_rate": 9.248265427015486e-06, + "loss": 0.9827, + "step": 3174 + }, + { + "epoch": 0.5383637134378975, + "grad_norm": 0.9744558126028329, + "learning_rate": 9.24278878642246e-06, + "loss": 0.9622, + "step": 3175 + }, + { + "epoch": 0.5385332768122085, + "grad_norm": 1.049256052471717, + "learning_rate": 9.237312374244482e-06, + "loss": 0.9656, + "step": 3176 + }, + { + "epoch": 0.5387028401865197, + "grad_norm": 0.9388973685122264, + "learning_rate": 9.231836192133532e-06, + "loss": 0.9773, + "step": 3177 + }, + { + "epoch": 0.5388724035608309, + "grad_norm": 1.0299849643327963, + "learning_rate": 9.226360241741515e-06, + "loss": 0.9477, + "step": 3178 + }, + { + "epoch": 0.5390419669351421, + "grad_norm": 1.038873807945864, + "learning_rate": 9.220884524720265e-06, + "loss": 0.9722, + "step": 3179 + }, + { + "epoch": 0.5392115303094531, + "grad_norm": 0.9982347659838471, + "learning_rate": 9.215409042721553e-06, + "loss": 0.9635, + "step": 3180 + }, + { + "epoch": 0.5393810936837643, + "grad_norm": 0.9981752028964325, + "learning_rate": 9.20993379739707e-06, + "loss": 1.0005, + "step": 3181 + }, + { + "epoch": 0.5395506570580755, + "grad_norm": 0.9504701891411228, + "learning_rate": 9.204458790398446e-06, + "loss": 0.9732, + "step": 3182 + }, + { + "epoch": 0.5397202204323867, + "grad_norm": 0.9164495220540729, + "learning_rate": 9.198984023377222e-06, + "loss": 0.9485, + "step": 3183 + }, + { + "epoch": 0.5398897838066977, + "grad_norm": 0.9592104408855677, + "learning_rate": 9.193509497984892e-06, + "loss": 0.9495, + "step": 3184 + }, + { + "epoch": 0.5400593471810089, + "grad_norm": 0.9432299738872578, + "learning_rate": 9.188035215872858e-06, + "loss": 0.9553, + "step": 3185 + }, + { + "epoch": 0.5402289105553201, + "grad_norm": 0.9978838868228843, + "learning_rate": 9.182561178692453e-06, + "loss": 0.9198, + "step": 3186 + }, + { + "epoch": 0.5403984739296313, + "grad_norm": 0.6143182536374958, + "learning_rate": 9.177087388094931e-06, + "loss": 0.7753, + "step": 3187 + }, + { + "epoch": 0.5405680373039423, + "grad_norm": 0.9604696315916927, + "learning_rate": 9.171613845731489e-06, + "loss": 0.9546, + "step": 3188 + }, + { + "epoch": 0.5407376006782535, + "grad_norm": 0.9815014395974875, + "learning_rate": 9.16614055325323e-06, + "loss": 0.9706, + "step": 3189 + }, + { + "epoch": 0.5409071640525647, + "grad_norm": 0.970522313416515, + "learning_rate": 9.16066751231119e-06, + "loss": 0.9785, + "step": 3190 + }, + { + "epoch": 0.5410767274268758, + "grad_norm": 1.0064858052398356, + "learning_rate": 9.15519472455633e-06, + "loss": 0.9998, + "step": 3191 + }, + { + "epoch": 0.5412462908011869, + "grad_norm": 0.9396538607117793, + "learning_rate": 9.149722191639534e-06, + "loss": 0.9318, + "step": 3192 + }, + { + "epoch": 0.5414158541754981, + "grad_norm": 0.988232600138124, + "learning_rate": 9.144249915211605e-06, + "loss": 0.9932, + "step": 3193 + }, + { + "epoch": 0.5415854175498093, + "grad_norm": 0.9236500547872332, + "learning_rate": 9.13877789692327e-06, + "loss": 0.935, + "step": 3194 + }, + { + "epoch": 0.5417549809241204, + "grad_norm": 0.9618613133695751, + "learning_rate": 9.133306138425186e-06, + "loss": 0.967, + "step": 3195 + }, + { + "epoch": 0.5419245442984315, + "grad_norm": 1.0072618256658485, + "learning_rate": 9.127834641367924e-06, + "loss": 0.9615, + "step": 3196 + }, + { + "epoch": 0.5420941076727427, + "grad_norm": 0.9072413725698752, + "learning_rate": 9.122363407401971e-06, + "loss": 0.9374, + "step": 3197 + }, + { + "epoch": 0.5422636710470539, + "grad_norm": 1.0080042482678069, + "learning_rate": 9.11689243817775e-06, + "loss": 1.0207, + "step": 3198 + }, + { + "epoch": 0.542433234421365, + "grad_norm": 0.9614582449717499, + "learning_rate": 9.111421735345593e-06, + "loss": 0.993, + "step": 3199 + }, + { + "epoch": 0.5426027977956761, + "grad_norm": 0.9359002999365073, + "learning_rate": 9.10595130055575e-06, + "loss": 0.9017, + "step": 3200 + }, + { + "epoch": 0.5427723611699873, + "grad_norm": 0.9557910737230959, + "learning_rate": 9.100481135458393e-06, + "loss": 0.9541, + "step": 3201 + }, + { + "epoch": 0.5429419245442985, + "grad_norm": 0.9548124066230677, + "learning_rate": 9.095011241703623e-06, + "loss": 0.9664, + "step": 3202 + }, + { + "epoch": 0.5431114879186095, + "grad_norm": 0.9861999954084409, + "learning_rate": 9.089541620941443e-06, + "loss": 0.9865, + "step": 3203 + }, + { + "epoch": 0.5432810512929207, + "grad_norm": 0.5916469552588054, + "learning_rate": 9.084072274821783e-06, + "loss": 0.776, + "step": 3204 + }, + { + "epoch": 0.5434506146672319, + "grad_norm": 0.9700546761275866, + "learning_rate": 9.078603204994484e-06, + "loss": 0.9661, + "step": 3205 + }, + { + "epoch": 0.5436201780415431, + "grad_norm": 0.9785307286392297, + "learning_rate": 9.073134413109313e-06, + "loss": 0.9593, + "step": 3206 + }, + { + "epoch": 0.5437897414158541, + "grad_norm": 0.9971613670262534, + "learning_rate": 9.067665900815945e-06, + "loss": 0.9566, + "step": 3207 + }, + { + "epoch": 0.5439593047901653, + "grad_norm": 0.9610277355221418, + "learning_rate": 9.06219766976397e-06, + "loss": 0.9812, + "step": 3208 + }, + { + "epoch": 0.5441288681644765, + "grad_norm": 0.9785820995579787, + "learning_rate": 9.056729721602904e-06, + "loss": 0.9607, + "step": 3209 + }, + { + "epoch": 0.5442984315387877, + "grad_norm": 0.9610424481457893, + "learning_rate": 9.051262057982165e-06, + "loss": 0.967, + "step": 3210 + }, + { + "epoch": 0.5444679949130987, + "grad_norm": 0.9127160968897032, + "learning_rate": 9.04579468055109e-06, + "loss": 0.9483, + "step": 3211 + }, + { + "epoch": 0.5446375582874099, + "grad_norm": 1.018936165202702, + "learning_rate": 9.040327590958929e-06, + "loss": 0.9732, + "step": 3212 + }, + { + "epoch": 0.5448071216617211, + "grad_norm": 0.9288802712649358, + "learning_rate": 9.034860790854848e-06, + "loss": 0.9436, + "step": 3213 + }, + { + "epoch": 0.5449766850360322, + "grad_norm": 0.9614540669930163, + "learning_rate": 9.029394281887927e-06, + "loss": 0.9518, + "step": 3214 + }, + { + "epoch": 0.5451462484103433, + "grad_norm": 0.971333529820219, + "learning_rate": 9.02392806570715e-06, + "loss": 0.9531, + "step": 3215 + }, + { + "epoch": 0.5453158117846545, + "grad_norm": 0.9343879263667006, + "learning_rate": 9.01846214396142e-06, + "loss": 0.9616, + "step": 3216 + }, + { + "epoch": 0.5454853751589657, + "grad_norm": 0.9848840367297588, + "learning_rate": 9.012996518299547e-06, + "loss": 0.98, + "step": 3217 + }, + { + "epoch": 0.5456549385332768, + "grad_norm": 0.9997035590913115, + "learning_rate": 9.007531190370256e-06, + "loss": 0.9429, + "step": 3218 + }, + { + "epoch": 0.5458245019075879, + "grad_norm": 0.6256102935222682, + "learning_rate": 9.002066161822174e-06, + "loss": 0.7813, + "step": 3219 + }, + { + "epoch": 0.5459940652818991, + "grad_norm": 0.9517302996060777, + "learning_rate": 8.99660143430385e-06, + "loss": 0.9626, + "step": 3220 + }, + { + "epoch": 0.5461636286562103, + "grad_norm": 0.931969385105438, + "learning_rate": 8.991137009463735e-06, + "loss": 1.0092, + "step": 3221 + }, + { + "epoch": 0.5463331920305214, + "grad_norm": 0.9332527093376284, + "learning_rate": 8.985672888950186e-06, + "loss": 0.9688, + "step": 3222 + }, + { + "epoch": 0.5465027554048325, + "grad_norm": 0.9833687496890873, + "learning_rate": 8.980209074411469e-06, + "loss": 0.9739, + "step": 3223 + }, + { + "epoch": 0.5466723187791437, + "grad_norm": 0.9463255894954833, + "learning_rate": 8.974745567495768e-06, + "loss": 0.988, + "step": 3224 + }, + { + "epoch": 0.5468418821534549, + "grad_norm": 0.939352245355736, + "learning_rate": 8.969282369851163e-06, + "loss": 0.9423, + "step": 3225 + }, + { + "epoch": 0.547011445527766, + "grad_norm": 0.9309724381358758, + "learning_rate": 8.963819483125642e-06, + "loss": 0.9529, + "step": 3226 + }, + { + "epoch": 0.5471810089020771, + "grad_norm": 0.9335742657859284, + "learning_rate": 8.958356908967104e-06, + "loss": 0.9546, + "step": 3227 + }, + { + "epoch": 0.5473505722763883, + "grad_norm": 0.9736911346638419, + "learning_rate": 8.952894649023348e-06, + "loss": 0.9486, + "step": 3228 + }, + { + "epoch": 0.5475201356506995, + "grad_norm": 0.9527883544967858, + "learning_rate": 8.947432704942085e-06, + "loss": 0.959, + "step": 3229 + }, + { + "epoch": 0.5476896990250106, + "grad_norm": 0.9312295195517352, + "learning_rate": 8.94197107837092e-06, + "loss": 0.9481, + "step": 3230 + }, + { + "epoch": 0.5478592623993217, + "grad_norm": 0.9594724220331561, + "learning_rate": 8.936509770957377e-06, + "loss": 0.9518, + "step": 3231 + }, + { + "epoch": 0.5480288257736329, + "grad_norm": 0.9431230233611217, + "learning_rate": 8.931048784348875e-06, + "loss": 0.9311, + "step": 3232 + }, + { + "epoch": 0.548198389147944, + "grad_norm": 0.9464773100937168, + "learning_rate": 8.92558812019273e-06, + "loss": 0.9446, + "step": 3233 + }, + { + "epoch": 0.5483679525222552, + "grad_norm": 0.9628701084528413, + "learning_rate": 8.920127780136177e-06, + "loss": 0.9982, + "step": 3234 + }, + { + "epoch": 0.5485375158965663, + "grad_norm": 0.961296693273085, + "learning_rate": 8.91466776582634e-06, + "loss": 0.9466, + "step": 3235 + }, + { + "epoch": 0.5487070792708775, + "grad_norm": 1.0040081228225466, + "learning_rate": 8.909208078910246e-06, + "loss": 0.9746, + "step": 3236 + }, + { + "epoch": 0.5488766426451887, + "grad_norm": 0.9416099883424593, + "learning_rate": 8.903748721034827e-06, + "loss": 0.993, + "step": 3237 + }, + { + "epoch": 0.5490462060194998, + "grad_norm": 0.9669934175172287, + "learning_rate": 8.89828969384692e-06, + "loss": 0.9672, + "step": 3238 + }, + { + "epoch": 0.5492157693938109, + "grad_norm": 0.9980527104969046, + "learning_rate": 8.892830998993253e-06, + "loss": 0.9456, + "step": 3239 + }, + { + "epoch": 0.5493853327681221, + "grad_norm": 0.9810865142720385, + "learning_rate": 8.887372638120459e-06, + "loss": 0.9521, + "step": 3240 + }, + { + "epoch": 0.5495548961424332, + "grad_norm": 0.9830904216919492, + "learning_rate": 8.881914612875062e-06, + "loss": 0.9766, + "step": 3241 + }, + { + "epoch": 0.5497244595167444, + "grad_norm": 0.9387194021504626, + "learning_rate": 8.876456924903505e-06, + "loss": 0.942, + "step": 3242 + }, + { + "epoch": 0.5498940228910555, + "grad_norm": 0.936579158480159, + "learning_rate": 8.870999575852108e-06, + "loss": 0.9482, + "step": 3243 + }, + { + "epoch": 0.5500635862653667, + "grad_norm": 0.9514138273405043, + "learning_rate": 8.865542567367096e-06, + "loss": 0.9628, + "step": 3244 + }, + { + "epoch": 0.5502331496396778, + "grad_norm": 0.9781603168874184, + "learning_rate": 8.860085901094595e-06, + "loss": 0.9689, + "step": 3245 + }, + { + "epoch": 0.550402713013989, + "grad_norm": 0.9645598937881817, + "learning_rate": 8.854629578680625e-06, + "loss": 0.9816, + "step": 3246 + }, + { + "epoch": 0.5505722763883001, + "grad_norm": 0.9785156679703835, + "learning_rate": 8.849173601771101e-06, + "loss": 0.9678, + "step": 3247 + }, + { + "epoch": 0.5507418397626113, + "grad_norm": 0.9576872279764431, + "learning_rate": 8.84371797201183e-06, + "loss": 0.9473, + "step": 3248 + }, + { + "epoch": 0.5509114031369224, + "grad_norm": 0.9435233766027598, + "learning_rate": 8.838262691048529e-06, + "loss": 0.9481, + "step": 3249 + }, + { + "epoch": 0.5510809665112336, + "grad_norm": 0.9271689747735359, + "learning_rate": 8.832807760526796e-06, + "loss": 0.9424, + "step": 3250 + }, + { + "epoch": 0.5512505298855447, + "grad_norm": 0.9343325739945719, + "learning_rate": 8.827353182092123e-06, + "loss": 0.9391, + "step": 3251 + }, + { + "epoch": 0.5514200932598559, + "grad_norm": 0.9749226501850451, + "learning_rate": 8.821898957389906e-06, + "loss": 0.9569, + "step": 3252 + }, + { + "epoch": 0.551589656634167, + "grad_norm": 0.8984199695950069, + "learning_rate": 8.816445088065425e-06, + "loss": 0.9358, + "step": 3253 + }, + { + "epoch": 0.5517592200084782, + "grad_norm": 0.9975603942053386, + "learning_rate": 8.810991575763857e-06, + "loss": 0.947, + "step": 3254 + }, + { + "epoch": 0.5519287833827893, + "grad_norm": 0.9386562650616195, + "learning_rate": 8.805538422130268e-06, + "loss": 0.9723, + "step": 3255 + }, + { + "epoch": 0.5520983467571005, + "grad_norm": 0.9317845688007161, + "learning_rate": 8.800085628809623e-06, + "loss": 0.9755, + "step": 3256 + }, + { + "epoch": 0.5522679101314116, + "grad_norm": 0.9692964648436307, + "learning_rate": 8.79463319744677e-06, + "loss": 0.9463, + "step": 3257 + }, + { + "epoch": 0.5524374735057228, + "grad_norm": 0.9410608044864087, + "learning_rate": 8.789181129686452e-06, + "loss": 0.922, + "step": 3258 + }, + { + "epoch": 0.5526070368800339, + "grad_norm": 0.99818905255236, + "learning_rate": 8.783729427173304e-06, + "loss": 0.9579, + "step": 3259 + }, + { + "epoch": 0.552776600254345, + "grad_norm": 1.0155599095629428, + "learning_rate": 8.778278091551848e-06, + "loss": 0.973, + "step": 3260 + }, + { + "epoch": 0.5529461636286562, + "grad_norm": 0.9806864665825908, + "learning_rate": 8.772827124466495e-06, + "loss": 0.9701, + "step": 3261 + }, + { + "epoch": 0.5531157270029674, + "grad_norm": 0.9856547893160948, + "learning_rate": 8.767376527561542e-06, + "loss": 0.9755, + "step": 3262 + }, + { + "epoch": 0.5532852903772785, + "grad_norm": 0.955575853195002, + "learning_rate": 8.761926302481182e-06, + "loss": 0.9187, + "step": 3263 + }, + { + "epoch": 0.5534548537515896, + "grad_norm": 0.9847088879285761, + "learning_rate": 8.756476450869494e-06, + "loss": 0.968, + "step": 3264 + }, + { + "epoch": 0.5536244171259008, + "grad_norm": 0.9452515260850247, + "learning_rate": 8.751026974370438e-06, + "loss": 0.9258, + "step": 3265 + }, + { + "epoch": 0.553793980500212, + "grad_norm": 1.0570672541721595, + "learning_rate": 8.745577874627863e-06, + "loss": 0.9873, + "step": 3266 + }, + { + "epoch": 0.5539635438745231, + "grad_norm": 0.9717780614054712, + "learning_rate": 8.740129153285513e-06, + "loss": 0.9734, + "step": 3267 + }, + { + "epoch": 0.5541331072488342, + "grad_norm": 0.907675161376436, + "learning_rate": 8.73468081198701e-06, + "loss": 0.9261, + "step": 3268 + }, + { + "epoch": 0.5543026706231454, + "grad_norm": 0.9721569224983991, + "learning_rate": 8.729232852375855e-06, + "loss": 1.0044, + "step": 3269 + }, + { + "epoch": 0.5544722339974566, + "grad_norm": 0.8998798179279173, + "learning_rate": 8.723785276095451e-06, + "loss": 0.9248, + "step": 3270 + }, + { + "epoch": 0.5546417973717677, + "grad_norm": 0.9852597520905083, + "learning_rate": 8.718338084789074e-06, + "loss": 0.9983, + "step": 3271 + }, + { + "epoch": 0.5548113607460788, + "grad_norm": 0.9666100426462805, + "learning_rate": 8.712891280099882e-06, + "loss": 0.9468, + "step": 3272 + }, + { + "epoch": 0.55498092412039, + "grad_norm": 1.000679104836889, + "learning_rate": 8.70744486367092e-06, + "loss": 0.9802, + "step": 3273 + }, + { + "epoch": 0.5551504874947012, + "grad_norm": 0.9655056714040969, + "learning_rate": 8.701998837145119e-06, + "loss": 0.9856, + "step": 3274 + }, + { + "epoch": 0.5553200508690123, + "grad_norm": 0.9045761781178759, + "learning_rate": 8.69655320216529e-06, + "loss": 0.9269, + "step": 3275 + }, + { + "epoch": 0.5554896142433234, + "grad_norm": 0.9804541193730871, + "learning_rate": 8.691107960374117e-06, + "loss": 0.9319, + "step": 3276 + }, + { + "epoch": 0.5556591776176346, + "grad_norm": 0.9739095872897207, + "learning_rate": 8.685663113414186e-06, + "loss": 0.9554, + "step": 3277 + }, + { + "epoch": 0.5558287409919458, + "grad_norm": 0.9064579283101131, + "learning_rate": 8.680218662927944e-06, + "loss": 0.9255, + "step": 3278 + }, + { + "epoch": 0.5559983043662569, + "grad_norm": 0.600655944776659, + "learning_rate": 8.674774610557728e-06, + "loss": 0.789, + "step": 3279 + }, + { + "epoch": 0.556167867740568, + "grad_norm": 0.6520002026944826, + "learning_rate": 8.66933095794575e-06, + "loss": 0.836, + "step": 3280 + }, + { + "epoch": 0.5563374311148792, + "grad_norm": 0.9779765117275315, + "learning_rate": 8.66388770673411e-06, + "loss": 0.947, + "step": 3281 + }, + { + "epoch": 0.5565069944891904, + "grad_norm": 0.9536049964424435, + "learning_rate": 8.658444858564774e-06, + "loss": 0.9295, + "step": 3282 + }, + { + "epoch": 0.5566765578635015, + "grad_norm": 1.0439558600531138, + "learning_rate": 8.6530024150796e-06, + "loss": 1.0026, + "step": 3283 + }, + { + "epoch": 0.5568461212378126, + "grad_norm": 0.9756079126914644, + "learning_rate": 8.647560377920311e-06, + "loss": 0.9126, + "step": 3284 + }, + { + "epoch": 0.5570156846121238, + "grad_norm": 0.985298557273098, + "learning_rate": 8.64211874872852e-06, + "loss": 0.9384, + "step": 3285 + }, + { + "epoch": 0.557185247986435, + "grad_norm": 0.9825517832707542, + "learning_rate": 8.63667752914571e-06, + "loss": 0.9625, + "step": 3286 + }, + { + "epoch": 0.557354811360746, + "grad_norm": 0.9223060319293799, + "learning_rate": 8.631236720813237e-06, + "loss": 0.9491, + "step": 3287 + }, + { + "epoch": 0.5575243747350572, + "grad_norm": 0.9861505208807566, + "learning_rate": 8.625796325372342e-06, + "loss": 0.9718, + "step": 3288 + }, + { + "epoch": 0.5576939381093684, + "grad_norm": 0.9727998566983641, + "learning_rate": 8.620356344464135e-06, + "loss": 0.9151, + "step": 3289 + }, + { + "epoch": 0.5578635014836796, + "grad_norm": 0.9943456981460446, + "learning_rate": 8.614916779729603e-06, + "loss": 0.986, + "step": 3290 + }, + { + "epoch": 0.5580330648579906, + "grad_norm": 1.0006436142538297, + "learning_rate": 8.609477632809604e-06, + "loss": 0.9797, + "step": 3291 + }, + { + "epoch": 0.5582026282323018, + "grad_norm": 0.9849960049022215, + "learning_rate": 8.604038905344879e-06, + "loss": 0.9683, + "step": 3292 + }, + { + "epoch": 0.558372191606613, + "grad_norm": 0.9790641467557808, + "learning_rate": 8.598600598976033e-06, + "loss": 0.9631, + "step": 3293 + }, + { + "epoch": 0.5585417549809241, + "grad_norm": 0.9289067313542648, + "learning_rate": 8.593162715343543e-06, + "loss": 0.956, + "step": 3294 + }, + { + "epoch": 0.5587113183552352, + "grad_norm": 0.9955027413227778, + "learning_rate": 8.587725256087771e-06, + "loss": 0.9424, + "step": 3295 + }, + { + "epoch": 0.5588808817295464, + "grad_norm": 0.9361243826157685, + "learning_rate": 8.582288222848942e-06, + "loss": 0.9516, + "step": 3296 + }, + { + "epoch": 0.5590504451038576, + "grad_norm": 0.977667619187527, + "learning_rate": 8.576851617267151e-06, + "loss": 0.976, + "step": 3297 + }, + { + "epoch": 0.5592200084781687, + "grad_norm": 0.9664295663231345, + "learning_rate": 8.571415440982364e-06, + "loss": 0.9825, + "step": 3298 + }, + { + "epoch": 0.5593895718524798, + "grad_norm": 0.9551142769696808, + "learning_rate": 8.565979695634426e-06, + "loss": 0.9864, + "step": 3299 + }, + { + "epoch": 0.559559135226791, + "grad_norm": 0.9446150664079876, + "learning_rate": 8.56054438286304e-06, + "loss": 0.9605, + "step": 3300 + }, + { + "epoch": 0.5597286986011022, + "grad_norm": 0.961298702731066, + "learning_rate": 8.55510950430779e-06, + "loss": 0.9627, + "step": 3301 + }, + { + "epoch": 0.5598982619754133, + "grad_norm": 0.9558230934447186, + "learning_rate": 8.549675061608117e-06, + "loss": 0.9734, + "step": 3302 + }, + { + "epoch": 0.5600678253497244, + "grad_norm": 0.9742786830999808, + "learning_rate": 8.544241056403344e-06, + "loss": 0.9359, + "step": 3303 + }, + { + "epoch": 0.5602373887240356, + "grad_norm": 0.9073722060064696, + "learning_rate": 8.538807490332653e-06, + "loss": 0.9078, + "step": 3304 + }, + { + "epoch": 0.5604069520983468, + "grad_norm": 0.9525767751923666, + "learning_rate": 8.533374365035089e-06, + "loss": 0.9622, + "step": 3305 + }, + { + "epoch": 0.5605765154726579, + "grad_norm": 0.9714734901097564, + "learning_rate": 8.52794168214958e-06, + "loss": 0.9716, + "step": 3306 + }, + { + "epoch": 0.560746078846969, + "grad_norm": 0.9566782567470775, + "learning_rate": 8.52250944331491e-06, + "loss": 0.9319, + "step": 3307 + }, + { + "epoch": 0.5609156422212802, + "grad_norm": 0.9793017871204114, + "learning_rate": 8.517077650169724e-06, + "loss": 0.9402, + "step": 3308 + }, + { + "epoch": 0.5610852055955914, + "grad_norm": 0.9599908859188653, + "learning_rate": 8.511646304352544e-06, + "loss": 0.958, + "step": 3309 + }, + { + "epoch": 0.5612547689699025, + "grad_norm": 0.9744963651654486, + "learning_rate": 8.50621540750175e-06, + "loss": 0.9818, + "step": 3310 + }, + { + "epoch": 0.5614243323442136, + "grad_norm": 0.9638906191822316, + "learning_rate": 8.50078496125559e-06, + "loss": 0.9627, + "step": 3311 + }, + { + "epoch": 0.5615938957185248, + "grad_norm": 1.0335600205515596, + "learning_rate": 8.49535496725217e-06, + "loss": 0.9818, + "step": 3312 + }, + { + "epoch": 0.561763459092836, + "grad_norm": 0.9470592056742406, + "learning_rate": 8.489925427129469e-06, + "loss": 0.9433, + "step": 3313 + }, + { + "epoch": 0.561933022467147, + "grad_norm": 0.9630904631982459, + "learning_rate": 8.484496342525325e-06, + "loss": 0.9667, + "step": 3314 + }, + { + "epoch": 0.5621025858414582, + "grad_norm": 0.9797786858758676, + "learning_rate": 8.479067715077435e-06, + "loss": 0.9482, + "step": 3315 + }, + { + "epoch": 0.5622721492157694, + "grad_norm": 0.9875586320081415, + "learning_rate": 8.473639546423358e-06, + "loss": 0.9532, + "step": 3316 + }, + { + "epoch": 0.5624417125900806, + "grad_norm": 0.978358266211283, + "learning_rate": 8.468211838200523e-06, + "loss": 0.9453, + "step": 3317 + }, + { + "epoch": 0.5626112759643916, + "grad_norm": 0.9859125172498366, + "learning_rate": 8.462784592046212e-06, + "loss": 0.9734, + "step": 3318 + }, + { + "epoch": 0.5627808393387028, + "grad_norm": 1.0042691411047522, + "learning_rate": 8.457357809597571e-06, + "loss": 0.9502, + "step": 3319 + }, + { + "epoch": 0.562950402713014, + "grad_norm": 0.6499226110680854, + "learning_rate": 8.4519314924916e-06, + "loss": 0.8187, + "step": 3320 + }, + { + "epoch": 0.5631199660873252, + "grad_norm": 0.9633707636307078, + "learning_rate": 8.446505642365174e-06, + "loss": 0.9299, + "step": 3321 + }, + { + "epoch": 0.5632895294616362, + "grad_norm": 1.019429594447877, + "learning_rate": 8.44108026085501e-06, + "loss": 0.9615, + "step": 3322 + }, + { + "epoch": 0.5634590928359474, + "grad_norm": 0.9605392324906264, + "learning_rate": 8.43565534959769e-06, + "loss": 0.9322, + "step": 3323 + }, + { + "epoch": 0.5636286562102586, + "grad_norm": 0.9429404524863974, + "learning_rate": 8.430230910229662e-06, + "loss": 0.9289, + "step": 3324 + }, + { + "epoch": 0.5637982195845698, + "grad_norm": 0.9257498287400514, + "learning_rate": 8.424806944387219e-06, + "loss": 0.8815, + "step": 3325 + }, + { + "epoch": 0.5639677829588808, + "grad_norm": 0.9483105817363174, + "learning_rate": 8.419383453706516e-06, + "loss": 0.9748, + "step": 3326 + }, + { + "epoch": 0.564137346333192, + "grad_norm": 0.9883552544764169, + "learning_rate": 8.413960439823567e-06, + "loss": 0.9755, + "step": 3327 + }, + { + "epoch": 0.5643069097075032, + "grad_norm": 1.0235268886835087, + "learning_rate": 8.40853790437424e-06, + "loss": 0.9519, + "step": 3328 + }, + { + "epoch": 0.5644764730818144, + "grad_norm": 0.992044632582503, + "learning_rate": 8.40311584899426e-06, + "loss": 0.9377, + "step": 3329 + }, + { + "epoch": 0.5646460364561254, + "grad_norm": 0.6396221583500626, + "learning_rate": 8.397694275319204e-06, + "loss": 0.8205, + "step": 3330 + }, + { + "epoch": 0.5648155998304366, + "grad_norm": 0.903535773676659, + "learning_rate": 8.39227318498451e-06, + "loss": 0.9539, + "step": 3331 + }, + { + "epoch": 0.5649851632047478, + "grad_norm": 0.930047968226833, + "learning_rate": 8.386852579625467e-06, + "loss": 0.9242, + "step": 3332 + }, + { + "epoch": 0.565154726579059, + "grad_norm": 1.0568057248415546, + "learning_rate": 8.381432460877213e-06, + "loss": 1.0074, + "step": 3333 + }, + { + "epoch": 0.56532428995337, + "grad_norm": 0.9377389123973981, + "learning_rate": 8.37601283037474e-06, + "loss": 0.9007, + "step": 3334 + }, + { + "epoch": 0.5654938533276812, + "grad_norm": 0.9488911278192853, + "learning_rate": 8.370593689752905e-06, + "loss": 0.9565, + "step": 3335 + }, + { + "epoch": 0.5656634167019924, + "grad_norm": 0.9650785402173176, + "learning_rate": 8.365175040646403e-06, + "loss": 0.972, + "step": 3336 + }, + { + "epoch": 0.5658329800763036, + "grad_norm": 0.9832293633440671, + "learning_rate": 8.359756884689785e-06, + "loss": 0.9857, + "step": 3337 + }, + { + "epoch": 0.5660025434506146, + "grad_norm": 0.951337711201333, + "learning_rate": 8.354339223517452e-06, + "loss": 1.0001, + "step": 3338 + }, + { + "epoch": 0.5661721068249258, + "grad_norm": 0.926233023741234, + "learning_rate": 8.348922058763667e-06, + "loss": 0.9733, + "step": 3339 + }, + { + "epoch": 0.566341670199237, + "grad_norm": 0.9499865274162356, + "learning_rate": 8.343505392062526e-06, + "loss": 0.9883, + "step": 3340 + }, + { + "epoch": 0.5665112335735482, + "grad_norm": 1.003365566400872, + "learning_rate": 8.338089225047983e-06, + "loss": 1.0004, + "step": 3341 + }, + { + "epoch": 0.5666807969478592, + "grad_norm": 0.9812993023009497, + "learning_rate": 8.332673559353845e-06, + "loss": 0.9554, + "step": 3342 + }, + { + "epoch": 0.5668503603221704, + "grad_norm": 0.9463155243193977, + "learning_rate": 8.327258396613766e-06, + "loss": 0.927, + "step": 3343 + }, + { + "epoch": 0.5670199236964816, + "grad_norm": 0.9915837655586575, + "learning_rate": 8.32184373846124e-06, + "loss": 0.9681, + "step": 3344 + }, + { + "epoch": 0.5671894870707928, + "grad_norm": 0.9888268207601653, + "learning_rate": 8.316429586529616e-06, + "loss": 0.9894, + "step": 3345 + }, + { + "epoch": 0.5673590504451038, + "grad_norm": 0.9572204963855915, + "learning_rate": 8.311015942452091e-06, + "loss": 0.975, + "step": 3346 + }, + { + "epoch": 0.567528613819415, + "grad_norm": 0.6399731668243899, + "learning_rate": 8.30560280786171e-06, + "loss": 0.7878, + "step": 3347 + }, + { + "epoch": 0.5676981771937262, + "grad_norm": 0.6290090000169403, + "learning_rate": 8.300190184391353e-06, + "loss": 0.7805, + "step": 3348 + }, + { + "epoch": 0.5678677405680373, + "grad_norm": 0.9290403433601735, + "learning_rate": 8.294778073673762e-06, + "loss": 0.9716, + "step": 3349 + }, + { + "epoch": 0.5680373039423484, + "grad_norm": 1.0321282648864374, + "learning_rate": 8.289366477341517e-06, + "loss": 0.9697, + "step": 3350 + }, + { + "epoch": 0.5682068673166596, + "grad_norm": 0.9563499507339595, + "learning_rate": 8.283955397027037e-06, + "loss": 0.9488, + "step": 3351 + }, + { + "epoch": 0.5683764306909708, + "grad_norm": 0.9527895390347536, + "learning_rate": 8.278544834362592e-06, + "loss": 0.9348, + "step": 3352 + }, + { + "epoch": 0.568545994065282, + "grad_norm": 0.9447734550187241, + "learning_rate": 8.273134790980295e-06, + "loss": 0.9956, + "step": 3353 + }, + { + "epoch": 0.568715557439593, + "grad_norm": 0.9513491951758145, + "learning_rate": 8.267725268512104e-06, + "loss": 0.9166, + "step": 3354 + }, + { + "epoch": 0.5688851208139042, + "grad_norm": 1.0310956717415067, + "learning_rate": 8.262316268589815e-06, + "loss": 0.9476, + "step": 3355 + }, + { + "epoch": 0.5690546841882154, + "grad_norm": 0.9791170386060745, + "learning_rate": 8.256907792845073e-06, + "loss": 0.9677, + "step": 3356 + }, + { + "epoch": 0.5692242475625265, + "grad_norm": 0.9749652269356058, + "learning_rate": 8.251499842909358e-06, + "loss": 0.9435, + "step": 3357 + }, + { + "epoch": 0.5693938109368376, + "grad_norm": 0.9984432953782569, + "learning_rate": 8.246092420413996e-06, + "loss": 0.9879, + "step": 3358 + }, + { + "epoch": 0.5695633743111488, + "grad_norm": 0.973993978598013, + "learning_rate": 8.240685526990147e-06, + "loss": 0.9389, + "step": 3359 + }, + { + "epoch": 0.56973293768546, + "grad_norm": 1.0096777408756197, + "learning_rate": 8.235279164268823e-06, + "loss": 0.9682, + "step": 3360 + }, + { + "epoch": 0.5699025010597711, + "grad_norm": 0.9568364914035112, + "learning_rate": 8.22987333388087e-06, + "loss": 0.9341, + "step": 3361 + }, + { + "epoch": 0.5700720644340822, + "grad_norm": 0.9520925152229687, + "learning_rate": 8.224468037456969e-06, + "loss": 0.9811, + "step": 3362 + }, + { + "epoch": 0.5702416278083934, + "grad_norm": 0.9440168638043864, + "learning_rate": 8.219063276627646e-06, + "loss": 0.9511, + "step": 3363 + }, + { + "epoch": 0.5704111911827046, + "grad_norm": 1.0079233348432537, + "learning_rate": 8.213659053023263e-06, + "loss": 1.0019, + "step": 3364 + }, + { + "epoch": 0.5705807545570157, + "grad_norm": 0.9502092413812859, + "learning_rate": 8.20825536827402e-06, + "loss": 0.9757, + "step": 3365 + }, + { + "epoch": 0.5707503179313268, + "grad_norm": 0.9557558413318101, + "learning_rate": 8.202852224009955e-06, + "loss": 0.956, + "step": 3366 + }, + { + "epoch": 0.570919881305638, + "grad_norm": 0.9547814830155817, + "learning_rate": 8.197449621860944e-06, + "loss": 0.9495, + "step": 3367 + }, + { + "epoch": 0.5710894446799492, + "grad_norm": 0.9580283694113226, + "learning_rate": 8.192047563456697e-06, + "loss": 0.9601, + "step": 3368 + }, + { + "epoch": 0.5712590080542603, + "grad_norm": 0.9606886317343701, + "learning_rate": 8.186646050426763e-06, + "loss": 0.9359, + "step": 3369 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.9749029796383327, + "learning_rate": 8.181245084400518e-06, + "loss": 0.9339, + "step": 3370 + }, + { + "epoch": 0.5715981348028826, + "grad_norm": 0.9288222010534315, + "learning_rate": 8.175844667007191e-06, + "loss": 0.9703, + "step": 3371 + }, + { + "epoch": 0.5717676981771938, + "grad_norm": 0.9664087517987092, + "learning_rate": 8.170444799875827e-06, + "loss": 0.922, + "step": 3372 + }, + { + "epoch": 0.5719372615515049, + "grad_norm": 0.9926995136485463, + "learning_rate": 8.16504548463531e-06, + "loss": 0.98, + "step": 3373 + }, + { + "epoch": 0.572106824925816, + "grad_norm": 0.9347949466978872, + "learning_rate": 8.159646722914368e-06, + "loss": 0.9647, + "step": 3374 + }, + { + "epoch": 0.5722763883001272, + "grad_norm": 0.9186331637886513, + "learning_rate": 8.154248516341547e-06, + "loss": 0.9234, + "step": 3375 + }, + { + "epoch": 0.5724459516744383, + "grad_norm": 0.9714013758547416, + "learning_rate": 8.148850866545236e-06, + "loss": 0.9778, + "step": 3376 + }, + { + "epoch": 0.5726155150487495, + "grad_norm": 0.9299230138369605, + "learning_rate": 8.143453775153646e-06, + "loss": 0.9405, + "step": 3377 + }, + { + "epoch": 0.5727850784230606, + "grad_norm": 1.012276157389143, + "learning_rate": 8.138057243794834e-06, + "loss": 0.9651, + "step": 3378 + }, + { + "epoch": 0.5729546417973718, + "grad_norm": 0.962102975451542, + "learning_rate": 8.132661274096676e-06, + "loss": 0.98, + "step": 3379 + }, + { + "epoch": 0.5731242051716829, + "grad_norm": 0.9348421325950335, + "learning_rate": 8.127265867686884e-06, + "loss": 0.9499, + "step": 3380 + }, + { + "epoch": 0.5732937685459941, + "grad_norm": 0.9279097201506896, + "learning_rate": 8.121871026192996e-06, + "loss": 0.9339, + "step": 3381 + }, + { + "epoch": 0.5734633319203052, + "grad_norm": 0.9699226917161088, + "learning_rate": 8.116476751242386e-06, + "loss": 0.9532, + "step": 3382 + }, + { + "epoch": 0.5736328952946164, + "grad_norm": 0.8884174528377796, + "learning_rate": 8.11108304446225e-06, + "loss": 0.9116, + "step": 3383 + }, + { + "epoch": 0.5738024586689275, + "grad_norm": 0.9705545532168752, + "learning_rate": 8.105689907479613e-06, + "loss": 0.9845, + "step": 3384 + }, + { + "epoch": 0.5739720220432386, + "grad_norm": 0.9644779930523806, + "learning_rate": 8.100297341921342e-06, + "loss": 0.9539, + "step": 3385 + }, + { + "epoch": 0.5741415854175498, + "grad_norm": 0.9238751569239929, + "learning_rate": 8.094905349414111e-06, + "loss": 0.9683, + "step": 3386 + }, + { + "epoch": 0.574311148791861, + "grad_norm": 0.9905563243323852, + "learning_rate": 8.089513931584437e-06, + "loss": 0.9724, + "step": 3387 + }, + { + "epoch": 0.5744807121661721, + "grad_norm": 0.9833194998413887, + "learning_rate": 8.084123090058646e-06, + "loss": 0.9872, + "step": 3388 + }, + { + "epoch": 0.5746502755404832, + "grad_norm": 1.0046111484536078, + "learning_rate": 8.078732826462917e-06, + "loss": 0.9712, + "step": 3389 + }, + { + "epoch": 0.5748198389147944, + "grad_norm": 1.0034997795590312, + "learning_rate": 8.07334314242323e-06, + "loss": 0.9692, + "step": 3390 + }, + { + "epoch": 0.5749894022891056, + "grad_norm": 0.9167965029448817, + "learning_rate": 8.067954039565402e-06, + "loss": 0.9416, + "step": 3391 + }, + { + "epoch": 0.5751589656634167, + "grad_norm": 0.9312496209230505, + "learning_rate": 8.062565519515072e-06, + "loss": 0.9652, + "step": 3392 + }, + { + "epoch": 0.5753285290377278, + "grad_norm": 0.9593204939327644, + "learning_rate": 8.057177583897704e-06, + "loss": 0.9571, + "step": 3393 + }, + { + "epoch": 0.575498092412039, + "grad_norm": 0.9164609626221537, + "learning_rate": 8.051790234338584e-06, + "loss": 0.9214, + "step": 3394 + }, + { + "epoch": 0.5756676557863502, + "grad_norm": 0.9452808985904894, + "learning_rate": 8.046403472462818e-06, + "loss": 0.9498, + "step": 3395 + }, + { + "epoch": 0.5758372191606613, + "grad_norm": 0.956539693736721, + "learning_rate": 8.041017299895347e-06, + "loss": 0.9547, + "step": 3396 + }, + { + "epoch": 0.5760067825349724, + "grad_norm": 0.9222433050607953, + "learning_rate": 8.035631718260923e-06, + "loss": 0.9739, + "step": 3397 + }, + { + "epoch": 0.5761763459092836, + "grad_norm": 0.9397494698910815, + "learning_rate": 8.030246729184124e-06, + "loss": 0.8946, + "step": 3398 + }, + { + "epoch": 0.5763459092835947, + "grad_norm": 0.9744502547093705, + "learning_rate": 8.024862334289345e-06, + "loss": 0.9309, + "step": 3399 + }, + { + "epoch": 0.5765154726579059, + "grad_norm": 0.9933423687334322, + "learning_rate": 8.019478535200807e-06, + "loss": 0.9898, + "step": 3400 + }, + { + "epoch": 0.576685036032217, + "grad_norm": 0.9519146021318989, + "learning_rate": 8.014095333542548e-06, + "loss": 0.9677, + "step": 3401 + }, + { + "epoch": 0.5768545994065282, + "grad_norm": 0.9887510770362908, + "learning_rate": 8.008712730938426e-06, + "loss": 0.9311, + "step": 3402 + }, + { + "epoch": 0.5770241627808393, + "grad_norm": 1.0400176262061043, + "learning_rate": 8.003330729012124e-06, + "loss": 0.9636, + "step": 3403 + }, + { + "epoch": 0.5771937261551505, + "grad_norm": 1.0136121022209934, + "learning_rate": 7.997949329387138e-06, + "loss": 0.9027, + "step": 3404 + }, + { + "epoch": 0.5773632895294616, + "grad_norm": 0.9759125784771253, + "learning_rate": 7.992568533686782e-06, + "loss": 0.9357, + "step": 3405 + }, + { + "epoch": 0.5775328529037728, + "grad_norm": 0.9521657987607298, + "learning_rate": 7.987188343534184e-06, + "loss": 0.9419, + "step": 3406 + }, + { + "epoch": 0.5777024162780839, + "grad_norm": 0.9482680969680309, + "learning_rate": 7.981808760552305e-06, + "loss": 0.9525, + "step": 3407 + }, + { + "epoch": 0.5778719796523951, + "grad_norm": 0.9492045026487005, + "learning_rate": 7.976429786363906e-06, + "loss": 0.9766, + "step": 3408 + }, + { + "epoch": 0.5780415430267062, + "grad_norm": 1.057205188112986, + "learning_rate": 7.971051422591571e-06, + "loss": 0.962, + "step": 3409 + }, + { + "epoch": 0.5782111064010174, + "grad_norm": 0.7093557363883277, + "learning_rate": 7.965673670857702e-06, + "loss": 0.825, + "step": 3410 + }, + { + "epoch": 0.5783806697753285, + "grad_norm": 1.0249844179693937, + "learning_rate": 7.960296532784515e-06, + "loss": 0.973, + "step": 3411 + }, + { + "epoch": 0.5785502331496397, + "grad_norm": 0.9710465336548872, + "learning_rate": 7.954920009994035e-06, + "loss": 0.9474, + "step": 3412 + }, + { + "epoch": 0.5787197965239508, + "grad_norm": 0.9064305958036404, + "learning_rate": 7.949544104108107e-06, + "loss": 0.9525, + "step": 3413 + }, + { + "epoch": 0.578889359898262, + "grad_norm": 0.9745465837372795, + "learning_rate": 7.944168816748396e-06, + "loss": 0.947, + "step": 3414 + }, + { + "epoch": 0.5790589232725731, + "grad_norm": 0.933589251932011, + "learning_rate": 7.938794149536367e-06, + "loss": 0.9398, + "step": 3415 + }, + { + "epoch": 0.5792284866468843, + "grad_norm": 0.9811964817017039, + "learning_rate": 7.933420104093308e-06, + "loss": 0.9952, + "step": 3416 + }, + { + "epoch": 0.5793980500211954, + "grad_norm": 0.9260992005918665, + "learning_rate": 7.928046682040311e-06, + "loss": 0.9504, + "step": 3417 + }, + { + "epoch": 0.5795676133955066, + "grad_norm": 1.0129467373354948, + "learning_rate": 7.922673884998291e-06, + "loss": 0.9887, + "step": 3418 + }, + { + "epoch": 0.5797371767698177, + "grad_norm": 0.9490030311047655, + "learning_rate": 7.917301714587968e-06, + "loss": 0.9905, + "step": 3419 + }, + { + "epoch": 0.5799067401441289, + "grad_norm": 0.9402054243692838, + "learning_rate": 7.911930172429865e-06, + "loss": 0.9455, + "step": 3420 + }, + { + "epoch": 0.58007630351844, + "grad_norm": 0.9219371791575302, + "learning_rate": 7.906559260144336e-06, + "loss": 0.9317, + "step": 3421 + }, + { + "epoch": 0.5802458668927511, + "grad_norm": 0.9109996257957591, + "learning_rate": 7.901188979351527e-06, + "loss": 0.9678, + "step": 3422 + }, + { + "epoch": 0.5804154302670623, + "grad_norm": 1.0039971883743508, + "learning_rate": 7.895819331671399e-06, + "loss": 0.9584, + "step": 3423 + }, + { + "epoch": 0.5805849936413735, + "grad_norm": 0.9984601258797432, + "learning_rate": 7.890450318723719e-06, + "loss": 0.9436, + "step": 3424 + }, + { + "epoch": 0.5807545570156846, + "grad_norm": 0.9748430329698677, + "learning_rate": 7.885081942128074e-06, + "loss": 0.9026, + "step": 3425 + }, + { + "epoch": 0.5809241203899957, + "grad_norm": 0.9784752790240956, + "learning_rate": 7.879714203503848e-06, + "loss": 0.9627, + "step": 3426 + }, + { + "epoch": 0.5810936837643069, + "grad_norm": 0.9166479974176474, + "learning_rate": 7.874347104470234e-06, + "loss": 0.9336, + "step": 3427 + }, + { + "epoch": 0.5812632471386181, + "grad_norm": 0.971997209431258, + "learning_rate": 7.868980646646235e-06, + "loss": 0.9851, + "step": 3428 + }, + { + "epoch": 0.5814328105129292, + "grad_norm": 0.9609867211804319, + "learning_rate": 7.863614831650658e-06, + "loss": 0.9524, + "step": 3429 + }, + { + "epoch": 0.5816023738872403, + "grad_norm": 0.9772767084510783, + "learning_rate": 7.858249661102118e-06, + "loss": 0.9761, + "step": 3430 + }, + { + "epoch": 0.5817719372615515, + "grad_norm": 0.9758343214781775, + "learning_rate": 7.852885136619031e-06, + "loss": 0.8919, + "step": 3431 + }, + { + "epoch": 0.5819415006358627, + "grad_norm": 0.6287788493208274, + "learning_rate": 7.84752125981963e-06, + "loss": 0.7989, + "step": 3432 + }, + { + "epoch": 0.5821110640101738, + "grad_norm": 0.9456333877649048, + "learning_rate": 7.84215803232194e-06, + "loss": 0.9407, + "step": 3433 + }, + { + "epoch": 0.5822806273844849, + "grad_norm": 0.997972350404109, + "learning_rate": 7.836795455743796e-06, + "loss": 0.9647, + "step": 3434 + }, + { + "epoch": 0.5824501907587961, + "grad_norm": 1.0016519106105044, + "learning_rate": 7.831433531702831e-06, + "loss": 0.9346, + "step": 3435 + }, + { + "epoch": 0.5826197541331073, + "grad_norm": 0.936976443789542, + "learning_rate": 7.826072261816493e-06, + "loss": 0.9508, + "step": 3436 + }, + { + "epoch": 0.5827893175074184, + "grad_norm": 0.9697365757415702, + "learning_rate": 7.820711647702017e-06, + "loss": 0.9353, + "step": 3437 + }, + { + "epoch": 0.5829588808817295, + "grad_norm": 0.9582611589976441, + "learning_rate": 7.815351690976455e-06, + "loss": 0.9516, + "step": 3438 + }, + { + "epoch": 0.5831284442560407, + "grad_norm": 0.9977573591518735, + "learning_rate": 7.809992393256653e-06, + "loss": 0.9703, + "step": 3439 + }, + { + "epoch": 0.5832980076303519, + "grad_norm": 1.023846616194263, + "learning_rate": 7.804633756159258e-06, + "loss": 0.9761, + "step": 3440 + }, + { + "epoch": 0.583467571004663, + "grad_norm": 0.6184670480474209, + "learning_rate": 7.79927578130072e-06, + "loss": 0.7731, + "step": 3441 + }, + { + "epoch": 0.5836371343789741, + "grad_norm": 0.9555614786776051, + "learning_rate": 7.793918470297284e-06, + "loss": 0.9561, + "step": 3442 + }, + { + "epoch": 0.5838066977532853, + "grad_norm": 0.9161918809795976, + "learning_rate": 7.788561824765007e-06, + "loss": 0.929, + "step": 3443 + }, + { + "epoch": 0.5839762611275965, + "grad_norm": 0.6148405042378875, + "learning_rate": 7.783205846319731e-06, + "loss": 0.7935, + "step": 3444 + }, + { + "epoch": 0.5841458245019076, + "grad_norm": 1.0399815244172013, + "learning_rate": 7.777850536577104e-06, + "loss": 0.9689, + "step": 3445 + }, + { + "epoch": 0.5843153878762187, + "grad_norm": 0.9470515943664412, + "learning_rate": 7.772495897152575e-06, + "loss": 0.9532, + "step": 3446 + }, + { + "epoch": 0.5844849512505299, + "grad_norm": 0.9651051186509068, + "learning_rate": 7.767141929661383e-06, + "loss": 0.9617, + "step": 3447 + }, + { + "epoch": 0.5846545146248411, + "grad_norm": 0.9841204194037927, + "learning_rate": 7.761788635718572e-06, + "loss": 0.9371, + "step": 3448 + }, + { + "epoch": 0.5848240779991521, + "grad_norm": 0.9729482614068703, + "learning_rate": 7.756436016938973e-06, + "loss": 0.9775, + "step": 3449 + }, + { + "epoch": 0.5849936413734633, + "grad_norm": 0.9993275872288293, + "learning_rate": 7.751084074937226e-06, + "loss": 0.963, + "step": 3450 + }, + { + "epoch": 0.5851632047477745, + "grad_norm": 0.9888430517624435, + "learning_rate": 7.74573281132776e-06, + "loss": 0.9536, + "step": 3451 + }, + { + "epoch": 0.5853327681220857, + "grad_norm": 0.9675483848533789, + "learning_rate": 7.740382227724795e-06, + "loss": 0.9654, + "step": 3452 + }, + { + "epoch": 0.5855023314963967, + "grad_norm": 0.99293755890993, + "learning_rate": 7.735032325742355e-06, + "loss": 0.9735, + "step": 3453 + }, + { + "epoch": 0.5856718948707079, + "grad_norm": 0.9298116805603497, + "learning_rate": 7.729683106994256e-06, + "loss": 0.9336, + "step": 3454 + }, + { + "epoch": 0.5858414582450191, + "grad_norm": 0.9525145153091052, + "learning_rate": 7.724334573094101e-06, + "loss": 0.9424, + "step": 3455 + }, + { + "epoch": 0.5860110216193303, + "grad_norm": 0.9654867587034026, + "learning_rate": 7.718986725655293e-06, + "loss": 0.9431, + "step": 3456 + }, + { + "epoch": 0.5861805849936413, + "grad_norm": 0.6362319554138335, + "learning_rate": 7.713639566291028e-06, + "loss": 0.7702, + "step": 3457 + }, + { + "epoch": 0.5863501483679525, + "grad_norm": 0.9777832807707518, + "learning_rate": 7.70829309661429e-06, + "loss": 0.9575, + "step": 3458 + }, + { + "epoch": 0.5865197117422637, + "grad_norm": 0.9572424355479278, + "learning_rate": 7.702947318237862e-06, + "loss": 0.9428, + "step": 3459 + }, + { + "epoch": 0.5866892751165749, + "grad_norm": 0.9440638464761222, + "learning_rate": 7.697602232774304e-06, + "loss": 0.9681, + "step": 3460 + }, + { + "epoch": 0.5868588384908859, + "grad_norm": 0.9874135855687968, + "learning_rate": 7.692257841835992e-06, + "loss": 0.98, + "step": 3461 + }, + { + "epoch": 0.5870284018651971, + "grad_norm": 0.9489967459339086, + "learning_rate": 7.686914147035068e-06, + "loss": 0.9423, + "step": 3462 + }, + { + "epoch": 0.5871979652395083, + "grad_norm": 0.9300036218715391, + "learning_rate": 7.681571149983475e-06, + "loss": 0.9684, + "step": 3463 + }, + { + "epoch": 0.5873675286138195, + "grad_norm": 0.9151042231783431, + "learning_rate": 7.676228852292947e-06, + "loss": 0.9241, + "step": 3464 + }, + { + "epoch": 0.5875370919881305, + "grad_norm": 0.9633808506965221, + "learning_rate": 7.670887255575003e-06, + "loss": 0.9209, + "step": 3465 + }, + { + "epoch": 0.5877066553624417, + "grad_norm": 0.9696190858478899, + "learning_rate": 7.66554636144095e-06, + "loss": 0.9137, + "step": 3466 + }, + { + "epoch": 0.5878762187367529, + "grad_norm": 0.9478376336384198, + "learning_rate": 7.660206171501881e-06, + "loss": 0.9848, + "step": 3467 + }, + { + "epoch": 0.5880457821110641, + "grad_norm": 0.9273514930199466, + "learning_rate": 7.65486668736869e-06, + "loss": 0.9107, + "step": 3468 + }, + { + "epoch": 0.5882153454853751, + "grad_norm": 0.9650211704347943, + "learning_rate": 7.649527910652044e-06, + "loss": 0.9428, + "step": 3469 + }, + { + "epoch": 0.5883849088596863, + "grad_norm": 0.9620438342023812, + "learning_rate": 7.644189842962399e-06, + "loss": 0.9055, + "step": 3470 + }, + { + "epoch": 0.5885544722339975, + "grad_norm": 0.9653572004487774, + "learning_rate": 7.638852485910002e-06, + "loss": 0.9491, + "step": 3471 + }, + { + "epoch": 0.5887240356083087, + "grad_norm": 0.9812444176795981, + "learning_rate": 7.633515841104884e-06, + "loss": 0.9763, + "step": 3472 + }, + { + "epoch": 0.5888935989826197, + "grad_norm": 1.0119365013908592, + "learning_rate": 7.628179910156859e-06, + "loss": 0.9836, + "step": 3473 + }, + { + "epoch": 0.5890631623569309, + "grad_norm": 0.9968362551201371, + "learning_rate": 7.622844694675522e-06, + "loss": 0.9872, + "step": 3474 + }, + { + "epoch": 0.5892327257312421, + "grad_norm": 0.9829620008445998, + "learning_rate": 7.6175101962702624e-06, + "loss": 0.965, + "step": 3475 + }, + { + "epoch": 0.5894022891055531, + "grad_norm": 0.9886391980251944, + "learning_rate": 7.6121764165502476e-06, + "loss": 0.9147, + "step": 3476 + }, + { + "epoch": 0.5895718524798643, + "grad_norm": 0.9657999227095531, + "learning_rate": 7.606843357124426e-06, + "loss": 0.9842, + "step": 3477 + }, + { + "epoch": 0.5897414158541755, + "grad_norm": 0.9317736167987475, + "learning_rate": 7.6015110196015275e-06, + "loss": 0.9195, + "step": 3478 + }, + { + "epoch": 0.5899109792284867, + "grad_norm": 0.9517615658817935, + "learning_rate": 7.596179405590076e-06, + "loss": 0.9352, + "step": 3479 + }, + { + "epoch": 0.5900805426027977, + "grad_norm": 0.9672015545296544, + "learning_rate": 7.590848516698366e-06, + "loss": 0.9606, + "step": 3480 + }, + { + "epoch": 0.5902501059771089, + "grad_norm": 0.9943623372344311, + "learning_rate": 7.585518354534473e-06, + "loss": 0.9884, + "step": 3481 + }, + { + "epoch": 0.5904196693514201, + "grad_norm": 1.0058224721607767, + "learning_rate": 7.580188920706261e-06, + "loss": 0.9779, + "step": 3482 + }, + { + "epoch": 0.5905892327257313, + "grad_norm": 0.9646446057400887, + "learning_rate": 7.574860216821367e-06, + "loss": 0.9615, + "step": 3483 + }, + { + "epoch": 0.5907587961000423, + "grad_norm": 0.9243343781309876, + "learning_rate": 7.569532244487212e-06, + "loss": 0.9081, + "step": 3484 + }, + { + "epoch": 0.5909283594743535, + "grad_norm": 0.9910137168879887, + "learning_rate": 7.56420500531099e-06, + "loss": 0.9756, + "step": 3485 + }, + { + "epoch": 0.5910979228486647, + "grad_norm": 0.939263716384389, + "learning_rate": 7.558878500899687e-06, + "loss": 0.9272, + "step": 3486 + }, + { + "epoch": 0.5912674862229759, + "grad_norm": 0.9931354201892537, + "learning_rate": 7.5535527328600544e-06, + "loss": 1.0047, + "step": 3487 + }, + { + "epoch": 0.5914370495972869, + "grad_norm": 0.9180442723042967, + "learning_rate": 7.548227702798624e-06, + "loss": 0.9387, + "step": 3488 + }, + { + "epoch": 0.5916066129715981, + "grad_norm": 0.9446904704706182, + "learning_rate": 7.542903412321714e-06, + "loss": 0.9478, + "step": 3489 + }, + { + "epoch": 0.5917761763459093, + "grad_norm": 0.9792374085693535, + "learning_rate": 7.537579863035409e-06, + "loss": 0.9573, + "step": 3490 + }, + { + "epoch": 0.5919457397202205, + "grad_norm": 0.9537974691365961, + "learning_rate": 7.532257056545573e-06, + "loss": 0.9547, + "step": 3491 + }, + { + "epoch": 0.5921153030945315, + "grad_norm": 0.9471769078208534, + "learning_rate": 7.5269349944578454e-06, + "loss": 0.9895, + "step": 3492 + }, + { + "epoch": 0.5922848664688427, + "grad_norm": 0.9785249941487371, + "learning_rate": 7.521613678377646e-06, + "loss": 0.9488, + "step": 3493 + }, + { + "epoch": 0.5924544298431539, + "grad_norm": 0.9366792912542713, + "learning_rate": 7.516293109910165e-06, + "loss": 0.9615, + "step": 3494 + }, + { + "epoch": 0.5926239932174651, + "grad_norm": 0.9408330456569416, + "learning_rate": 7.510973290660366e-06, + "loss": 0.9313, + "step": 3495 + }, + { + "epoch": 0.5927935565917761, + "grad_norm": 0.9473936256698582, + "learning_rate": 7.505654222232985e-06, + "loss": 0.9237, + "step": 3496 + }, + { + "epoch": 0.5929631199660873, + "grad_norm": 0.9360998211765088, + "learning_rate": 7.500335906232544e-06, + "loss": 0.9441, + "step": 3497 + }, + { + "epoch": 0.5931326833403985, + "grad_norm": 0.9642416803983946, + "learning_rate": 7.4950183442633255e-06, + "loss": 0.9202, + "step": 3498 + }, + { + "epoch": 0.5933022467147097, + "grad_norm": 1.0380122274660852, + "learning_rate": 7.489701537929384e-06, + "loss": 0.9701, + "step": 3499 + }, + { + "epoch": 0.5934718100890207, + "grad_norm": 0.9693441981521809, + "learning_rate": 7.484385488834556e-06, + "loss": 0.9402, + "step": 3500 + }, + { + "epoch": 0.5936413734633319, + "grad_norm": 1.0057653995038704, + "learning_rate": 7.479070198582441e-06, + "loss": 1.0082, + "step": 3501 + }, + { + "epoch": 0.5938109368376431, + "grad_norm": 0.9763316773725886, + "learning_rate": 7.473755668776413e-06, + "loss": 0.9721, + "step": 3502 + }, + { + "epoch": 0.5939805002119543, + "grad_norm": 0.9376039149867716, + "learning_rate": 7.468441901019612e-06, + "loss": 0.9705, + "step": 3503 + }, + { + "epoch": 0.5941500635862653, + "grad_norm": 0.9308121175258789, + "learning_rate": 7.463128896914958e-06, + "loss": 0.9544, + "step": 3504 + }, + { + "epoch": 0.5943196269605765, + "grad_norm": 0.9196473565331038, + "learning_rate": 7.4578166580651335e-06, + "loss": 0.979, + "step": 3505 + }, + { + "epoch": 0.5944891903348877, + "grad_norm": 0.9379661272557751, + "learning_rate": 7.452505186072585e-06, + "loss": 0.9056, + "step": 3506 + }, + { + "epoch": 0.5946587537091989, + "grad_norm": 0.9553036761263827, + "learning_rate": 7.447194482539544e-06, + "loss": 0.9636, + "step": 3507 + }, + { + "epoch": 0.5948283170835099, + "grad_norm": 0.947992886391597, + "learning_rate": 7.441884549067994e-06, + "loss": 0.9732, + "step": 3508 + }, + { + "epoch": 0.5949978804578211, + "grad_norm": 0.9128820994683076, + "learning_rate": 7.436575387259697e-06, + "loss": 0.9295, + "step": 3509 + }, + { + "epoch": 0.5951674438321323, + "grad_norm": 0.9321470900697977, + "learning_rate": 7.431266998716171e-06, + "loss": 0.9677, + "step": 3510 + }, + { + "epoch": 0.5953370072064434, + "grad_norm": 0.94120561993968, + "learning_rate": 7.425959385038714e-06, + "loss": 0.9341, + "step": 3511 + }, + { + "epoch": 0.5955065705807545, + "grad_norm": 0.9798798426240755, + "learning_rate": 7.4206525478283795e-06, + "loss": 0.9882, + "step": 3512 + }, + { + "epoch": 0.5956761339550657, + "grad_norm": 0.9487395203915984, + "learning_rate": 7.4153464886859925e-06, + "loss": 0.953, + "step": 3513 + }, + { + "epoch": 0.5958456973293769, + "grad_norm": 0.9050942153911852, + "learning_rate": 7.410041209212138e-06, + "loss": 0.8542, + "step": 3514 + }, + { + "epoch": 0.596015260703688, + "grad_norm": 0.9571600132591642, + "learning_rate": 7.404736711007176e-06, + "loss": 0.9751, + "step": 3515 + }, + { + "epoch": 0.5961848240779991, + "grad_norm": 0.9626870225768668, + "learning_rate": 7.399432995671223e-06, + "loss": 0.9102, + "step": 3516 + }, + { + "epoch": 0.5963543874523103, + "grad_norm": 0.9577868520563464, + "learning_rate": 7.394130064804157e-06, + "loss": 0.9327, + "step": 3517 + }, + { + "epoch": 0.5965239508266215, + "grad_norm": 0.6220022210084684, + "learning_rate": 7.388827920005628e-06, + "loss": 0.7842, + "step": 3518 + }, + { + "epoch": 0.5966935142009326, + "grad_norm": 1.002680295034151, + "learning_rate": 7.383526562875041e-06, + "loss": 0.9499, + "step": 3519 + }, + { + "epoch": 0.5968630775752437, + "grad_norm": 0.700044317387652, + "learning_rate": 7.378225995011566e-06, + "loss": 0.8501, + "step": 3520 + }, + { + "epoch": 0.5970326409495549, + "grad_norm": 1.0155864390506473, + "learning_rate": 7.372926218014131e-06, + "loss": 0.9637, + "step": 3521 + }, + { + "epoch": 0.5972022043238661, + "grad_norm": 0.9995842492263824, + "learning_rate": 7.36762723348144e-06, + "loss": 0.9321, + "step": 3522 + }, + { + "epoch": 0.5973717676981772, + "grad_norm": 0.9964402228277096, + "learning_rate": 7.362329043011942e-06, + "loss": 0.9628, + "step": 3523 + }, + { + "epoch": 0.5975413310724883, + "grad_norm": 0.973107619583477, + "learning_rate": 7.357031648203849e-06, + "loss": 0.9173, + "step": 3524 + }, + { + "epoch": 0.5977108944467995, + "grad_norm": 0.9504310158845471, + "learning_rate": 7.3517350506551446e-06, + "loss": 0.9759, + "step": 3525 + }, + { + "epoch": 0.5978804578211107, + "grad_norm": 0.9288831038605503, + "learning_rate": 7.3464392519635574e-06, + "loss": 0.9298, + "step": 3526 + }, + { + "epoch": 0.5980500211954218, + "grad_norm": 0.9407148834422668, + "learning_rate": 7.341144253726583e-06, + "loss": 0.9067, + "step": 3527 + }, + { + "epoch": 0.5982195845697329, + "grad_norm": 0.6368165589102066, + "learning_rate": 7.335850057541471e-06, + "loss": 0.8465, + "step": 3528 + }, + { + "epoch": 0.5983891479440441, + "grad_norm": 1.03598637777532, + "learning_rate": 7.330556665005235e-06, + "loss": 0.9748, + "step": 3529 + }, + { + "epoch": 0.5985587113183553, + "grad_norm": 0.9470462007610989, + "learning_rate": 7.32526407771464e-06, + "loss": 0.9817, + "step": 3530 + }, + { + "epoch": 0.5987282746926664, + "grad_norm": 0.9543173026138349, + "learning_rate": 7.319972297266215e-06, + "loss": 0.9706, + "step": 3531 + }, + { + "epoch": 0.5988978380669775, + "grad_norm": 0.9836449557475946, + "learning_rate": 7.314681325256232e-06, + "loss": 0.9891, + "step": 3532 + }, + { + "epoch": 0.5990674014412887, + "grad_norm": 0.9640791457136666, + "learning_rate": 7.3093911632807415e-06, + "loss": 0.9256, + "step": 3533 + }, + { + "epoch": 0.5992369648155998, + "grad_norm": 0.886457298695005, + "learning_rate": 7.304101812935531e-06, + "loss": 0.9149, + "step": 3534 + }, + { + "epoch": 0.599406528189911, + "grad_norm": 0.6402390491174929, + "learning_rate": 7.298813275816144e-06, + "loss": 0.8033, + "step": 3535 + }, + { + "epoch": 0.5995760915642221, + "grad_norm": 1.0605793596147146, + "learning_rate": 7.2935255535178924e-06, + "loss": 0.9962, + "step": 3536 + }, + { + "epoch": 0.5997456549385333, + "grad_norm": 0.9758429926682303, + "learning_rate": 7.2882386476358304e-06, + "loss": 0.9599, + "step": 3537 + }, + { + "epoch": 0.5999152183128444, + "grad_norm": 0.92878277910036, + "learning_rate": 7.282952559764769e-06, + "loss": 0.9392, + "step": 3538 + }, + { + "epoch": 0.6000847816871556, + "grad_norm": 0.9773764245492319, + "learning_rate": 7.277667291499268e-06, + "loss": 0.9404, + "step": 3539 + }, + { + "epoch": 0.6002543450614667, + "grad_norm": 0.9822468601188052, + "learning_rate": 7.272382844433653e-06, + "loss": 0.9564, + "step": 3540 + }, + { + "epoch": 0.6004239084357779, + "grad_norm": 0.9603553481852201, + "learning_rate": 7.267099220161989e-06, + "loss": 0.9267, + "step": 3541 + }, + { + "epoch": 0.600593471810089, + "grad_norm": 0.946922397392502, + "learning_rate": 7.2618164202780914e-06, + "loss": 0.9798, + "step": 3542 + }, + { + "epoch": 0.6007630351844002, + "grad_norm": 1.0062277905823154, + "learning_rate": 7.256534446375543e-06, + "loss": 0.968, + "step": 3543 + }, + { + "epoch": 0.6009325985587113, + "grad_norm": 0.9764568237814507, + "learning_rate": 7.2512533000476625e-06, + "loss": 0.9352, + "step": 3544 + }, + { + "epoch": 0.6011021619330225, + "grad_norm": 0.9952745038344479, + "learning_rate": 7.2459729828875256e-06, + "loss": 0.9283, + "step": 3545 + }, + { + "epoch": 0.6012717253073336, + "grad_norm": 1.0003705071359748, + "learning_rate": 7.24069349648795e-06, + "loss": 1.0048, + "step": 3546 + }, + { + "epoch": 0.6014412886816448, + "grad_norm": 0.9653270068319545, + "learning_rate": 7.235414842441517e-06, + "loss": 0.9687, + "step": 3547 + }, + { + "epoch": 0.6016108520559559, + "grad_norm": 1.008970256053457, + "learning_rate": 7.230137022340542e-06, + "loss": 0.9409, + "step": 3548 + }, + { + "epoch": 0.6017804154302671, + "grad_norm": 0.955598642345728, + "learning_rate": 7.224860037777095e-06, + "loss": 0.9455, + "step": 3549 + }, + { + "epoch": 0.6019499788045782, + "grad_norm": 0.9643830385379727, + "learning_rate": 7.219583890343003e-06, + "loss": 0.9712, + "step": 3550 + }, + { + "epoch": 0.6021195421788894, + "grad_norm": 0.9718318222999374, + "learning_rate": 7.2143085816298234e-06, + "loss": 0.9707, + "step": 3551 + }, + { + "epoch": 0.6022891055532005, + "grad_norm": 0.9547241384819419, + "learning_rate": 7.209034113228872e-06, + "loss": 0.9348, + "step": 3552 + }, + { + "epoch": 0.6024586689275117, + "grad_norm": 0.9400110782529516, + "learning_rate": 7.203760486731204e-06, + "loss": 0.9085, + "step": 3553 + }, + { + "epoch": 0.6026282323018228, + "grad_norm": 0.989651690332263, + "learning_rate": 7.198487703727632e-06, + "loss": 0.982, + "step": 3554 + }, + { + "epoch": 0.602797795676134, + "grad_norm": 0.9426314691646828, + "learning_rate": 7.193215765808703e-06, + "loss": 0.9051, + "step": 3555 + }, + { + "epoch": 0.6029673590504451, + "grad_norm": 1.0109609886761874, + "learning_rate": 7.1879446745647155e-06, + "loss": 0.9754, + "step": 3556 + }, + { + "epoch": 0.6031369224247562, + "grad_norm": 0.9481125813550529, + "learning_rate": 7.182674431585703e-06, + "loss": 0.9607, + "step": 3557 + }, + { + "epoch": 0.6033064857990674, + "grad_norm": 0.9624245788500905, + "learning_rate": 7.177405038461459e-06, + "loss": 0.9622, + "step": 3558 + }, + { + "epoch": 0.6034760491733786, + "grad_norm": 0.9577243135413294, + "learning_rate": 7.172136496781508e-06, + "loss": 0.9662, + "step": 3559 + }, + { + "epoch": 0.6036456125476897, + "grad_norm": 0.9569751644935419, + "learning_rate": 7.1668688081351164e-06, + "loss": 0.9521, + "step": 3560 + }, + { + "epoch": 0.6038151759220008, + "grad_norm": 0.9360734253227487, + "learning_rate": 7.161601974111308e-06, + "loss": 0.9282, + "step": 3561 + }, + { + "epoch": 0.603984739296312, + "grad_norm": 0.674245913055902, + "learning_rate": 7.156335996298834e-06, + "loss": 0.8913, + "step": 3562 + }, + { + "epoch": 0.6041543026706232, + "grad_norm": 0.9182468337016708, + "learning_rate": 7.1510708762861945e-06, + "loss": 0.885, + "step": 3563 + }, + { + "epoch": 0.6043238660449343, + "grad_norm": 0.9464453060337928, + "learning_rate": 7.1458066156616244e-06, + "loss": 0.9446, + "step": 3564 + }, + { + "epoch": 0.6044934294192454, + "grad_norm": 0.9718076567862831, + "learning_rate": 7.140543216013109e-06, + "loss": 0.963, + "step": 3565 + }, + { + "epoch": 0.6046629927935566, + "grad_norm": 0.9878707271699002, + "learning_rate": 7.1352806789283664e-06, + "loss": 0.9619, + "step": 3566 + }, + { + "epoch": 0.6048325561678677, + "grad_norm": 0.9524419406485947, + "learning_rate": 7.1300190059948535e-06, + "loss": 0.9449, + "step": 3567 + }, + { + "epoch": 0.6050021195421789, + "grad_norm": 0.9651545667980798, + "learning_rate": 7.124758198799777e-06, + "loss": 0.9486, + "step": 3568 + }, + { + "epoch": 0.60517168291649, + "grad_norm": 0.9105105348367812, + "learning_rate": 7.119498258930073e-06, + "loss": 0.926, + "step": 3569 + }, + { + "epoch": 0.6053412462908012, + "grad_norm": 0.9476184587646148, + "learning_rate": 7.114239187972416e-06, + "loss": 0.9442, + "step": 3570 + }, + { + "epoch": 0.6055108096651123, + "grad_norm": 0.9779200212275795, + "learning_rate": 7.108980987513216e-06, + "loss": 0.9178, + "step": 3571 + }, + { + "epoch": 0.6056803730394235, + "grad_norm": 0.9133960221742026, + "learning_rate": 7.103723659138636e-06, + "loss": 0.9303, + "step": 3572 + }, + { + "epoch": 0.6058499364137346, + "grad_norm": 0.9901965390411374, + "learning_rate": 7.098467204434559e-06, + "loss": 0.9531, + "step": 3573 + }, + { + "epoch": 0.6060194997880458, + "grad_norm": 0.6228061685981762, + "learning_rate": 7.093211624986611e-06, + "loss": 0.8017, + "step": 3574 + }, + { + "epoch": 0.6061890631623569, + "grad_norm": 0.9500725921351076, + "learning_rate": 7.0879569223801526e-06, + "loss": 0.9554, + "step": 3575 + }, + { + "epoch": 0.606358626536668, + "grad_norm": 1.0305642970489943, + "learning_rate": 7.082703098200282e-06, + "loss": 0.9632, + "step": 3576 + }, + { + "epoch": 0.6065281899109792, + "grad_norm": 0.9367878114866866, + "learning_rate": 7.0774501540318305e-06, + "loss": 0.9479, + "step": 3577 + }, + { + "epoch": 0.6066977532852904, + "grad_norm": 0.9729790581374097, + "learning_rate": 7.072198091459361e-06, + "loss": 0.9272, + "step": 3578 + }, + { + "epoch": 0.6068673166596015, + "grad_norm": 0.9644565090309974, + "learning_rate": 7.0669469120671815e-06, + "loss": 0.927, + "step": 3579 + }, + { + "epoch": 0.6070368800339127, + "grad_norm": 0.9970790373118575, + "learning_rate": 7.061696617439323e-06, + "loss": 0.958, + "step": 3580 + }, + { + "epoch": 0.6072064434082238, + "grad_norm": 0.9746233992125029, + "learning_rate": 7.056447209159552e-06, + "loss": 0.9373, + "step": 3581 + }, + { + "epoch": 0.607376006782535, + "grad_norm": 0.9977839208095215, + "learning_rate": 7.051198688811366e-06, + "loss": 0.9699, + "step": 3582 + }, + { + "epoch": 0.6075455701568461, + "grad_norm": 0.9846190006746545, + "learning_rate": 7.045951057978001e-06, + "loss": 0.9312, + "step": 3583 + }, + { + "epoch": 0.6077151335311572, + "grad_norm": 0.9778400612359939, + "learning_rate": 7.040704318242419e-06, + "loss": 0.968, + "step": 3584 + }, + { + "epoch": 0.6078846969054684, + "grad_norm": 0.9940171253107976, + "learning_rate": 7.035458471187312e-06, + "loss": 0.9526, + "step": 3585 + }, + { + "epoch": 0.6080542602797796, + "grad_norm": 0.9741839062136622, + "learning_rate": 7.030213518395112e-06, + "loss": 0.9508, + "step": 3586 + }, + { + "epoch": 0.6082238236540907, + "grad_norm": 0.9635571456769719, + "learning_rate": 7.024969461447973e-06, + "loss": 0.8941, + "step": 3587 + }, + { + "epoch": 0.6083933870284018, + "grad_norm": 0.890560656184682, + "learning_rate": 7.019726301927776e-06, + "loss": 0.9563, + "step": 3588 + }, + { + "epoch": 0.608562950402713, + "grad_norm": 0.9266934141860115, + "learning_rate": 7.014484041416137e-06, + "loss": 0.9279, + "step": 3589 + }, + { + "epoch": 0.6087325137770242, + "grad_norm": 0.9975322238544472, + "learning_rate": 7.0092426814944045e-06, + "loss": 0.9813, + "step": 3590 + }, + { + "epoch": 0.6089020771513353, + "grad_norm": 0.9983275311633019, + "learning_rate": 7.004002223743649e-06, + "loss": 0.9249, + "step": 3591 + }, + { + "epoch": 0.6090716405256464, + "grad_norm": 0.9586797655327807, + "learning_rate": 6.998762669744668e-06, + "loss": 0.9063, + "step": 3592 + }, + { + "epoch": 0.6092412038999576, + "grad_norm": 0.9771303178769589, + "learning_rate": 6.993524021077989e-06, + "loss": 0.9641, + "step": 3593 + }, + { + "epoch": 0.6094107672742688, + "grad_norm": 0.9108438938382009, + "learning_rate": 6.9882862793238685e-06, + "loss": 0.8488, + "step": 3594 + }, + { + "epoch": 0.6095803306485799, + "grad_norm": 1.006793036955104, + "learning_rate": 6.983049446062285e-06, + "loss": 0.8998, + "step": 3595 + }, + { + "epoch": 0.609749894022891, + "grad_norm": 0.9408193042652288, + "learning_rate": 6.977813522872943e-06, + "loss": 0.9686, + "step": 3596 + }, + { + "epoch": 0.6099194573972022, + "grad_norm": 0.9272812594944455, + "learning_rate": 6.97257851133528e-06, + "loss": 0.9274, + "step": 3597 + }, + { + "epoch": 0.6100890207715134, + "grad_norm": 0.7234740418392058, + "learning_rate": 6.967344413028452e-06, + "loss": 0.8403, + "step": 3598 + }, + { + "epoch": 0.6102585841458245, + "grad_norm": 0.9991221948741854, + "learning_rate": 6.962111229531337e-06, + "loss": 0.9329, + "step": 3599 + }, + { + "epoch": 0.6104281475201356, + "grad_norm": 1.04734634364071, + "learning_rate": 6.9568789624225415e-06, + "loss": 0.9653, + "step": 3600 + }, + { + "epoch": 0.6105977108944468, + "grad_norm": 1.0261358786311874, + "learning_rate": 6.951647613280397e-06, + "loss": 0.9523, + "step": 3601 + }, + { + "epoch": 0.610767274268758, + "grad_norm": 0.9952531994056946, + "learning_rate": 6.946417183682955e-06, + "loss": 0.9452, + "step": 3602 + }, + { + "epoch": 0.610936837643069, + "grad_norm": 0.6898001366740375, + "learning_rate": 6.9411876752079856e-06, + "loss": 0.8044, + "step": 3603 + }, + { + "epoch": 0.6111064010173802, + "grad_norm": 1.008820707094296, + "learning_rate": 6.935959089432995e-06, + "loss": 0.9821, + "step": 3604 + }, + { + "epoch": 0.6112759643916914, + "grad_norm": 1.0293091512956585, + "learning_rate": 6.930731427935196e-06, + "loss": 0.9675, + "step": 3605 + }, + { + "epoch": 0.6114455277660026, + "grad_norm": 0.9386580613482086, + "learning_rate": 6.925504692291529e-06, + "loss": 0.9369, + "step": 3606 + }, + { + "epoch": 0.6116150911403136, + "grad_norm": 0.9274281634066788, + "learning_rate": 6.920278884078652e-06, + "loss": 0.9908, + "step": 3607 + }, + { + "epoch": 0.6117846545146248, + "grad_norm": 0.9620406217875858, + "learning_rate": 6.915054004872952e-06, + "loss": 0.9485, + "step": 3608 + }, + { + "epoch": 0.611954217888936, + "grad_norm": 0.9241775384908139, + "learning_rate": 6.909830056250527e-06, + "loss": 0.9171, + "step": 3609 + }, + { + "epoch": 0.6121237812632472, + "grad_norm": 0.9545577391552628, + "learning_rate": 6.904607039787197e-06, + "loss": 0.9406, + "step": 3610 + }, + { + "epoch": 0.6122933446375582, + "grad_norm": 0.9870401069288942, + "learning_rate": 6.899384957058496e-06, + "loss": 0.9617, + "step": 3611 + }, + { + "epoch": 0.6124629080118694, + "grad_norm": 0.9223329815743532, + "learning_rate": 6.894163809639688e-06, + "loss": 0.9266, + "step": 3612 + }, + { + "epoch": 0.6126324713861806, + "grad_norm": 0.9754436645170015, + "learning_rate": 6.888943599105745e-06, + "loss": 0.9728, + "step": 3613 + }, + { + "epoch": 0.6128020347604918, + "grad_norm": 1.0075120743778265, + "learning_rate": 6.883724327031355e-06, + "loss": 0.9781, + "step": 3614 + }, + { + "epoch": 0.6129715981348028, + "grad_norm": 0.9641327253741406, + "learning_rate": 6.878505994990935e-06, + "loss": 0.9183, + "step": 3615 + }, + { + "epoch": 0.613141161509114, + "grad_norm": 0.9610536889133187, + "learning_rate": 6.873288604558608e-06, + "loss": 0.9811, + "step": 3616 + }, + { + "epoch": 0.6133107248834252, + "grad_norm": 0.9535351515072967, + "learning_rate": 6.868072157308213e-06, + "loss": 0.9441, + "step": 3617 + }, + { + "epoch": 0.6134802882577364, + "grad_norm": 0.9000106944239568, + "learning_rate": 6.862856654813308e-06, + "loss": 0.9307, + "step": 3618 + }, + { + "epoch": 0.6136498516320474, + "grad_norm": 1.0117360581842432, + "learning_rate": 6.857642098647165e-06, + "loss": 0.9564, + "step": 3619 + }, + { + "epoch": 0.6138194150063586, + "grad_norm": 0.9827183689143366, + "learning_rate": 6.852428490382773e-06, + "loss": 0.9955, + "step": 3620 + }, + { + "epoch": 0.6139889783806698, + "grad_norm": 0.9272703313445352, + "learning_rate": 6.84721583159283e-06, + "loss": 0.9871, + "step": 3621 + }, + { + "epoch": 0.614158541754981, + "grad_norm": 0.9183923895907284, + "learning_rate": 6.8420041238497525e-06, + "loss": 0.9085, + "step": 3622 + }, + { + "epoch": 0.614328105129292, + "grad_norm": 0.6123462227947196, + "learning_rate": 6.836793368725666e-06, + "loss": 0.771, + "step": 3623 + }, + { + "epoch": 0.6144976685036032, + "grad_norm": 0.9668384833623501, + "learning_rate": 6.831583567792411e-06, + "loss": 0.917, + "step": 3624 + }, + { + "epoch": 0.6146672318779144, + "grad_norm": 0.9568118178707554, + "learning_rate": 6.826374722621536e-06, + "loss": 0.9616, + "step": 3625 + }, + { + "epoch": 0.6148367952522256, + "grad_norm": 0.9586749034607601, + "learning_rate": 6.821166834784314e-06, + "loss": 0.9443, + "step": 3626 + }, + { + "epoch": 0.6150063586265366, + "grad_norm": 0.9338314815642756, + "learning_rate": 6.815959905851715e-06, + "loss": 0.978, + "step": 3627 + }, + { + "epoch": 0.6151759220008478, + "grad_norm": 0.9624952336640717, + "learning_rate": 6.810753937394423e-06, + "loss": 0.974, + "step": 3628 + }, + { + "epoch": 0.615345485375159, + "grad_norm": 0.9289002363802685, + "learning_rate": 6.805548930982832e-06, + "loss": 0.9454, + "step": 3629 + }, + { + "epoch": 0.6155150487494702, + "grad_norm": 0.9173123578164978, + "learning_rate": 6.800344888187057e-06, + "loss": 0.9547, + "step": 3630 + }, + { + "epoch": 0.6156846121237812, + "grad_norm": 0.9304488996276987, + "learning_rate": 6.795141810576906e-06, + "loss": 0.9527, + "step": 3631 + }, + { + "epoch": 0.6158541754980924, + "grad_norm": 0.9496860226046983, + "learning_rate": 6.789939699721902e-06, + "loss": 0.968, + "step": 3632 + }, + { + "epoch": 0.6160237388724036, + "grad_norm": 0.9856173160167246, + "learning_rate": 6.784738557191284e-06, + "loss": 0.9762, + "step": 3633 + }, + { + "epoch": 0.6161933022467148, + "grad_norm": 0.9688039043331194, + "learning_rate": 6.779538384553989e-06, + "loss": 0.9304, + "step": 3634 + }, + { + "epoch": 0.6163628656210258, + "grad_norm": 0.9576583183590159, + "learning_rate": 6.774339183378663e-06, + "loss": 0.9714, + "step": 3635 + }, + { + "epoch": 0.616532428995337, + "grad_norm": 1.1254625387537205, + "learning_rate": 6.76914095523366e-06, + "loss": 0.9434, + "step": 3636 + }, + { + "epoch": 0.6167019923696482, + "grad_norm": 1.0224959548437016, + "learning_rate": 6.763943701687046e-06, + "loss": 0.9773, + "step": 3637 + }, + { + "epoch": 0.6168715557439594, + "grad_norm": 0.9329769612350408, + "learning_rate": 6.758747424306586e-06, + "loss": 0.9547, + "step": 3638 + }, + { + "epoch": 0.6170411191182704, + "grad_norm": 0.9703981089873205, + "learning_rate": 6.75355212465975e-06, + "loss": 0.8931, + "step": 3639 + }, + { + "epoch": 0.6172106824925816, + "grad_norm": 0.9166830173951005, + "learning_rate": 6.748357804313721e-06, + "loss": 0.9425, + "step": 3640 + }, + { + "epoch": 0.6173802458668928, + "grad_norm": 1.0127858287781513, + "learning_rate": 6.7431644648353785e-06, + "loss": 0.9252, + "step": 3641 + }, + { + "epoch": 0.617549809241204, + "grad_norm": 0.9372534631539996, + "learning_rate": 6.7379721077913095e-06, + "loss": 0.9554, + "step": 3642 + }, + { + "epoch": 0.617719372615515, + "grad_norm": 0.9751211374832992, + "learning_rate": 6.732780734747799e-06, + "loss": 0.93, + "step": 3643 + }, + { + "epoch": 0.6178889359898262, + "grad_norm": 1.0002749612993924, + "learning_rate": 6.727590347270849e-06, + "loss": 0.96, + "step": 3644 + }, + { + "epoch": 0.6180584993641374, + "grad_norm": 0.9561169569595386, + "learning_rate": 6.7224009469261535e-06, + "loss": 0.9377, + "step": 3645 + }, + { + "epoch": 0.6182280627384485, + "grad_norm": 0.9645597845785824, + "learning_rate": 6.717212535279108e-06, + "loss": 0.9162, + "step": 3646 + }, + { + "epoch": 0.6183976261127596, + "grad_norm": 0.9751821897681725, + "learning_rate": 6.712025113894811e-06, + "loss": 0.9768, + "step": 3647 + }, + { + "epoch": 0.6185671894870708, + "grad_norm": 0.9154662572808107, + "learning_rate": 6.7068386843380695e-06, + "loss": 0.9121, + "step": 3648 + }, + { + "epoch": 0.618736752861382, + "grad_norm": 0.9682449940877177, + "learning_rate": 6.701653248173382e-06, + "loss": 0.9334, + "step": 3649 + }, + { + "epoch": 0.6189063162356931, + "grad_norm": 0.9455077850444994, + "learning_rate": 6.6964688069649474e-06, + "loss": 0.9569, + "step": 3650 + }, + { + "epoch": 0.6190758796100042, + "grad_norm": 0.9238000350169707, + "learning_rate": 6.691285362276676e-06, + "loss": 0.9491, + "step": 3651 + }, + { + "epoch": 0.6192454429843154, + "grad_norm": 0.9613940991210574, + "learning_rate": 6.6861029156721654e-06, + "loss": 0.9568, + "step": 3652 + }, + { + "epoch": 0.6194150063586266, + "grad_norm": 0.9092712262020262, + "learning_rate": 6.680921468714718e-06, + "loss": 0.928, + "step": 3653 + }, + { + "epoch": 0.6195845697329377, + "grad_norm": 0.9493851312472502, + "learning_rate": 6.675741022967327e-06, + "loss": 0.9455, + "step": 3654 + }, + { + "epoch": 0.6197541331072488, + "grad_norm": 0.9502381988785242, + "learning_rate": 6.670561579992698e-06, + "loss": 0.937, + "step": 3655 + }, + { + "epoch": 0.61992369648156, + "grad_norm": 0.939589242814108, + "learning_rate": 6.665383141353221e-06, + "loss": 0.9453, + "step": 3656 + }, + { + "epoch": 0.6200932598558712, + "grad_norm": 0.955136883357873, + "learning_rate": 6.660205708610987e-06, + "loss": 0.9619, + "step": 3657 + }, + { + "epoch": 0.6202628232301822, + "grad_norm": 0.9651081783025447, + "learning_rate": 6.655029283327788e-06, + "loss": 0.9422, + "step": 3658 + }, + { + "epoch": 0.6204323866044934, + "grad_norm": 0.9149744042302325, + "learning_rate": 6.649853867065104e-06, + "loss": 0.9522, + "step": 3659 + }, + { + "epoch": 0.6206019499788046, + "grad_norm": 0.9490884512939449, + "learning_rate": 6.644679461384117e-06, + "loss": 0.9134, + "step": 3660 + }, + { + "epoch": 0.6207715133531158, + "grad_norm": 0.9875114999940674, + "learning_rate": 6.639506067845698e-06, + "loss": 0.9844, + "step": 3661 + }, + { + "epoch": 0.6209410767274268, + "grad_norm": 0.9660246158846859, + "learning_rate": 6.634333688010426e-06, + "loss": 0.9427, + "step": 3662 + }, + { + "epoch": 0.621110640101738, + "grad_norm": 1.0088895898344516, + "learning_rate": 6.629162323438558e-06, + "loss": 0.9807, + "step": 3663 + }, + { + "epoch": 0.6212802034760492, + "grad_norm": 1.002968149854612, + "learning_rate": 6.623991975690051e-06, + "loss": 0.9528, + "step": 3664 + }, + { + "epoch": 0.6214497668503604, + "grad_norm": 1.0848164228137362, + "learning_rate": 6.618822646324563e-06, + "loss": 0.983, + "step": 3665 + }, + { + "epoch": 0.6216193302246714, + "grad_norm": 0.9754436523248472, + "learning_rate": 6.613654336901431e-06, + "loss": 0.9451, + "step": 3666 + }, + { + "epoch": 0.6217888935989826, + "grad_norm": 0.9777425024532083, + "learning_rate": 6.608487048979695e-06, + "loss": 0.9353, + "step": 3667 + }, + { + "epoch": 0.6219584569732938, + "grad_norm": 0.9860140172466118, + "learning_rate": 6.603320784118075e-06, + "loss": 0.9362, + "step": 3668 + }, + { + "epoch": 0.622128020347605, + "grad_norm": 0.9834874902426562, + "learning_rate": 6.598155543875002e-06, + "loss": 0.9197, + "step": 3669 + }, + { + "epoch": 0.622297583721916, + "grad_norm": 0.9394882901158351, + "learning_rate": 6.5929913298085815e-06, + "loss": 0.9187, + "step": 3670 + }, + { + "epoch": 0.6224671470962272, + "grad_norm": 0.9607314021628965, + "learning_rate": 6.5878281434766136e-06, + "loss": 0.962, + "step": 3671 + }, + { + "epoch": 0.6226367104705384, + "grad_norm": 0.9559558637460268, + "learning_rate": 6.582665986436585e-06, + "loss": 0.9378, + "step": 3672 + }, + { + "epoch": 0.6228062738448495, + "grad_norm": 1.0090440876797908, + "learning_rate": 6.577504860245684e-06, + "loss": 0.9745, + "step": 3673 + }, + { + "epoch": 0.6229758372191606, + "grad_norm": 0.9831311968800052, + "learning_rate": 6.572344766460776e-06, + "loss": 0.9582, + "step": 3674 + }, + { + "epoch": 0.6231454005934718, + "grad_norm": 0.6714470899732261, + "learning_rate": 6.567185706638417e-06, + "loss": 0.7965, + "step": 3675 + }, + { + "epoch": 0.623314963967783, + "grad_norm": 0.9500549126118222, + "learning_rate": 6.562027682334857e-06, + "loss": 0.9638, + "step": 3676 + }, + { + "epoch": 0.6234845273420941, + "grad_norm": 1.0170718413224733, + "learning_rate": 6.556870695106028e-06, + "loss": 0.9334, + "step": 3677 + }, + { + "epoch": 0.6236540907164052, + "grad_norm": 0.9443914976884543, + "learning_rate": 6.55171474650755e-06, + "loss": 0.9419, + "step": 3678 + }, + { + "epoch": 0.6238236540907164, + "grad_norm": 0.9765768040988373, + "learning_rate": 6.5465598380947274e-06, + "loss": 0.9886, + "step": 3679 + }, + { + "epoch": 0.6239932174650276, + "grad_norm": 0.9884567637502714, + "learning_rate": 6.5414059714225605e-06, + "loss": 0.9501, + "step": 3680 + }, + { + "epoch": 0.6241627808393387, + "grad_norm": 0.9509851284529884, + "learning_rate": 6.536253148045726e-06, + "loss": 0.9466, + "step": 3681 + }, + { + "epoch": 0.6243323442136498, + "grad_norm": 1.0311171820536678, + "learning_rate": 6.531101369518585e-06, + "loss": 0.9554, + "step": 3682 + }, + { + "epoch": 0.624501907587961, + "grad_norm": 0.9602720873650847, + "learning_rate": 6.525950637395193e-06, + "loss": 0.9239, + "step": 3683 + }, + { + "epoch": 0.6246714709622722, + "grad_norm": 0.9025769748096363, + "learning_rate": 6.520800953229282e-06, + "loss": 0.9153, + "step": 3684 + }, + { + "epoch": 0.6248410343365833, + "grad_norm": 0.9088540058686904, + "learning_rate": 6.515652318574268e-06, + "loss": 0.9289, + "step": 3685 + }, + { + "epoch": 0.6250105977108944, + "grad_norm": 0.9667571334743007, + "learning_rate": 6.51050473498325e-06, + "loss": 0.9482, + "step": 3686 + }, + { + "epoch": 0.6251801610852056, + "grad_norm": 0.9764592923428709, + "learning_rate": 6.505358204009018e-06, + "loss": 0.9552, + "step": 3687 + }, + { + "epoch": 0.6253497244595168, + "grad_norm": 1.003009845536553, + "learning_rate": 6.500212727204036e-06, + "loss": 0.9623, + "step": 3688 + }, + { + "epoch": 0.6255192878338279, + "grad_norm": 0.9716172181982657, + "learning_rate": 6.495068306120452e-06, + "loss": 0.9309, + "step": 3689 + }, + { + "epoch": 0.625688851208139, + "grad_norm": 0.9571644612825456, + "learning_rate": 6.489924942310093e-06, + "loss": 0.9407, + "step": 3690 + }, + { + "epoch": 0.6258584145824502, + "grad_norm": 0.9223358918499669, + "learning_rate": 6.484782637324479e-06, + "loss": 0.918, + "step": 3691 + }, + { + "epoch": 0.6260279779567614, + "grad_norm": 1.0040721141980677, + "learning_rate": 6.479641392714795e-06, + "loss": 0.9695, + "step": 3692 + }, + { + "epoch": 0.6261975413310725, + "grad_norm": 0.9771888732023392, + "learning_rate": 6.474501210031914e-06, + "loss": 0.9235, + "step": 3693 + }, + { + "epoch": 0.6263671047053836, + "grad_norm": 0.967482033057149, + "learning_rate": 6.469362090826389e-06, + "loss": 0.9421, + "step": 3694 + }, + { + "epoch": 0.6265366680796948, + "grad_norm": 0.6666600352634424, + "learning_rate": 6.46422403664845e-06, + "loss": 0.8363, + "step": 3695 + }, + { + "epoch": 0.626706231454006, + "grad_norm": 1.0099654352019811, + "learning_rate": 6.459087049048007e-06, + "loss": 0.9238, + "step": 3696 + }, + { + "epoch": 0.6268757948283171, + "grad_norm": 0.9818404414423542, + "learning_rate": 6.453951129574644e-06, + "loss": 0.9493, + "step": 3697 + }, + { + "epoch": 0.6270453582026282, + "grad_norm": 0.9841192499302777, + "learning_rate": 6.448816279777633e-06, + "loss": 0.9713, + "step": 3698 + }, + { + "epoch": 0.6272149215769394, + "grad_norm": 0.9840368417943546, + "learning_rate": 6.443682501205914e-06, + "loss": 0.9363, + "step": 3699 + }, + { + "epoch": 0.6273844849512505, + "grad_norm": 0.9536547576356458, + "learning_rate": 6.438549795408107e-06, + "loss": 0.9434, + "step": 3700 + }, + { + "epoch": 0.6275540483255617, + "grad_norm": 0.9459360773508195, + "learning_rate": 6.433418163932508e-06, + "loss": 0.9685, + "step": 3701 + }, + { + "epoch": 0.6277236116998728, + "grad_norm": 0.9547681146357571, + "learning_rate": 6.428287608327088e-06, + "loss": 0.9433, + "step": 3702 + }, + { + "epoch": 0.627893175074184, + "grad_norm": 0.8781367843593298, + "learning_rate": 6.4231581301394954e-06, + "loss": 0.8929, + "step": 3703 + }, + { + "epoch": 0.6280627384484951, + "grad_norm": 0.9778096975232439, + "learning_rate": 6.418029730917052e-06, + "loss": 0.9581, + "step": 3704 + }, + { + "epoch": 0.6282323018228063, + "grad_norm": 0.9345090889656249, + "learning_rate": 6.41290241220676e-06, + "loss": 0.946, + "step": 3705 + }, + { + "epoch": 0.6284018651971174, + "grad_norm": 0.9558259234034503, + "learning_rate": 6.407776175555285e-06, + "loss": 0.9359, + "step": 3706 + }, + { + "epoch": 0.6285714285714286, + "grad_norm": 0.9989021828966471, + "learning_rate": 6.402651022508975e-06, + "loss": 0.9281, + "step": 3707 + }, + { + "epoch": 0.6287409919457397, + "grad_norm": 0.9518018291265377, + "learning_rate": 6.39752695461384e-06, + "loss": 0.9286, + "step": 3708 + }, + { + "epoch": 0.6289105553200509, + "grad_norm": 0.968957747409856, + "learning_rate": 6.392403973415582e-06, + "loss": 0.9014, + "step": 3709 + }, + { + "epoch": 0.629080118694362, + "grad_norm": 0.9663233660100905, + "learning_rate": 6.387282080459558e-06, + "loss": 0.9474, + "step": 3710 + }, + { + "epoch": 0.6292496820686732, + "grad_norm": 0.9385458433833092, + "learning_rate": 6.382161277290801e-06, + "loss": 0.911, + "step": 3711 + }, + { + "epoch": 0.6294192454429843, + "grad_norm": 0.9443882867952118, + "learning_rate": 6.377041565454021e-06, + "loss": 0.9579, + "step": 3712 + }, + { + "epoch": 0.6295888088172955, + "grad_norm": 0.9606984568748994, + "learning_rate": 6.3719229464935915e-06, + "loss": 0.9256, + "step": 3713 + }, + { + "epoch": 0.6297583721916066, + "grad_norm": 0.9267929555006322, + "learning_rate": 6.3668054219535616e-06, + "loss": 0.9418, + "step": 3714 + }, + { + "epoch": 0.6299279355659178, + "grad_norm": 0.9599468887830896, + "learning_rate": 6.361688993377642e-06, + "loss": 0.9569, + "step": 3715 + }, + { + "epoch": 0.6300974989402289, + "grad_norm": 0.9234688762554217, + "learning_rate": 6.356573662309227e-06, + "loss": 0.9111, + "step": 3716 + }, + { + "epoch": 0.6302670623145401, + "grad_norm": 0.9914524824202674, + "learning_rate": 6.351459430291369e-06, + "loss": 0.958, + "step": 3717 + }, + { + "epoch": 0.6304366256888512, + "grad_norm": 1.0145103772439585, + "learning_rate": 6.3463462988667855e-06, + "loss": 0.971, + "step": 3718 + }, + { + "epoch": 0.6306061890631623, + "grad_norm": 1.0210710754978762, + "learning_rate": 6.341234269577878e-06, + "loss": 0.9677, + "step": 3719 + }, + { + "epoch": 0.6307757524374735, + "grad_norm": 0.9369572208111215, + "learning_rate": 6.3361233439667e-06, + "loss": 0.9618, + "step": 3720 + }, + { + "epoch": 0.6309453158117847, + "grad_norm": 0.9963596004046189, + "learning_rate": 6.331013523574978e-06, + "loss": 0.9354, + "step": 3721 + }, + { + "epoch": 0.6311148791860958, + "grad_norm": 0.9594972352973306, + "learning_rate": 6.3259048099441045e-06, + "loss": 0.9471, + "step": 3722 + }, + { + "epoch": 0.6312844425604069, + "grad_norm": 0.9579331372509057, + "learning_rate": 6.32079720461514e-06, + "loss": 0.9437, + "step": 3723 + }, + { + "epoch": 0.6314540059347181, + "grad_norm": 0.9566115669505098, + "learning_rate": 6.315690709128808e-06, + "loss": 0.9227, + "step": 3724 + }, + { + "epoch": 0.6316235693090293, + "grad_norm": 0.9781590776047974, + "learning_rate": 6.310585325025499e-06, + "loss": 0.9944, + "step": 3725 + }, + { + "epoch": 0.6317931326833404, + "grad_norm": 0.9851828662730695, + "learning_rate": 6.305481053845262e-06, + "loss": 0.985, + "step": 3726 + }, + { + "epoch": 0.6319626960576515, + "grad_norm": 0.9656966272833261, + "learning_rate": 6.300377897127825e-06, + "loss": 0.9383, + "step": 3727 + }, + { + "epoch": 0.6321322594319627, + "grad_norm": 0.9701303138664904, + "learning_rate": 6.295275856412567e-06, + "loss": 0.9607, + "step": 3728 + }, + { + "epoch": 0.6323018228062739, + "grad_norm": 0.992522396568827, + "learning_rate": 6.290174933238531e-06, + "loss": 0.9474, + "step": 3729 + }, + { + "epoch": 0.632471386180585, + "grad_norm": 0.9427977179391128, + "learning_rate": 6.285075129144429e-06, + "loss": 0.8795, + "step": 3730 + }, + { + "epoch": 0.6326409495548961, + "grad_norm": 0.9823425163164353, + "learning_rate": 6.2799764456686326e-06, + "loss": 0.9573, + "step": 3731 + }, + { + "epoch": 0.6328105129292073, + "grad_norm": 0.9443080726175307, + "learning_rate": 6.274878884349174e-06, + "loss": 0.9437, + "step": 3732 + }, + { + "epoch": 0.6329800763035185, + "grad_norm": 0.9234219215681932, + "learning_rate": 6.2697824467237445e-06, + "loss": 0.9029, + "step": 3733 + }, + { + "epoch": 0.6331496396778296, + "grad_norm": 0.9590176411797248, + "learning_rate": 6.2646871343297055e-06, + "loss": 0.9433, + "step": 3734 + }, + { + "epoch": 0.6333192030521407, + "grad_norm": 0.9241641134740813, + "learning_rate": 6.259592948704073e-06, + "loss": 0.8862, + "step": 3735 + }, + { + "epoch": 0.6334887664264519, + "grad_norm": 0.9629136949939168, + "learning_rate": 6.254499891383517e-06, + "loss": 0.979, + "step": 3736 + }, + { + "epoch": 0.6336583298007631, + "grad_norm": 1.0129659639175814, + "learning_rate": 6.249407963904381e-06, + "loss": 0.9525, + "step": 3737 + }, + { + "epoch": 0.6338278931750742, + "grad_norm": 0.9634058449209946, + "learning_rate": 6.244317167802659e-06, + "loss": 0.9786, + "step": 3738 + }, + { + "epoch": 0.6339974565493853, + "grad_norm": 0.947641971181679, + "learning_rate": 6.239227504614004e-06, + "loss": 0.9791, + "step": 3739 + }, + { + "epoch": 0.6341670199236965, + "grad_norm": 0.9732105239017287, + "learning_rate": 6.234138975873724e-06, + "loss": 0.9164, + "step": 3740 + }, + { + "epoch": 0.6343365832980077, + "grad_norm": 0.9918580177844567, + "learning_rate": 6.229051583116796e-06, + "loss": 0.9375, + "step": 3741 + }, + { + "epoch": 0.6345061466723187, + "grad_norm": 0.9768923877564517, + "learning_rate": 6.223965327877846e-06, + "loss": 0.9313, + "step": 3742 + }, + { + "epoch": 0.6346757100466299, + "grad_norm": 0.9860459364488126, + "learning_rate": 6.218880211691154e-06, + "loss": 0.9539, + "step": 3743 + }, + { + "epoch": 0.6348452734209411, + "grad_norm": 0.9437759422673568, + "learning_rate": 6.213796236090661e-06, + "loss": 0.9562, + "step": 3744 + }, + { + "epoch": 0.6350148367952523, + "grad_norm": 0.9368994356142542, + "learning_rate": 6.208713402609968e-06, + "loss": 0.945, + "step": 3745 + }, + { + "epoch": 0.6351844001695633, + "grad_norm": 0.9429873960550915, + "learning_rate": 6.2036317127823264e-06, + "loss": 0.9271, + "step": 3746 + }, + { + "epoch": 0.6353539635438745, + "grad_norm": 1.0032998718661652, + "learning_rate": 6.198551168140638e-06, + "loss": 0.9877, + "step": 3747 + }, + { + "epoch": 0.6355235269181857, + "grad_norm": 0.986679958402424, + "learning_rate": 6.1934717702174714e-06, + "loss": 0.9737, + "step": 3748 + }, + { + "epoch": 0.6356930902924968, + "grad_norm": 0.9378020942782609, + "learning_rate": 6.1883935205450396e-06, + "loss": 0.9318, + "step": 3749 + }, + { + "epoch": 0.6358626536668079, + "grad_norm": 0.9341357952399267, + "learning_rate": 6.183316420655212e-06, + "loss": 0.9337, + "step": 3750 + }, + { + "epoch": 0.6360322170411191, + "grad_norm": 0.9382169870334379, + "learning_rate": 6.178240472079504e-06, + "loss": 0.9413, + "step": 3751 + }, + { + "epoch": 0.6362017804154303, + "grad_norm": 0.9653000561077801, + "learning_rate": 6.173165676349103e-06, + "loss": 0.9426, + "step": 3752 + }, + { + "epoch": 0.6363713437897414, + "grad_norm": 0.9758697136872103, + "learning_rate": 6.168092034994832e-06, + "loss": 0.9922, + "step": 3753 + }, + { + "epoch": 0.6365409071640525, + "grad_norm": 0.9569341048053986, + "learning_rate": 6.163019549547163e-06, + "loss": 0.9025, + "step": 3754 + }, + { + "epoch": 0.6367104705383637, + "grad_norm": 0.9378219496212093, + "learning_rate": 6.157948221536237e-06, + "loss": 0.9276, + "step": 3755 + }, + { + "epoch": 0.6368800339126749, + "grad_norm": 0.9232938646637593, + "learning_rate": 6.152878052491831e-06, + "loss": 0.9198, + "step": 3756 + }, + { + "epoch": 0.637049597286986, + "grad_norm": 0.9989026050269741, + "learning_rate": 6.1478090439433776e-06, + "loss": 0.9892, + "step": 3757 + }, + { + "epoch": 0.6372191606612971, + "grad_norm": 0.9691622920384532, + "learning_rate": 6.142741197419955e-06, + "loss": 0.9326, + "step": 3758 + }, + { + "epoch": 0.6373887240356083, + "grad_norm": 0.9522085365363817, + "learning_rate": 6.1376745144503e-06, + "loss": 0.9366, + "step": 3759 + }, + { + "epoch": 0.6375582874099195, + "grad_norm": 0.9582571265377816, + "learning_rate": 6.13260899656279e-06, + "loss": 0.9359, + "step": 3760 + }, + { + "epoch": 0.6377278507842306, + "grad_norm": 0.9364641420730115, + "learning_rate": 6.127544645285448e-06, + "loss": 0.9375, + "step": 3761 + }, + { + "epoch": 0.6378974141585417, + "grad_norm": 1.0005435890273477, + "learning_rate": 6.1224814621459625e-06, + "loss": 0.9519, + "step": 3762 + }, + { + "epoch": 0.6380669775328529, + "grad_norm": 0.9286981109389738, + "learning_rate": 6.117419448671651e-06, + "loss": 0.9446, + "step": 3763 + }, + { + "epoch": 0.6382365409071641, + "grad_norm": 0.9339352276437187, + "learning_rate": 6.112358606389488e-06, + "loss": 0.9589, + "step": 3764 + }, + { + "epoch": 0.6384061042814752, + "grad_norm": 1.0168912488267274, + "learning_rate": 6.107298936826086e-06, + "loss": 0.9731, + "step": 3765 + }, + { + "epoch": 0.6385756676557863, + "grad_norm": 0.9931397852253987, + "learning_rate": 6.102240441507716e-06, + "loss": 0.9505, + "step": 3766 + }, + { + "epoch": 0.6387452310300975, + "grad_norm": 0.9370962567417507, + "learning_rate": 6.097183121960286e-06, + "loss": 0.9326, + "step": 3767 + }, + { + "epoch": 0.6389147944044087, + "grad_norm": 0.9632737865310094, + "learning_rate": 6.092126979709354e-06, + "loss": 0.963, + "step": 3768 + }, + { + "epoch": 0.6390843577787197, + "grad_norm": 0.966099343780355, + "learning_rate": 6.087072016280111e-06, + "loss": 0.9369, + "step": 3769 + }, + { + "epoch": 0.6392539211530309, + "grad_norm": 0.9639035738860314, + "learning_rate": 6.082018233197415e-06, + "loss": 0.9436, + "step": 3770 + }, + { + "epoch": 0.6394234845273421, + "grad_norm": 0.9777180311286494, + "learning_rate": 6.07696563198575e-06, + "loss": 0.94, + "step": 3771 + }, + { + "epoch": 0.6395930479016533, + "grad_norm": 0.9860377213178608, + "learning_rate": 6.0719142141692435e-06, + "loss": 0.9461, + "step": 3772 + }, + { + "epoch": 0.6397626112759643, + "grad_norm": 0.9887980459181656, + "learning_rate": 6.066863981271678e-06, + "loss": 0.8976, + "step": 3773 + }, + { + "epoch": 0.6399321746502755, + "grad_norm": 0.9540848540347339, + "learning_rate": 6.06181493481647e-06, + "loss": 0.9479, + "step": 3774 + }, + { + "epoch": 0.6401017380245867, + "grad_norm": 0.9755088151515949, + "learning_rate": 6.0567670763266775e-06, + "loss": 0.9295, + "step": 3775 + }, + { + "epoch": 0.6402713013988979, + "grad_norm": 0.9276352113353674, + "learning_rate": 6.0517204073250015e-06, + "loss": 0.9438, + "step": 3776 + }, + { + "epoch": 0.6404408647732089, + "grad_norm": 0.9487214696110207, + "learning_rate": 6.046674929333787e-06, + "loss": 0.9228, + "step": 3777 + }, + { + "epoch": 0.6406104281475201, + "grad_norm": 0.975742034416424, + "learning_rate": 6.041630643875018e-06, + "loss": 0.9324, + "step": 3778 + }, + { + "epoch": 0.6407799915218313, + "grad_norm": 0.977433558812134, + "learning_rate": 6.036587552470313e-06, + "loss": 0.9122, + "step": 3779 + }, + { + "epoch": 0.6409495548961425, + "grad_norm": 0.926737421211832, + "learning_rate": 6.031545656640945e-06, + "loss": 0.9258, + "step": 3780 + }, + { + "epoch": 0.6411191182704535, + "grad_norm": 0.9530536859805545, + "learning_rate": 6.0265049579078125e-06, + "loss": 0.9563, + "step": 3781 + }, + { + "epoch": 0.6412886816447647, + "grad_norm": 0.9415541557942966, + "learning_rate": 6.021465457791458e-06, + "loss": 0.914, + "step": 3782 + }, + { + "epoch": 0.6414582450190759, + "grad_norm": 0.9704935297194401, + "learning_rate": 6.016427157812057e-06, + "loss": 0.9522, + "step": 3783 + }, + { + "epoch": 0.6416278083933871, + "grad_norm": 0.9813209749295353, + "learning_rate": 6.011390059489437e-06, + "loss": 0.9371, + "step": 3784 + }, + { + "epoch": 0.6417973717676981, + "grad_norm": 1.0186898672041642, + "learning_rate": 6.006354164343047e-06, + "loss": 0.9827, + "step": 3785 + }, + { + "epoch": 0.6419669351420093, + "grad_norm": 0.919147310639371, + "learning_rate": 6.0013194738919836e-06, + "loss": 0.9132, + "step": 3786 + }, + { + "epoch": 0.6421364985163205, + "grad_norm": 0.9869698742429506, + "learning_rate": 5.9962859896549695e-06, + "loss": 0.97, + "step": 3787 + }, + { + "epoch": 0.6423060618906317, + "grad_norm": 0.9440495931648389, + "learning_rate": 5.99125371315038e-06, + "loss": 0.9385, + "step": 3788 + }, + { + "epoch": 0.6424756252649427, + "grad_norm": 0.6225933578146026, + "learning_rate": 5.986222645896214e-06, + "loss": 0.7874, + "step": 3789 + }, + { + "epoch": 0.6426451886392539, + "grad_norm": 0.9438024709284348, + "learning_rate": 5.981192789410101e-06, + "loss": 0.9397, + "step": 3790 + }, + { + "epoch": 0.6428147520135651, + "grad_norm": 0.932640101026457, + "learning_rate": 5.9761641452093225e-06, + "loss": 0.9469, + "step": 3791 + }, + { + "epoch": 0.6429843153878763, + "grad_norm": 0.9703739634846951, + "learning_rate": 5.971136714810779e-06, + "loss": 0.9159, + "step": 3792 + }, + { + "epoch": 0.6431538787621873, + "grad_norm": 0.9938946680382339, + "learning_rate": 5.96611049973101e-06, + "loss": 0.9509, + "step": 3793 + }, + { + "epoch": 0.6433234421364985, + "grad_norm": 0.9557716240954923, + "learning_rate": 5.961085501486188e-06, + "loss": 0.9451, + "step": 3794 + }, + { + "epoch": 0.6434930055108097, + "grad_norm": 0.8732046374030294, + "learning_rate": 5.956061721592121e-06, + "loss": 0.939, + "step": 3795 + }, + { + "epoch": 0.6436625688851209, + "grad_norm": 0.5954221307547285, + "learning_rate": 5.951039161564247e-06, + "loss": 0.7569, + "step": 3796 + }, + { + "epoch": 0.6438321322594319, + "grad_norm": 0.9719689605506557, + "learning_rate": 5.946017822917632e-06, + "loss": 0.9531, + "step": 3797 + }, + { + "epoch": 0.6440016956337431, + "grad_norm": 0.9339359663445854, + "learning_rate": 5.940997707166986e-06, + "loss": 0.9529, + "step": 3798 + }, + { + "epoch": 0.6441712590080543, + "grad_norm": 0.9944675742016742, + "learning_rate": 5.935978815826638e-06, + "loss": 0.965, + "step": 3799 + }, + { + "epoch": 0.6443408223823655, + "grad_norm": 0.9548131645939382, + "learning_rate": 5.9309611504105505e-06, + "loss": 0.933, + "step": 3800 + }, + { + "epoch": 0.6445103857566765, + "grad_norm": 0.9704580915180497, + "learning_rate": 5.925944712432317e-06, + "loss": 0.9392, + "step": 3801 + }, + { + "epoch": 0.6446799491309877, + "grad_norm": 0.9499939702296947, + "learning_rate": 5.920929503405162e-06, + "loss": 0.9333, + "step": 3802 + }, + { + "epoch": 0.6448495125052989, + "grad_norm": 0.9651937529700388, + "learning_rate": 5.915915524841941e-06, + "loss": 0.9614, + "step": 3803 + }, + { + "epoch": 0.64501907587961, + "grad_norm": 0.9698160965435713, + "learning_rate": 5.910902778255134e-06, + "loss": 0.9344, + "step": 3804 + }, + { + "epoch": 0.6451886392539211, + "grad_norm": 0.9611238286463303, + "learning_rate": 5.905891265156849e-06, + "loss": 0.9399, + "step": 3805 + }, + { + "epoch": 0.6453582026282323, + "grad_norm": 0.9064569039707927, + "learning_rate": 5.9008809870588276e-06, + "loss": 0.9243, + "step": 3806 + }, + { + "epoch": 0.6455277660025435, + "grad_norm": 0.9173716267102014, + "learning_rate": 5.895871945472434e-06, + "loss": 0.9118, + "step": 3807 + }, + { + "epoch": 0.6456973293768546, + "grad_norm": 0.9376133180151249, + "learning_rate": 5.890864141908656e-06, + "loss": 0.9075, + "step": 3808 + }, + { + "epoch": 0.6458668927511657, + "grad_norm": 0.9605866500487444, + "learning_rate": 5.885857577878122e-06, + "loss": 0.9589, + "step": 3809 + }, + { + "epoch": 0.6460364561254769, + "grad_norm": 0.984050906627267, + "learning_rate": 5.880852254891072e-06, + "loss": 0.9306, + "step": 3810 + }, + { + "epoch": 0.6462060194997881, + "grad_norm": 0.9734765748501751, + "learning_rate": 5.875848174457377e-06, + "loss": 0.9293, + "step": 3811 + }, + { + "epoch": 0.6463755828740992, + "grad_norm": 0.9684380212094916, + "learning_rate": 5.870845338086532e-06, + "loss": 0.9011, + "step": 3812 + }, + { + "epoch": 0.6465451462484103, + "grad_norm": 0.9709957897262544, + "learning_rate": 5.865843747287659e-06, + "loss": 0.9724, + "step": 3813 + }, + { + "epoch": 0.6467147096227215, + "grad_norm": 1.0134347473367822, + "learning_rate": 5.860843403569504e-06, + "loss": 0.9356, + "step": 3814 + }, + { + "epoch": 0.6468842729970327, + "grad_norm": 0.9605237921787253, + "learning_rate": 5.855844308440429e-06, + "loss": 0.9342, + "step": 3815 + }, + { + "epoch": 0.6470538363713438, + "grad_norm": 0.9836826905598642, + "learning_rate": 5.850846463408437e-06, + "loss": 0.931, + "step": 3816 + }, + { + "epoch": 0.6472233997456549, + "grad_norm": 1.0025283263163867, + "learning_rate": 5.845849869981137e-06, + "loss": 0.961, + "step": 3817 + }, + { + "epoch": 0.6473929631199661, + "grad_norm": 1.0059098553739443, + "learning_rate": 5.840854529665767e-06, + "loss": 0.9505, + "step": 3818 + }, + { + "epoch": 0.6475625264942773, + "grad_norm": 0.9315054540105335, + "learning_rate": 5.835860443969185e-06, + "loss": 0.9187, + "step": 3819 + }, + { + "epoch": 0.6477320898685884, + "grad_norm": 0.9623265219691896, + "learning_rate": 5.830867614397876e-06, + "loss": 0.9246, + "step": 3820 + }, + { + "epoch": 0.6479016532428995, + "grad_norm": 0.9317562830213191, + "learning_rate": 5.825876042457939e-06, + "loss": 0.9115, + "step": 3821 + }, + { + "epoch": 0.6480712166172107, + "grad_norm": 0.9688940693021881, + "learning_rate": 5.820885729655098e-06, + "loss": 0.9706, + "step": 3822 + }, + { + "epoch": 0.6482407799915219, + "grad_norm": 0.9099049956166796, + "learning_rate": 5.815896677494692e-06, + "loss": 0.9486, + "step": 3823 + }, + { + "epoch": 0.648410343365833, + "grad_norm": 0.952563353729191, + "learning_rate": 5.81090888748169e-06, + "loss": 0.9336, + "step": 3824 + }, + { + "epoch": 0.6485799067401441, + "grad_norm": 0.9591061101884213, + "learning_rate": 5.8059223611206716e-06, + "loss": 0.9056, + "step": 3825 + }, + { + "epoch": 0.6487494701144553, + "grad_norm": 0.9856020764612605, + "learning_rate": 5.800937099915833e-06, + "loss": 0.9542, + "step": 3826 + }, + { + "epoch": 0.6489190334887665, + "grad_norm": 1.0106116342078262, + "learning_rate": 5.795953105371e-06, + "loss": 0.9407, + "step": 3827 + }, + { + "epoch": 0.6490885968630776, + "grad_norm": 0.9591800443987448, + "learning_rate": 5.790970378989609e-06, + "loss": 0.9132, + "step": 3828 + }, + { + "epoch": 0.6492581602373887, + "grad_norm": 0.9370959826643205, + "learning_rate": 5.785988922274711e-06, + "loss": 0.9464, + "step": 3829 + }, + { + "epoch": 0.6494277236116999, + "grad_norm": 1.0294550988290696, + "learning_rate": 5.781008736728975e-06, + "loss": 0.9335, + "step": 3830 + }, + { + "epoch": 0.649597286986011, + "grad_norm": 0.9735705262145118, + "learning_rate": 5.776029823854697e-06, + "loss": 0.9872, + "step": 3831 + }, + { + "epoch": 0.6497668503603222, + "grad_norm": 0.9485886272470302, + "learning_rate": 5.771052185153776e-06, + "loss": 0.9537, + "step": 3832 + }, + { + "epoch": 0.6499364137346333, + "grad_norm": 0.9329816930282926, + "learning_rate": 5.766075822127735e-06, + "loss": 0.9214, + "step": 3833 + }, + { + "epoch": 0.6501059771089445, + "grad_norm": 0.9305533041366779, + "learning_rate": 5.761100736277704e-06, + "loss": 0.8937, + "step": 3834 + }, + { + "epoch": 0.6502755404832556, + "grad_norm": 0.9386634109485363, + "learning_rate": 5.756126929104435e-06, + "loss": 0.953, + "step": 3835 + }, + { + "epoch": 0.6504451038575668, + "grad_norm": 0.9571181726780283, + "learning_rate": 5.7511544021082945e-06, + "loss": 0.9222, + "step": 3836 + }, + { + "epoch": 0.6506146672318779, + "grad_norm": 1.0052270376815449, + "learning_rate": 5.746183156789252e-06, + "loss": 0.9686, + "step": 3837 + }, + { + "epoch": 0.6507842306061891, + "grad_norm": 0.9892800739945252, + "learning_rate": 5.741213194646911e-06, + "loss": 0.9593, + "step": 3838 + }, + { + "epoch": 0.6509537939805002, + "grad_norm": 0.92406570230854, + "learning_rate": 5.736244517180467e-06, + "loss": 0.8803, + "step": 3839 + }, + { + "epoch": 0.6511233573548114, + "grad_norm": 0.9683825539874887, + "learning_rate": 5.731277125888739e-06, + "loss": 0.9169, + "step": 3840 + }, + { + "epoch": 0.6512929207291225, + "grad_norm": 0.9381000607399452, + "learning_rate": 5.726311022270152e-06, + "loss": 0.9638, + "step": 3841 + }, + { + "epoch": 0.6514624841034337, + "grad_norm": 0.9684415990772047, + "learning_rate": 5.721346207822753e-06, + "loss": 0.9062, + "step": 3842 + }, + { + "epoch": 0.6516320474777448, + "grad_norm": 0.9829332906301349, + "learning_rate": 5.716382684044191e-06, + "loss": 0.9929, + "step": 3843 + }, + { + "epoch": 0.6518016108520559, + "grad_norm": 0.9709442787331626, + "learning_rate": 5.711420452431721e-06, + "loss": 0.9419, + "step": 3844 + }, + { + "epoch": 0.6519711742263671, + "grad_norm": 0.9892489756371844, + "learning_rate": 5.706459514482226e-06, + "loss": 0.9546, + "step": 3845 + }, + { + "epoch": 0.6521407376006783, + "grad_norm": 0.9324614807963302, + "learning_rate": 5.701499871692182e-06, + "loss": 0.8961, + "step": 3846 + }, + { + "epoch": 0.6523103009749894, + "grad_norm": 0.9628225703296495, + "learning_rate": 5.696541525557682e-06, + "loss": 0.917, + "step": 3847 + }, + { + "epoch": 0.6524798643493005, + "grad_norm": 1.002203238049265, + "learning_rate": 5.691584477574419e-06, + "loss": 0.9333, + "step": 3848 + }, + { + "epoch": 0.6526494277236117, + "grad_norm": 0.9661109640999909, + "learning_rate": 5.686628729237713e-06, + "loss": 0.9547, + "step": 3849 + }, + { + "epoch": 0.6528189910979229, + "grad_norm": 0.6236182732040507, + "learning_rate": 5.681674282042475e-06, + "loss": 0.8047, + "step": 3850 + }, + { + "epoch": 0.652988554472234, + "grad_norm": 0.8645617052005022, + "learning_rate": 5.676721137483226e-06, + "loss": 0.8993, + "step": 3851 + }, + { + "epoch": 0.6531581178465451, + "grad_norm": 1.004583038148914, + "learning_rate": 5.671769297054103e-06, + "loss": 0.9492, + "step": 3852 + }, + { + "epoch": 0.6533276812208563, + "grad_norm": 0.9631994308494995, + "learning_rate": 5.66681876224884e-06, + "loss": 0.9508, + "step": 3853 + }, + { + "epoch": 0.6534972445951674, + "grad_norm": 1.010882760062839, + "learning_rate": 5.661869534560782e-06, + "loss": 0.9654, + "step": 3854 + }, + { + "epoch": 0.6536668079694786, + "grad_norm": 0.968272483345355, + "learning_rate": 5.6569216154828776e-06, + "loss": 0.9242, + "step": 3855 + }, + { + "epoch": 0.6538363713437897, + "grad_norm": 0.9977732737322774, + "learning_rate": 5.6519750065076815e-06, + "loss": 0.9713, + "step": 3856 + }, + { + "epoch": 0.6540059347181009, + "grad_norm": 1.065488571152939, + "learning_rate": 5.647029709127355e-06, + "loss": 0.9944, + "step": 3857 + }, + { + "epoch": 0.654175498092412, + "grad_norm": 0.9604559546730017, + "learning_rate": 5.642085724833656e-06, + "loss": 0.9414, + "step": 3858 + }, + { + "epoch": 0.6543450614667232, + "grad_norm": 0.9678646760492071, + "learning_rate": 5.637143055117959e-06, + "loss": 0.9472, + "step": 3859 + }, + { + "epoch": 0.6545146248410343, + "grad_norm": 0.9492250514742354, + "learning_rate": 5.632201701471236e-06, + "loss": 0.9029, + "step": 3860 + }, + { + "epoch": 0.6546841882153455, + "grad_norm": 0.9879380930784467, + "learning_rate": 5.627261665384056e-06, + "loss": 0.9553, + "step": 3861 + }, + { + "epoch": 0.6548537515896566, + "grad_norm": 1.0105991334684823, + "learning_rate": 5.622322948346595e-06, + "loss": 0.9474, + "step": 3862 + }, + { + "epoch": 0.6550233149639678, + "grad_norm": 0.960704636250499, + "learning_rate": 5.6173855518486385e-06, + "loss": 0.9325, + "step": 3863 + }, + { + "epoch": 0.6551928783382789, + "grad_norm": 0.5786385742975265, + "learning_rate": 5.612449477379564e-06, + "loss": 0.7499, + "step": 3864 + }, + { + "epoch": 0.6553624417125901, + "grad_norm": 1.025347047899016, + "learning_rate": 5.6075147264283526e-06, + "loss": 0.9808, + "step": 3865 + }, + { + "epoch": 0.6555320050869012, + "grad_norm": 0.6307921451292178, + "learning_rate": 5.602581300483583e-06, + "loss": 0.793, + "step": 3866 + }, + { + "epoch": 0.6557015684612124, + "grad_norm": 1.0464031003151537, + "learning_rate": 5.597649201033446e-06, + "loss": 1.006, + "step": 3867 + }, + { + "epoch": 0.6558711318355235, + "grad_norm": 0.9623465383139722, + "learning_rate": 5.592718429565721e-06, + "loss": 0.9097, + "step": 3868 + }, + { + "epoch": 0.6560406952098347, + "grad_norm": 0.9668153720541863, + "learning_rate": 5.587788987567785e-06, + "loss": 0.9648, + "step": 3869 + }, + { + "epoch": 0.6562102585841458, + "grad_norm": 0.9592803462912153, + "learning_rate": 5.582860876526628e-06, + "loss": 0.9555, + "step": 3870 + }, + { + "epoch": 0.656379821958457, + "grad_norm": 0.9203939591580858, + "learning_rate": 5.577934097928824e-06, + "loss": 0.8892, + "step": 3871 + }, + { + "epoch": 0.6565493853327681, + "grad_norm": 0.9895137644750535, + "learning_rate": 5.573008653260552e-06, + "loss": 0.9467, + "step": 3872 + }, + { + "epoch": 0.6567189487070793, + "grad_norm": 0.9887888152140254, + "learning_rate": 5.5680845440075885e-06, + "loss": 0.937, + "step": 3873 + }, + { + "epoch": 0.6568885120813904, + "grad_norm": 0.940673035184092, + "learning_rate": 5.5631617716553035e-06, + "loss": 0.9485, + "step": 3874 + }, + { + "epoch": 0.6570580754557016, + "grad_norm": 0.6544679125043726, + "learning_rate": 5.558240337688667e-06, + "loss": 0.7899, + "step": 3875 + }, + { + "epoch": 0.6572276388300127, + "grad_norm": 0.9255038306115664, + "learning_rate": 5.553320243592239e-06, + "loss": 0.9038, + "step": 3876 + }, + { + "epoch": 0.6573972022043238, + "grad_norm": 0.9437432305339511, + "learning_rate": 5.548401490850193e-06, + "loss": 0.9277, + "step": 3877 + }, + { + "epoch": 0.657566765578635, + "grad_norm": 0.9704995551272598, + "learning_rate": 5.5434840809462775e-06, + "loss": 0.9464, + "step": 3878 + }, + { + "epoch": 0.6577363289529462, + "grad_norm": 1.0094997653032582, + "learning_rate": 5.538568015363846e-06, + "loss": 0.9902, + "step": 3879 + }, + { + "epoch": 0.6579058923272573, + "grad_norm": 0.9978035361638913, + "learning_rate": 5.533653295585839e-06, + "loss": 0.9303, + "step": 3880 + }, + { + "epoch": 0.6580754557015684, + "grad_norm": 0.9707163439981172, + "learning_rate": 5.528739923094806e-06, + "loss": 0.9003, + "step": 3881 + }, + { + "epoch": 0.6582450190758796, + "grad_norm": 0.9568219375540896, + "learning_rate": 5.523827899372876e-06, + "loss": 0.9279, + "step": 3882 + }, + { + "epoch": 0.6584145824501908, + "grad_norm": 0.9631106481386111, + "learning_rate": 5.518917225901777e-06, + "loss": 0.9222, + "step": 3883 + }, + { + "epoch": 0.6585841458245019, + "grad_norm": 0.971706362781142, + "learning_rate": 5.514007904162822e-06, + "loss": 0.9283, + "step": 3884 + }, + { + "epoch": 0.658753709198813, + "grad_norm": 0.9341604651522025, + "learning_rate": 5.509099935636932e-06, + "loss": 0.95, + "step": 3885 + }, + { + "epoch": 0.6589232725731242, + "grad_norm": 0.9506504484405172, + "learning_rate": 5.504193321804607e-06, + "loss": 0.899, + "step": 3886 + }, + { + "epoch": 0.6590928359474354, + "grad_norm": 1.0314193236694018, + "learning_rate": 5.499288064145938e-06, + "loss": 0.9285, + "step": 3887 + }, + { + "epoch": 0.6592623993217465, + "grad_norm": 0.9683228044749517, + "learning_rate": 5.4943841641406185e-06, + "loss": 0.9423, + "step": 3888 + }, + { + "epoch": 0.6594319626960576, + "grad_norm": 0.9681478850057504, + "learning_rate": 5.4894816232679195e-06, + "loss": 0.9453, + "step": 3889 + }, + { + "epoch": 0.6596015260703688, + "grad_norm": 0.971662525395111, + "learning_rate": 5.484580443006709e-06, + "loss": 0.9455, + "step": 3890 + }, + { + "epoch": 0.65977108944468, + "grad_norm": 0.9107106240362632, + "learning_rate": 5.4796806248354416e-06, + "loss": 0.9114, + "step": 3891 + }, + { + "epoch": 0.6599406528189911, + "grad_norm": 0.9915398558162213, + "learning_rate": 5.474782170232163e-06, + "loss": 1.0068, + "step": 3892 + }, + { + "epoch": 0.6601102161933022, + "grad_norm": 0.9827792870127341, + "learning_rate": 5.469885080674508e-06, + "loss": 0.9699, + "step": 3893 + }, + { + "epoch": 0.6602797795676134, + "grad_norm": 0.9440423204304675, + "learning_rate": 5.464989357639692e-06, + "loss": 0.9423, + "step": 3894 + }, + { + "epoch": 0.6604493429419246, + "grad_norm": 0.957413072906512, + "learning_rate": 5.460095002604533e-06, + "loss": 0.9066, + "step": 3895 + }, + { + "epoch": 0.6606189063162357, + "grad_norm": 0.9779414388238925, + "learning_rate": 5.455202017045425e-06, + "loss": 0.9446, + "step": 3896 + }, + { + "epoch": 0.6607884696905468, + "grad_norm": 0.9884500073975315, + "learning_rate": 5.450310402438353e-06, + "loss": 0.9554, + "step": 3897 + }, + { + "epoch": 0.660958033064858, + "grad_norm": 0.9643114567036216, + "learning_rate": 5.445420160258881e-06, + "loss": 0.9272, + "step": 3898 + }, + { + "epoch": 0.6611275964391692, + "grad_norm": 0.9371902506912387, + "learning_rate": 5.440531291982173e-06, + "loss": 0.9244, + "step": 3899 + }, + { + "epoch": 0.6612971598134803, + "grad_norm": 1.0093923687858606, + "learning_rate": 5.435643799082969e-06, + "loss": 0.9682, + "step": 3900 + }, + { + "epoch": 0.6614667231877914, + "grad_norm": 0.970835594452102, + "learning_rate": 5.4307576830355945e-06, + "loss": 0.9337, + "step": 3901 + }, + { + "epoch": 0.6616362865621026, + "grad_norm": 1.0176182433371783, + "learning_rate": 5.425872945313959e-06, + "loss": 0.9603, + "step": 3902 + }, + { + "epoch": 0.6618058499364138, + "grad_norm": 0.9727334570886468, + "learning_rate": 5.420989587391564e-06, + "loss": 0.9417, + "step": 3903 + }, + { + "epoch": 0.6619754133107248, + "grad_norm": 0.9300390063756614, + "learning_rate": 5.416107610741487e-06, + "loss": 0.9452, + "step": 3904 + }, + { + "epoch": 0.662144976685036, + "grad_norm": 1.0049390271482403, + "learning_rate": 5.4112270168363854e-06, + "loss": 0.9614, + "step": 3905 + }, + { + "epoch": 0.6623145400593472, + "grad_norm": 0.988710600579722, + "learning_rate": 5.406347807148515e-06, + "loss": 0.9333, + "step": 3906 + }, + { + "epoch": 0.6624841034336584, + "grad_norm": 0.9856461056387464, + "learning_rate": 5.401469983149699e-06, + "loss": 0.9422, + "step": 3907 + }, + { + "epoch": 0.6626536668079694, + "grad_norm": 0.9737655760456169, + "learning_rate": 5.396593546311346e-06, + "loss": 0.919, + "step": 3908 + }, + { + "epoch": 0.6628232301822806, + "grad_norm": 0.9382680904507146, + "learning_rate": 5.391718498104451e-06, + "loss": 0.9216, + "step": 3909 + }, + { + "epoch": 0.6629927935565918, + "grad_norm": 0.9878460275517319, + "learning_rate": 5.386844839999586e-06, + "loss": 0.9749, + "step": 3910 + }, + { + "epoch": 0.663162356930903, + "grad_norm": 0.9358420654936995, + "learning_rate": 5.381972573466905e-06, + "loss": 0.9314, + "step": 3911 + }, + { + "epoch": 0.663331920305214, + "grad_norm": 1.057242037704201, + "learning_rate": 5.377101699976135e-06, + "loss": 0.8988, + "step": 3912 + }, + { + "epoch": 0.6635014836795252, + "grad_norm": 0.9807759525794161, + "learning_rate": 5.3722322209966024e-06, + "loss": 0.9191, + "step": 3913 + }, + { + "epoch": 0.6636710470538364, + "grad_norm": 0.9838550976211243, + "learning_rate": 5.367364137997193e-06, + "loss": 0.8975, + "step": 3914 + }, + { + "epoch": 0.6638406104281476, + "grad_norm": 0.9858722626023122, + "learning_rate": 5.362497452446379e-06, + "loss": 0.9523, + "step": 3915 + }, + { + "epoch": 0.6640101738024586, + "grad_norm": 0.9917628083410989, + "learning_rate": 5.357632165812208e-06, + "loss": 0.9357, + "step": 3916 + }, + { + "epoch": 0.6641797371767698, + "grad_norm": 0.9690695087891623, + "learning_rate": 5.352768279562315e-06, + "loss": 0.9444, + "step": 3917 + }, + { + "epoch": 0.664349300551081, + "grad_norm": 0.9701944233249534, + "learning_rate": 5.3479057951639034e-06, + "loss": 0.9505, + "step": 3918 + }, + { + "epoch": 0.6645188639253922, + "grad_norm": 0.9871633133861889, + "learning_rate": 5.343044714083756e-06, + "loss": 0.9445, + "step": 3919 + }, + { + "epoch": 0.6646884272997032, + "grad_norm": 0.973757320636307, + "learning_rate": 5.338185037788228e-06, + "loss": 0.945, + "step": 3920 + }, + { + "epoch": 0.6648579906740144, + "grad_norm": 0.9400542642201342, + "learning_rate": 5.333326767743263e-06, + "loss": 0.9071, + "step": 3921 + }, + { + "epoch": 0.6650275540483256, + "grad_norm": 0.9742709175463057, + "learning_rate": 5.3284699054143705e-06, + "loss": 0.9504, + "step": 3922 + }, + { + "epoch": 0.6651971174226368, + "grad_norm": 0.9640296549247036, + "learning_rate": 5.323614452266632e-06, + "loss": 0.9329, + "step": 3923 + }, + { + "epoch": 0.6653666807969478, + "grad_norm": 1.0002729476983498, + "learning_rate": 5.318760409764718e-06, + "loss": 0.9387, + "step": 3924 + }, + { + "epoch": 0.665536244171259, + "grad_norm": 0.9423150822636535, + "learning_rate": 5.313907779372862e-06, + "loss": 0.9324, + "step": 3925 + }, + { + "epoch": 0.6657058075455702, + "grad_norm": 1.0138549107572683, + "learning_rate": 5.3090565625548755e-06, + "loss": 0.9453, + "step": 3926 + }, + { + "epoch": 0.6658753709198814, + "grad_norm": 0.9378443199931459, + "learning_rate": 5.304206760774139e-06, + "loss": 0.9117, + "step": 3927 + }, + { + "epoch": 0.6660449342941924, + "grad_norm": 0.9330418978120747, + "learning_rate": 5.299358375493613e-06, + "loss": 0.9395, + "step": 3928 + }, + { + "epoch": 0.6662144976685036, + "grad_norm": 0.9606426036261803, + "learning_rate": 5.294511408175825e-06, + "loss": 0.947, + "step": 3929 + }, + { + "epoch": 0.6663840610428148, + "grad_norm": 0.9418675926648232, + "learning_rate": 5.289665860282877e-06, + "loss": 0.9268, + "step": 3930 + }, + { + "epoch": 0.666553624417126, + "grad_norm": 0.964392078095706, + "learning_rate": 5.2848217332764476e-06, + "loss": 0.9234, + "step": 3931 + }, + { + "epoch": 0.666723187791437, + "grad_norm": 0.9787977772815747, + "learning_rate": 5.279979028617781e-06, + "loss": 0.9418, + "step": 3932 + }, + { + "epoch": 0.6668927511657482, + "grad_norm": 0.9123543852829185, + "learning_rate": 5.275137747767691e-06, + "loss": 0.9292, + "step": 3933 + }, + { + "epoch": 0.6670623145400594, + "grad_norm": 0.9175440557504575, + "learning_rate": 5.270297892186563e-06, + "loss": 0.9214, + "step": 3934 + }, + { + "epoch": 0.6672318779143704, + "grad_norm": 0.9707448908463467, + "learning_rate": 5.265459463334361e-06, + "loss": 0.9442, + "step": 3935 + }, + { + "epoch": 0.6674014412886816, + "grad_norm": 0.9549867467941378, + "learning_rate": 5.260622462670608e-06, + "loss": 0.9336, + "step": 3936 + }, + { + "epoch": 0.6675710046629928, + "grad_norm": 1.0282995465836202, + "learning_rate": 5.2557868916543996e-06, + "loss": 0.9546, + "step": 3937 + }, + { + "epoch": 0.667740568037304, + "grad_norm": 0.9140365773329413, + "learning_rate": 5.250952751744396e-06, + "loss": 0.954, + "step": 3938 + }, + { + "epoch": 0.667910131411615, + "grad_norm": 0.9342080110904891, + "learning_rate": 5.246120044398839e-06, + "loss": 0.8971, + "step": 3939 + }, + { + "epoch": 0.6680796947859262, + "grad_norm": 0.9388258644564519, + "learning_rate": 5.241288771075526e-06, + "loss": 0.9261, + "step": 3940 + }, + { + "epoch": 0.6682492581602374, + "grad_norm": 0.9848489745521355, + "learning_rate": 5.236458933231818e-06, + "loss": 0.9394, + "step": 3941 + }, + { + "epoch": 0.6684188215345486, + "grad_norm": 0.9546401120623125, + "learning_rate": 5.231630532324661e-06, + "loss": 0.923, + "step": 3942 + }, + { + "epoch": 0.6685883849088596, + "grad_norm": 0.9790372986013418, + "learning_rate": 5.226803569810552e-06, + "loss": 0.9494, + "step": 3943 + }, + { + "epoch": 0.6687579482831708, + "grad_norm": 0.9511914741826248, + "learning_rate": 5.221978047145559e-06, + "loss": 0.9142, + "step": 3944 + }, + { + "epoch": 0.668927511657482, + "grad_norm": 0.979204322958994, + "learning_rate": 5.217153965785315e-06, + "loss": 0.9561, + "step": 3945 + }, + { + "epoch": 0.6690970750317932, + "grad_norm": 0.9882434520547869, + "learning_rate": 5.21233132718502e-06, + "loss": 0.9754, + "step": 3946 + }, + { + "epoch": 0.6692666384061042, + "grad_norm": 1.0240698051581076, + "learning_rate": 5.207510132799436e-06, + "loss": 0.9738, + "step": 3947 + }, + { + "epoch": 0.6694362017804154, + "grad_norm": 0.9583803572017214, + "learning_rate": 5.2026903840828864e-06, + "loss": 0.9401, + "step": 3948 + }, + { + "epoch": 0.6696057651547266, + "grad_norm": 0.9195709660329189, + "learning_rate": 5.1978720824892725e-06, + "loss": 0.9599, + "step": 3949 + }, + { + "epoch": 0.6697753285290378, + "grad_norm": 0.9666904386466108, + "learning_rate": 5.193055229472045e-06, + "loss": 0.9435, + "step": 3950 + }, + { + "epoch": 0.6699448919033488, + "grad_norm": 0.9531567204711313, + "learning_rate": 5.1882398264842225e-06, + "loss": 0.9093, + "step": 3951 + }, + { + "epoch": 0.67011445527766, + "grad_norm": 0.9936746290244673, + "learning_rate": 5.1834258749783805e-06, + "loss": 0.9662, + "step": 3952 + }, + { + "epoch": 0.6702840186519712, + "grad_norm": 0.9551629932249377, + "learning_rate": 5.178613376406672e-06, + "loss": 0.9403, + "step": 3953 + }, + { + "epoch": 0.6704535820262824, + "grad_norm": 0.921796117153545, + "learning_rate": 5.173802332220795e-06, + "loss": 0.9393, + "step": 3954 + }, + { + "epoch": 0.6706231454005934, + "grad_norm": 0.9507810627843465, + "learning_rate": 5.168992743872019e-06, + "loss": 0.922, + "step": 3955 + }, + { + "epoch": 0.6707927087749046, + "grad_norm": 0.9554163157405162, + "learning_rate": 5.164184612811164e-06, + "loss": 0.9528, + "step": 3956 + }, + { + "epoch": 0.6709622721492158, + "grad_norm": 0.9974234717396072, + "learning_rate": 5.1593779404886255e-06, + "loss": 0.8856, + "step": 3957 + }, + { + "epoch": 0.671131835523527, + "grad_norm": 0.9699720154322813, + "learning_rate": 5.154572728354349e-06, + "loss": 0.9459, + "step": 3958 + }, + { + "epoch": 0.671301398897838, + "grad_norm": 0.9356105747656451, + "learning_rate": 5.149768977857835e-06, + "loss": 0.921, + "step": 3959 + }, + { + "epoch": 0.6714709622721492, + "grad_norm": 0.9851471475682955, + "learning_rate": 5.144966690448159e-06, + "loss": 0.998, + "step": 3960 + }, + { + "epoch": 0.6716405256464604, + "grad_norm": 0.9585842581837655, + "learning_rate": 5.14016586757394e-06, + "loss": 0.9581, + "step": 3961 + }, + { + "epoch": 0.6718100890207716, + "grad_norm": 0.9518273242639163, + "learning_rate": 5.135366510683361e-06, + "loss": 0.9256, + "step": 3962 + }, + { + "epoch": 0.6719796523950826, + "grad_norm": 1.0145463954503386, + "learning_rate": 5.130568621224162e-06, + "loss": 0.9494, + "step": 3963 + }, + { + "epoch": 0.6721492157693938, + "grad_norm": 0.9948365564700787, + "learning_rate": 5.125772200643643e-06, + "loss": 0.9414, + "step": 3964 + }, + { + "epoch": 0.672318779143705, + "grad_norm": 0.9609231919465913, + "learning_rate": 5.120977250388657e-06, + "loss": 0.9439, + "step": 3965 + }, + { + "epoch": 0.6724883425180161, + "grad_norm": 0.9000482538254113, + "learning_rate": 5.116183771905612e-06, + "loss": 0.9313, + "step": 3966 + }, + { + "epoch": 0.6726579058923272, + "grad_norm": 0.9249819738921866, + "learning_rate": 5.111391766640481e-06, + "loss": 0.9061, + "step": 3967 + }, + { + "epoch": 0.6728274692666384, + "grad_norm": 0.9762897079163759, + "learning_rate": 5.106601236038786e-06, + "loss": 0.9753, + "step": 3968 + }, + { + "epoch": 0.6729970326409496, + "grad_norm": 0.9650549983853628, + "learning_rate": 5.1018121815456045e-06, + "loss": 0.943, + "step": 3969 + }, + { + "epoch": 0.6731665960152607, + "grad_norm": 0.5990811662072381, + "learning_rate": 5.097024604605563e-06, + "loss": 0.7577, + "step": 3970 + }, + { + "epoch": 0.6733361593895718, + "grad_norm": 1.0202747671421386, + "learning_rate": 5.092238506662859e-06, + "loss": 0.9622, + "step": 3971 + }, + { + "epoch": 0.673505722763883, + "grad_norm": 0.9490346582744696, + "learning_rate": 5.087453889161229e-06, + "loss": 0.9248, + "step": 3972 + }, + { + "epoch": 0.6736752861381942, + "grad_norm": 0.94744196292728, + "learning_rate": 5.082670753543961e-06, + "loss": 0.8843, + "step": 3973 + }, + { + "epoch": 0.6738448495125053, + "grad_norm": 0.6213833628055527, + "learning_rate": 5.077889101253914e-06, + "loss": 0.8127, + "step": 3974 + }, + { + "epoch": 0.6740144128868164, + "grad_norm": 0.9796786216879492, + "learning_rate": 5.07310893373348e-06, + "loss": 0.9482, + "step": 3975 + }, + { + "epoch": 0.6741839762611276, + "grad_norm": 0.95104293414298, + "learning_rate": 5.068330252424614e-06, + "loss": 0.972, + "step": 3976 + }, + { + "epoch": 0.6743535396354388, + "grad_norm": 0.9939579073756265, + "learning_rate": 5.063553058768814e-06, + "loss": 0.9247, + "step": 3977 + }, + { + "epoch": 0.6745231030097499, + "grad_norm": 1.0022731705638037, + "learning_rate": 5.058777354207143e-06, + "loss": 0.9692, + "step": 3978 + }, + { + "epoch": 0.674692666384061, + "grad_norm": 0.9878107457283639, + "learning_rate": 5.054003140180204e-06, + "loss": 0.9712, + "step": 3979 + }, + { + "epoch": 0.6748622297583722, + "grad_norm": 1.0084259738394574, + "learning_rate": 5.049230418128153e-06, + "loss": 0.9405, + "step": 3980 + }, + { + "epoch": 0.6750317931326834, + "grad_norm": 1.0003879894729153, + "learning_rate": 5.044459189490694e-06, + "loss": 0.9778, + "step": 3981 + }, + { + "epoch": 0.6752013565069945, + "grad_norm": 0.936963536636467, + "learning_rate": 5.039689455707082e-06, + "loss": 0.9048, + "step": 3982 + }, + { + "epoch": 0.6753709198813056, + "grad_norm": 0.9662145325304197, + "learning_rate": 5.034921218216126e-06, + "loss": 0.9458, + "step": 3983 + }, + { + "epoch": 0.6755404832556168, + "grad_norm": 0.9676933985706532, + "learning_rate": 5.03015447845617e-06, + "loss": 0.9306, + "step": 3984 + }, + { + "epoch": 0.675710046629928, + "grad_norm": 0.9282665005479991, + "learning_rate": 5.025389237865128e-06, + "loss": 0.9175, + "step": 3985 + }, + { + "epoch": 0.6758796100042391, + "grad_norm": 0.9471878284834134, + "learning_rate": 5.020625497880444e-06, + "loss": 0.9411, + "step": 3986 + }, + { + "epoch": 0.6760491733785502, + "grad_norm": 1.0035344099246344, + "learning_rate": 5.0158632599391126e-06, + "loss": 0.9369, + "step": 3987 + }, + { + "epoch": 0.6762187367528614, + "grad_norm": 0.9433056947355806, + "learning_rate": 5.011102525477673e-06, + "loss": 0.9403, + "step": 3988 + }, + { + "epoch": 0.6763883001271725, + "grad_norm": 0.9211897102222225, + "learning_rate": 5.0063432959322265e-06, + "loss": 0.9135, + "step": 3989 + }, + { + "epoch": 0.6765578635014837, + "grad_norm": 0.6354653756596874, + "learning_rate": 5.001585572738403e-06, + "loss": 0.7484, + "step": 3990 + }, + { + "epoch": 0.6767274268757948, + "grad_norm": 0.9494161209946261, + "learning_rate": 4.9968293573313794e-06, + "loss": 0.9472, + "step": 3991 + }, + { + "epoch": 0.676896990250106, + "grad_norm": 0.9497001956446296, + "learning_rate": 4.992074651145892e-06, + "loss": 0.9113, + "step": 3992 + }, + { + "epoch": 0.6770665536244171, + "grad_norm": 0.9489970475232807, + "learning_rate": 4.987321455616206e-06, + "loss": 0.9114, + "step": 3993 + }, + { + "epoch": 0.6772361169987283, + "grad_norm": 0.9450701776997606, + "learning_rate": 4.98256977217614e-06, + "loss": 0.9347, + "step": 3994 + }, + { + "epoch": 0.6774056803730394, + "grad_norm": 0.9462792989384963, + "learning_rate": 4.977819602259048e-06, + "loss": 0.9152, + "step": 3995 + }, + { + "epoch": 0.6775752437473506, + "grad_norm": 0.9323501875106183, + "learning_rate": 4.973070947297841e-06, + "loss": 0.9519, + "step": 3996 + }, + { + "epoch": 0.6777448071216617, + "grad_norm": 0.9319045031764531, + "learning_rate": 4.968323808724962e-06, + "loss": 0.9195, + "step": 3997 + }, + { + "epoch": 0.6779143704959729, + "grad_norm": 0.9739140404015078, + "learning_rate": 4.963578187972399e-06, + "loss": 0.9584, + "step": 3998 + }, + { + "epoch": 0.678083933870284, + "grad_norm": 0.9760266456278085, + "learning_rate": 4.958834086471683e-06, + "loss": 0.9434, + "step": 3999 + }, + { + "epoch": 0.6782534972445952, + "grad_norm": 0.9733085620633999, + "learning_rate": 4.954091505653886e-06, + "loss": 0.941, + "step": 4000 + }, + { + "epoch": 0.6784230606189063, + "grad_norm": 0.9567129090022204, + "learning_rate": 4.9493504469496235e-06, + "loss": 0.9273, + "step": 4001 + }, + { + "epoch": 0.6785926239932175, + "grad_norm": 0.9706521020117985, + "learning_rate": 4.9446109117890454e-06, + "loss": 0.9636, + "step": 4002 + }, + { + "epoch": 0.6787621873675286, + "grad_norm": 0.9499771902042635, + "learning_rate": 4.939872901601853e-06, + "loss": 0.9416, + "step": 4003 + }, + { + "epoch": 0.6789317507418398, + "grad_norm": 0.9816643890586689, + "learning_rate": 4.93513641781728e-06, + "loss": 0.9289, + "step": 4004 + }, + { + "epoch": 0.6791013141161509, + "grad_norm": 0.9132367691792639, + "learning_rate": 4.930401461864099e-06, + "loss": 0.9439, + "step": 4005 + }, + { + "epoch": 0.6792708774904621, + "grad_norm": 0.9285855135124447, + "learning_rate": 4.925668035170622e-06, + "loss": 0.9179, + "step": 4006 + }, + { + "epoch": 0.6794404408647732, + "grad_norm": 0.949273096146644, + "learning_rate": 4.920936139164707e-06, + "loss": 0.9274, + "step": 4007 + }, + { + "epoch": 0.6796100042390844, + "grad_norm": 1.002864137269556, + "learning_rate": 4.9162057752737415e-06, + "loss": 0.9536, + "step": 4008 + }, + { + "epoch": 0.6797795676133955, + "grad_norm": 0.9445691271730213, + "learning_rate": 4.911476944924651e-06, + "loss": 0.9107, + "step": 4009 + }, + { + "epoch": 0.6799491309877067, + "grad_norm": 0.9451835108284893, + "learning_rate": 4.9067496495439095e-06, + "loss": 0.9633, + "step": 4010 + }, + { + "epoch": 0.6801186943620178, + "grad_norm": 0.9507759125434737, + "learning_rate": 4.9020238905575136e-06, + "loss": 0.9548, + "step": 4011 + }, + { + "epoch": 0.680288257736329, + "grad_norm": 0.9403580655731887, + "learning_rate": 4.897299669391006e-06, + "loss": 0.9596, + "step": 4012 + }, + { + "epoch": 0.6804578211106401, + "grad_norm": 0.9322524026187559, + "learning_rate": 4.892576987469456e-06, + "loss": 0.9152, + "step": 4013 + }, + { + "epoch": 0.6806273844849513, + "grad_norm": 0.9988271523318749, + "learning_rate": 4.887855846217483e-06, + "loss": 0.9324, + "step": 4014 + }, + { + "epoch": 0.6807969478592624, + "grad_norm": 1.0105629334783448, + "learning_rate": 4.883136247059231e-06, + "loss": 0.9068, + "step": 4015 + }, + { + "epoch": 0.6809665112335735, + "grad_norm": 0.9664817820568461, + "learning_rate": 4.87841819141838e-06, + "loss": 0.9226, + "step": 4016 + }, + { + "epoch": 0.6811360746078847, + "grad_norm": 0.9537555856169774, + "learning_rate": 4.873701680718146e-06, + "loss": 0.9326, + "step": 4017 + }, + { + "epoch": 0.6813056379821959, + "grad_norm": 0.9350691794900546, + "learning_rate": 4.868986716381279e-06, + "loss": 0.9178, + "step": 4018 + }, + { + "epoch": 0.681475201356507, + "grad_norm": 0.9737372893166677, + "learning_rate": 4.8642732998300575e-06, + "loss": 0.8968, + "step": 4019 + }, + { + "epoch": 0.6816447647308181, + "grad_norm": 0.9431492296282682, + "learning_rate": 4.859561432486307e-06, + "loss": 0.9153, + "step": 4020 + }, + { + "epoch": 0.6818143281051293, + "grad_norm": 0.9570381445780026, + "learning_rate": 4.854851115771373e-06, + "loss": 0.9437, + "step": 4021 + }, + { + "epoch": 0.6819838914794405, + "grad_norm": 0.9815747583401702, + "learning_rate": 4.8501423511061344e-06, + "loss": 0.9667, + "step": 4022 + }, + { + "epoch": 0.6821534548537516, + "grad_norm": 0.948418824259176, + "learning_rate": 4.845435139911006e-06, + "loss": 0.9733, + "step": 4023 + }, + { + "epoch": 0.6823230182280627, + "grad_norm": 0.9547014526082412, + "learning_rate": 4.840729483605927e-06, + "loss": 0.9084, + "step": 4024 + }, + { + "epoch": 0.6824925816023739, + "grad_norm": 0.935289539696172, + "learning_rate": 4.836025383610382e-06, + "loss": 0.9314, + "step": 4025 + }, + { + "epoch": 0.682662144976685, + "grad_norm": 0.940694533779955, + "learning_rate": 4.8313228413433736e-06, + "loss": 0.9207, + "step": 4026 + }, + { + "epoch": 0.6828317083509962, + "grad_norm": 0.934832886052752, + "learning_rate": 4.826621858223431e-06, + "loss": 0.946, + "step": 4027 + }, + { + "epoch": 0.6830012717253073, + "grad_norm": 0.9528890211506568, + "learning_rate": 4.821922435668631e-06, + "loss": 0.9342, + "step": 4028 + }, + { + "epoch": 0.6831708350996185, + "grad_norm": 1.00398965594739, + "learning_rate": 4.817224575096564e-06, + "loss": 0.9447, + "step": 4029 + }, + { + "epoch": 0.6833403984739296, + "grad_norm": 0.9837162458501686, + "learning_rate": 4.812528277924352e-06, + "loss": 0.9289, + "step": 4030 + }, + { + "epoch": 0.6835099618482408, + "grad_norm": 0.9027343515968713, + "learning_rate": 4.807833545568645e-06, + "loss": 0.8989, + "step": 4031 + }, + { + "epoch": 0.6836795252225519, + "grad_norm": 0.9548087488370355, + "learning_rate": 4.803140379445632e-06, + "loss": 0.9549, + "step": 4032 + }, + { + "epoch": 0.6838490885968631, + "grad_norm": 1.0098880536957924, + "learning_rate": 4.798448780971013e-06, + "loss": 0.9635, + "step": 4033 + }, + { + "epoch": 0.6840186519711742, + "grad_norm": 0.9303449195787494, + "learning_rate": 4.793758751560027e-06, + "loss": 0.9243, + "step": 4034 + }, + { + "epoch": 0.6841882153454854, + "grad_norm": 1.005782356068551, + "learning_rate": 4.78907029262743e-06, + "loss": 0.9655, + "step": 4035 + }, + { + "epoch": 0.6843577787197965, + "grad_norm": 0.9410591862548884, + "learning_rate": 4.7843834055875174e-06, + "loss": 0.9415, + "step": 4036 + }, + { + "epoch": 0.6845273420941077, + "grad_norm": 0.9331602591336416, + "learning_rate": 4.779698091854098e-06, + "loss": 0.9266, + "step": 4037 + }, + { + "epoch": 0.6846969054684188, + "grad_norm": 0.9312350157191608, + "learning_rate": 4.775014352840512e-06, + "loss": 0.9085, + "step": 4038 + }, + { + "epoch": 0.68486646884273, + "grad_norm": 0.9552761689816897, + "learning_rate": 4.7703321899596245e-06, + "loss": 0.9194, + "step": 4039 + }, + { + "epoch": 0.6850360322170411, + "grad_norm": 1.0293540654836395, + "learning_rate": 4.765651604623822e-06, + "loss": 0.9643, + "step": 4040 + }, + { + "epoch": 0.6852055955913523, + "grad_norm": 0.9597569051150278, + "learning_rate": 4.7609725982450176e-06, + "loss": 0.9128, + "step": 4041 + }, + { + "epoch": 0.6853751589656634, + "grad_norm": 0.9779729649048823, + "learning_rate": 4.7562951722346454e-06, + "loss": 0.9863, + "step": 4042 + }, + { + "epoch": 0.6855447223399745, + "grad_norm": 1.0350634625397963, + "learning_rate": 4.75161932800367e-06, + "loss": 0.9697, + "step": 4043 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 1.0261218050435719, + "learning_rate": 4.746945066962574e-06, + "loss": 0.9943, + "step": 4044 + }, + { + "epoch": 0.6858838490885969, + "grad_norm": 0.9667322065147755, + "learning_rate": 4.742272390521354e-06, + "loss": 0.9494, + "step": 4045 + }, + { + "epoch": 0.686053412462908, + "grad_norm": 0.9977838482756628, + "learning_rate": 4.7376013000895486e-06, + "loss": 0.9483, + "step": 4046 + }, + { + "epoch": 0.6862229758372191, + "grad_norm": 0.9700314488244718, + "learning_rate": 4.7329317970762e-06, + "loss": 0.9424, + "step": 4047 + }, + { + "epoch": 0.6863925392115303, + "grad_norm": 0.9567437917274273, + "learning_rate": 4.728263882889879e-06, + "loss": 0.8974, + "step": 4048 + }, + { + "epoch": 0.6865621025858415, + "grad_norm": 0.9334437037773666, + "learning_rate": 4.7235975589386715e-06, + "loss": 0.9099, + "step": 4049 + }, + { + "epoch": 0.6867316659601526, + "grad_norm": 0.9219998381798292, + "learning_rate": 4.718932826630197e-06, + "loss": 0.9159, + "step": 4050 + }, + { + "epoch": 0.6869012293344637, + "grad_norm": 0.9995395086385241, + "learning_rate": 4.714269687371581e-06, + "loss": 0.9386, + "step": 4051 + }, + { + "epoch": 0.6870707927087749, + "grad_norm": 0.9716738882772417, + "learning_rate": 4.709608142569474e-06, + "loss": 0.96, + "step": 4052 + }, + { + "epoch": 0.6872403560830861, + "grad_norm": 0.9522885810932386, + "learning_rate": 4.704948193630041e-06, + "loss": 0.9501, + "step": 4053 + }, + { + "epoch": 0.6874099194573972, + "grad_norm": 0.9649104683705396, + "learning_rate": 4.700289841958978e-06, + "loss": 0.9394, + "step": 4054 + }, + { + "epoch": 0.6875794828317083, + "grad_norm": 0.9791398478386767, + "learning_rate": 4.695633088961487e-06, + "loss": 0.9564, + "step": 4055 + }, + { + "epoch": 0.6877490462060195, + "grad_norm": 0.9818424553511681, + "learning_rate": 4.69097793604229e-06, + "loss": 0.9345, + "step": 4056 + }, + { + "epoch": 0.6879186095803307, + "grad_norm": 0.9552428815190235, + "learning_rate": 4.686324384605629e-06, + "loss": 0.9633, + "step": 4057 + }, + { + "epoch": 0.6880881729546418, + "grad_norm": 0.952707154493505, + "learning_rate": 4.681672436055264e-06, + "loss": 0.9428, + "step": 4058 + }, + { + "epoch": 0.6882577363289529, + "grad_norm": 0.9446346804687629, + "learning_rate": 4.677022091794466e-06, + "loss": 0.9384, + "step": 4059 + }, + { + "epoch": 0.6884272997032641, + "grad_norm": 0.9500786158942087, + "learning_rate": 4.672373353226023e-06, + "loss": 0.9169, + "step": 4060 + }, + { + "epoch": 0.6885968630775753, + "grad_norm": 0.9415731167708214, + "learning_rate": 4.667726221752249e-06, + "loss": 0.9384, + "step": 4061 + }, + { + "epoch": 0.6887664264518863, + "grad_norm": 0.9247989773843499, + "learning_rate": 4.66308069877496e-06, + "loss": 0.9167, + "step": 4062 + }, + { + "epoch": 0.6889359898261975, + "grad_norm": 0.9848331758397039, + "learning_rate": 4.65843678569549e-06, + "loss": 0.9381, + "step": 4063 + }, + { + "epoch": 0.6891055532005087, + "grad_norm": 1.0036892412422784, + "learning_rate": 4.653794483914696e-06, + "loss": 0.9735, + "step": 4064 + }, + { + "epoch": 0.6892751165748199, + "grad_norm": 0.9314602082204769, + "learning_rate": 4.649153794832939e-06, + "loss": 0.8917, + "step": 4065 + }, + { + "epoch": 0.689444679949131, + "grad_norm": 0.9659070506247437, + "learning_rate": 4.6445147198500965e-06, + "loss": 0.9307, + "step": 4066 + }, + { + "epoch": 0.6896142433234421, + "grad_norm": 0.9427510745367279, + "learning_rate": 4.639877260365555e-06, + "loss": 0.9347, + "step": 4067 + }, + { + "epoch": 0.6897838066977533, + "grad_norm": 0.9690135435204741, + "learning_rate": 4.6352414177782275e-06, + "loss": 0.9507, + "step": 4068 + }, + { + "epoch": 0.6899533700720645, + "grad_norm": 0.9586304638312999, + "learning_rate": 4.630607193486525e-06, + "loss": 0.9544, + "step": 4069 + }, + { + "epoch": 0.6901229334463755, + "grad_norm": 0.9537173228723301, + "learning_rate": 4.6259745888883715e-06, + "loss": 0.9481, + "step": 4070 + }, + { + "epoch": 0.6902924968206867, + "grad_norm": 1.0011604447237028, + "learning_rate": 4.621343605381215e-06, + "loss": 0.9253, + "step": 4071 + }, + { + "epoch": 0.6904620601949979, + "grad_norm": 0.606128128513411, + "learning_rate": 4.616714244361998e-06, + "loss": 0.7541, + "step": 4072 + }, + { + "epoch": 0.6906316235693091, + "grad_norm": 0.9916724970754378, + "learning_rate": 4.612086507227186e-06, + "loss": 0.9287, + "step": 4073 + }, + { + "epoch": 0.6908011869436201, + "grad_norm": 0.9658308613972185, + "learning_rate": 4.607460395372748e-06, + "loss": 0.9046, + "step": 4074 + }, + { + "epoch": 0.6909707503179313, + "grad_norm": 0.9496397497985856, + "learning_rate": 4.602835910194165e-06, + "loss": 0.9406, + "step": 4075 + }, + { + "epoch": 0.6911403136922425, + "grad_norm": 0.9403396277053561, + "learning_rate": 4.5982130530864246e-06, + "loss": 0.9306, + "step": 4076 + }, + { + "epoch": 0.6913098770665537, + "grad_norm": 1.0089561930508963, + "learning_rate": 4.593591825444028e-06, + "loss": 0.9829, + "step": 4077 + }, + { + "epoch": 0.6914794404408647, + "grad_norm": 0.9573769049109759, + "learning_rate": 4.588972228660978e-06, + "loss": 0.9344, + "step": 4078 + }, + { + "epoch": 0.6916490038151759, + "grad_norm": 0.9961316318289198, + "learning_rate": 4.584354264130798e-06, + "loss": 0.9705, + "step": 4079 + }, + { + "epoch": 0.6918185671894871, + "grad_norm": 0.9538864166686905, + "learning_rate": 4.579737933246507e-06, + "loss": 0.9264, + "step": 4080 + }, + { + "epoch": 0.6919881305637983, + "grad_norm": 1.0594052832439238, + "learning_rate": 4.5751232374006304e-06, + "loss": 0.9723, + "step": 4081 + }, + { + "epoch": 0.6921576939381093, + "grad_norm": 1.0225462666165246, + "learning_rate": 4.570510177985213e-06, + "loss": 0.9365, + "step": 4082 + }, + { + "epoch": 0.6923272573124205, + "grad_norm": 0.979872304974286, + "learning_rate": 4.565898756391797e-06, + "loss": 0.9765, + "step": 4083 + }, + { + "epoch": 0.6924968206867317, + "grad_norm": 0.977345335866861, + "learning_rate": 4.561288974011427e-06, + "loss": 0.9771, + "step": 4084 + }, + { + "epoch": 0.6926663840610429, + "grad_norm": 0.9432590713027864, + "learning_rate": 4.556680832234657e-06, + "loss": 0.919, + "step": 4085 + }, + { + "epoch": 0.6928359474353539, + "grad_norm": 1.0003878223159666, + "learning_rate": 4.552074332451554e-06, + "loss": 0.9459, + "step": 4086 + }, + { + "epoch": 0.6930055108096651, + "grad_norm": 0.9477586608085584, + "learning_rate": 4.547469476051679e-06, + "loss": 0.9342, + "step": 4087 + }, + { + "epoch": 0.6931750741839763, + "grad_norm": 0.9344744856227825, + "learning_rate": 4.5428662644240964e-06, + "loss": 0.9044, + "step": 4088 + }, + { + "epoch": 0.6933446375582875, + "grad_norm": 0.9543062096168511, + "learning_rate": 4.538264698957387e-06, + "loss": 0.9544, + "step": 4089 + }, + { + "epoch": 0.6935142009325985, + "grad_norm": 0.9676621165038252, + "learning_rate": 4.533664781039622e-06, + "loss": 0.9116, + "step": 4090 + }, + { + "epoch": 0.6936837643069097, + "grad_norm": 0.9934313644010448, + "learning_rate": 4.529066512058381e-06, + "loss": 0.9266, + "step": 4091 + }, + { + "epoch": 0.6938533276812209, + "grad_norm": 0.9572559231416885, + "learning_rate": 4.524469893400747e-06, + "loss": 0.9238, + "step": 4092 + }, + { + "epoch": 0.694022891055532, + "grad_norm": 1.0154507875892895, + "learning_rate": 4.519874926453303e-06, + "loss": 0.9527, + "step": 4093 + }, + { + "epoch": 0.6941924544298431, + "grad_norm": 0.962893146822662, + "learning_rate": 4.515281612602134e-06, + "loss": 0.9439, + "step": 4094 + }, + { + "epoch": 0.6943620178041543, + "grad_norm": 0.9918397434038865, + "learning_rate": 4.5106899532328275e-06, + "loss": 0.9712, + "step": 4095 + }, + { + "epoch": 0.6945315811784655, + "grad_norm": 0.9742779924742221, + "learning_rate": 4.506099949730468e-06, + "loss": 0.9478, + "step": 4096 + }, + { + "epoch": 0.6947011445527767, + "grad_norm": 0.9533009545609286, + "learning_rate": 4.501511603479653e-06, + "loss": 0.9148, + "step": 4097 + }, + { + "epoch": 0.6948707079270877, + "grad_norm": 1.0110241294092543, + "learning_rate": 4.496924915864463e-06, + "loss": 0.9688, + "step": 4098 + }, + { + "epoch": 0.6950402713013989, + "grad_norm": 0.9731660788504265, + "learning_rate": 4.492339888268486e-06, + "loss": 0.9285, + "step": 4099 + }, + { + "epoch": 0.6952098346757101, + "grad_norm": 0.9988175650309414, + "learning_rate": 4.487756522074815e-06, + "loss": 0.9361, + "step": 4100 + }, + { + "epoch": 0.6953793980500212, + "grad_norm": 1.0374573746225664, + "learning_rate": 4.483174818666034e-06, + "loss": 0.9341, + "step": 4101 + }, + { + "epoch": 0.6955489614243323, + "grad_norm": 0.9951237006938043, + "learning_rate": 4.478594779424227e-06, + "loss": 0.9262, + "step": 4102 + }, + { + "epoch": 0.6957185247986435, + "grad_norm": 0.9948782227176328, + "learning_rate": 4.474016405730973e-06, + "loss": 0.9455, + "step": 4103 + }, + { + "epoch": 0.6958880881729547, + "grad_norm": 0.9689389842245486, + "learning_rate": 4.46943969896736e-06, + "loss": 0.8939, + "step": 4104 + }, + { + "epoch": 0.6960576515472658, + "grad_norm": 0.9022908704420627, + "learning_rate": 4.4648646605139605e-06, + "loss": 0.9229, + "step": 4105 + }, + { + "epoch": 0.6962272149215769, + "grad_norm": 0.9716613458005243, + "learning_rate": 4.460291291750846e-06, + "loss": 0.9393, + "step": 4106 + }, + { + "epoch": 0.6963967782958881, + "grad_norm": 0.9643208714797908, + "learning_rate": 4.455719594057594e-06, + "loss": 0.9213, + "step": 4107 + }, + { + "epoch": 0.6965663416701993, + "grad_norm": 0.960324433673699, + "learning_rate": 4.4511495688132675e-06, + "loss": 0.9324, + "step": 4108 + }, + { + "epoch": 0.6967359050445104, + "grad_norm": 0.9878668451522663, + "learning_rate": 4.446581217396428e-06, + "loss": 0.9902, + "step": 4109 + }, + { + "epoch": 0.6969054684188215, + "grad_norm": 0.640666123070321, + "learning_rate": 4.4420145411851336e-06, + "loss": 0.7824, + "step": 4110 + }, + { + "epoch": 0.6970750317931327, + "grad_norm": 1.0478185175607173, + "learning_rate": 4.4374495415569344e-06, + "loss": 0.9648, + "step": 4111 + }, + { + "epoch": 0.6972445951674439, + "grad_norm": 0.945031422944879, + "learning_rate": 4.432886219888877e-06, + "loss": 0.9043, + "step": 4112 + }, + { + "epoch": 0.697414158541755, + "grad_norm": 0.9934272627783942, + "learning_rate": 4.428324577557501e-06, + "loss": 0.9643, + "step": 4113 + }, + { + "epoch": 0.6975837219160661, + "grad_norm": 0.9352222250679258, + "learning_rate": 4.423764615938837e-06, + "loss": 0.9139, + "step": 4114 + }, + { + "epoch": 0.6977532852903773, + "grad_norm": 1.0129476589521536, + "learning_rate": 4.419206336408418e-06, + "loss": 0.9602, + "step": 4115 + }, + { + "epoch": 0.6979228486646885, + "grad_norm": 1.0157424247962692, + "learning_rate": 4.414649740341258e-06, + "loss": 0.9897, + "step": 4116 + }, + { + "epoch": 0.6980924120389995, + "grad_norm": 0.9589930630789485, + "learning_rate": 4.410094829111865e-06, + "loss": 0.9466, + "step": 4117 + }, + { + "epoch": 0.6982619754133107, + "grad_norm": 0.9687373417271908, + "learning_rate": 4.405541604094249e-06, + "loss": 0.9523, + "step": 4118 + }, + { + "epoch": 0.6984315387876219, + "grad_norm": 0.923766337216128, + "learning_rate": 4.400990066661901e-06, + "loss": 0.923, + "step": 4119 + }, + { + "epoch": 0.698601102161933, + "grad_norm": 0.9967661891888636, + "learning_rate": 4.396440218187805e-06, + "loss": 0.9403, + "step": 4120 + }, + { + "epoch": 0.6987706655362441, + "grad_norm": 0.9545476718469557, + "learning_rate": 4.391892060044435e-06, + "loss": 0.9319, + "step": 4121 + }, + { + "epoch": 0.6989402289105553, + "grad_norm": 0.9832085814697785, + "learning_rate": 4.387345593603761e-06, + "loss": 0.9283, + "step": 4122 + }, + { + "epoch": 0.6991097922848665, + "grad_norm": 0.9739904929116864, + "learning_rate": 4.382800820237236e-06, + "loss": 0.9331, + "step": 4123 + }, + { + "epoch": 0.6992793556591776, + "grad_norm": 1.0174371909615483, + "learning_rate": 4.378257741315801e-06, + "loss": 0.9467, + "step": 4124 + }, + { + "epoch": 0.6994489190334887, + "grad_norm": 0.9791070899328154, + "learning_rate": 4.373716358209898e-06, + "loss": 0.9342, + "step": 4125 + }, + { + "epoch": 0.6996184824077999, + "grad_norm": 0.947486344414689, + "learning_rate": 4.3691766722894435e-06, + "loss": 0.9343, + "step": 4126 + }, + { + "epoch": 0.6997880457821111, + "grad_norm": 1.057983954370814, + "learning_rate": 4.364638684923848e-06, + "loss": 0.9316, + "step": 4127 + }, + { + "epoch": 0.6999576091564222, + "grad_norm": 0.9518314758532409, + "learning_rate": 4.360102397482008e-06, + "loss": 0.9521, + "step": 4128 + }, + { + "epoch": 0.7001271725307333, + "grad_norm": 1.0085011926152079, + "learning_rate": 4.355567811332311e-06, + "loss": 0.9575, + "step": 4129 + }, + { + "epoch": 0.7002967359050445, + "grad_norm": 0.9389350583902506, + "learning_rate": 4.3510349278426255e-06, + "loss": 0.9143, + "step": 4130 + }, + { + "epoch": 0.7004662992793557, + "grad_norm": 0.9655126671833357, + "learning_rate": 4.346503748380312e-06, + "loss": 0.9268, + "step": 4131 + }, + { + "epoch": 0.7006358626536668, + "grad_norm": 0.9671727341603766, + "learning_rate": 4.341974274312211e-06, + "loss": 0.9534, + "step": 4132 + }, + { + "epoch": 0.7008054260279779, + "grad_norm": 1.0177605408277712, + "learning_rate": 4.337446507004656e-06, + "loss": 0.9637, + "step": 4133 + }, + { + "epoch": 0.7009749894022891, + "grad_norm": 0.9694668859123609, + "learning_rate": 4.332920447823461e-06, + "loss": 0.9556, + "step": 4134 + }, + { + "epoch": 0.7011445527766003, + "grad_norm": 0.939842372498322, + "learning_rate": 4.328396098133921e-06, + "loss": 0.9304, + "step": 4135 + }, + { + "epoch": 0.7013141161509114, + "grad_norm": 0.9698516766666061, + "learning_rate": 4.323873459300827e-06, + "loss": 0.9428, + "step": 4136 + }, + { + "epoch": 0.7014836795252225, + "grad_norm": 1.0131866026257155, + "learning_rate": 4.319352532688444e-06, + "loss": 0.9612, + "step": 4137 + }, + { + "epoch": 0.7016532428995337, + "grad_norm": 0.9213948798612626, + "learning_rate": 4.3148333196605205e-06, + "loss": 0.8951, + "step": 4138 + }, + { + "epoch": 0.7018228062738449, + "grad_norm": 0.9898315603593233, + "learning_rate": 4.31031582158029e-06, + "loss": 0.9471, + "step": 4139 + }, + { + "epoch": 0.701992369648156, + "grad_norm": 1.0281437925468315, + "learning_rate": 4.305800039810475e-06, + "loss": 0.9685, + "step": 4140 + }, + { + "epoch": 0.7021619330224671, + "grad_norm": 0.9653305196770438, + "learning_rate": 4.3012859757132715e-06, + "loss": 0.9578, + "step": 4141 + }, + { + "epoch": 0.7023314963967783, + "grad_norm": 0.9486425375680066, + "learning_rate": 4.296773630650358e-06, + "loss": 0.9582, + "step": 4142 + }, + { + "epoch": 0.7025010597710895, + "grad_norm": 0.9369857148552346, + "learning_rate": 4.292263005982903e-06, + "loss": 0.9233, + "step": 4143 + }, + { + "epoch": 0.7026706231454006, + "grad_norm": 0.9555449712846975, + "learning_rate": 4.287754103071545e-06, + "loss": 0.9455, + "step": 4144 + }, + { + "epoch": 0.7028401865197117, + "grad_norm": 0.8779842152695021, + "learning_rate": 4.283246923276411e-06, + "loss": 0.9273, + "step": 4145 + }, + { + "epoch": 0.7030097498940229, + "grad_norm": 0.9494631264981629, + "learning_rate": 4.278741467957105e-06, + "loss": 0.9258, + "step": 4146 + }, + { + "epoch": 0.703179313268334, + "grad_norm": 0.9620081558373502, + "learning_rate": 4.2742377384727104e-06, + "loss": 0.9199, + "step": 4147 + }, + { + "epoch": 0.7033488766426452, + "grad_norm": 1.0172654363224458, + "learning_rate": 4.26973573618179e-06, + "loss": 0.9502, + "step": 4148 + }, + { + "epoch": 0.7035184400169563, + "grad_norm": 0.9643114529200683, + "learning_rate": 4.265235462442389e-06, + "loss": 0.921, + "step": 4149 + }, + { + "epoch": 0.7036880033912675, + "grad_norm": 0.9698599128661651, + "learning_rate": 4.26073691861202e-06, + "loss": 0.9361, + "step": 4150 + }, + { + "epoch": 0.7038575667655786, + "grad_norm": 0.9616511043361519, + "learning_rate": 4.256240106047695e-06, + "loss": 0.9246, + "step": 4151 + }, + { + "epoch": 0.7040271301398898, + "grad_norm": 1.0370304485116948, + "learning_rate": 4.251745026105886e-06, + "loss": 0.9334, + "step": 4152 + }, + { + "epoch": 0.7041966935142009, + "grad_norm": 0.6341604999148488, + "learning_rate": 4.247251680142542e-06, + "loss": 0.7943, + "step": 4153 + }, + { + "epoch": 0.7043662568885121, + "grad_norm": 0.9366874837174215, + "learning_rate": 4.242760069513103e-06, + "loss": 0.9024, + "step": 4154 + }, + { + "epoch": 0.7045358202628232, + "grad_norm": 0.931439143836383, + "learning_rate": 4.2382701955724724e-06, + "loss": 0.9525, + "step": 4155 + }, + { + "epoch": 0.7047053836371344, + "grad_norm": 0.9322017515907924, + "learning_rate": 4.2337820596750356e-06, + "loss": 0.9011, + "step": 4156 + }, + { + "epoch": 0.7048749470114455, + "grad_norm": 0.9437749659761931, + "learning_rate": 4.2292956631746475e-06, + "loss": 0.9476, + "step": 4157 + }, + { + "epoch": 0.7050445103857567, + "grad_norm": 0.9792416037230323, + "learning_rate": 4.224811007424651e-06, + "loss": 0.9594, + "step": 4158 + }, + { + "epoch": 0.7052140737600678, + "grad_norm": 0.9445075750706137, + "learning_rate": 4.220328093777851e-06, + "loss": 0.9207, + "step": 4159 + }, + { + "epoch": 0.705383637134379, + "grad_norm": 0.9763483957570636, + "learning_rate": 4.215846923586531e-06, + "loss": 0.9357, + "step": 4160 + }, + { + "epoch": 0.7055532005086901, + "grad_norm": 0.9728893824852156, + "learning_rate": 4.211367498202456e-06, + "loss": 0.9892, + "step": 4161 + }, + { + "epoch": 0.7057227638830013, + "grad_norm": 0.9674165888196323, + "learning_rate": 4.206889818976852e-06, + "loss": 0.947, + "step": 4162 + }, + { + "epoch": 0.7058923272573124, + "grad_norm": 1.1758764590361632, + "learning_rate": 4.202413887260427e-06, + "loss": 0.876, + "step": 4163 + }, + { + "epoch": 0.7060618906316236, + "grad_norm": 0.9167617302696355, + "learning_rate": 4.197939704403359e-06, + "loss": 0.9004, + "step": 4164 + }, + { + "epoch": 0.7062314540059347, + "grad_norm": 0.9766285525720196, + "learning_rate": 4.1934672717552986e-06, + "loss": 0.9459, + "step": 4165 + }, + { + "epoch": 0.7064010173802459, + "grad_norm": 0.9314903296113407, + "learning_rate": 4.188996590665369e-06, + "loss": 0.9081, + "step": 4166 + }, + { + "epoch": 0.706570580754557, + "grad_norm": 0.9419043377450395, + "learning_rate": 4.184527662482158e-06, + "loss": 0.9376, + "step": 4167 + }, + { + "epoch": 0.7067401441288682, + "grad_norm": 0.622738579444429, + "learning_rate": 4.180060488553743e-06, + "loss": 0.779, + "step": 4168 + }, + { + "epoch": 0.7069097075031793, + "grad_norm": 1.0304125084468243, + "learning_rate": 4.175595070227655e-06, + "loss": 0.9695, + "step": 4169 + }, + { + "epoch": 0.7070792708774905, + "grad_norm": 0.9758563592519464, + "learning_rate": 4.171131408850901e-06, + "loss": 0.9463, + "step": 4170 + }, + { + "epoch": 0.7072488342518016, + "grad_norm": 1.0184915596143123, + "learning_rate": 4.166669505769954e-06, + "loss": 0.9802, + "step": 4171 + }, + { + "epoch": 0.7074183976261128, + "grad_norm": 0.980196598961895, + "learning_rate": 4.162209362330767e-06, + "loss": 0.925, + "step": 4172 + }, + { + "epoch": 0.7075879610004239, + "grad_norm": 0.9800256385418338, + "learning_rate": 4.157750979878753e-06, + "loss": 0.9414, + "step": 4173 + }, + { + "epoch": 0.707757524374735, + "grad_norm": 0.9418468210967157, + "learning_rate": 4.153294359758797e-06, + "loss": 0.9636, + "step": 4174 + }, + { + "epoch": 0.7079270877490462, + "grad_norm": 0.9314265038957935, + "learning_rate": 4.1488395033152485e-06, + "loss": 0.9554, + "step": 4175 + }, + { + "epoch": 0.7080966511233574, + "grad_norm": 0.9444662071662503, + "learning_rate": 4.144386411891934e-06, + "loss": 0.9226, + "step": 4176 + }, + { + "epoch": 0.7082662144976685, + "grad_norm": 1.0135840109264243, + "learning_rate": 4.13993508683214e-06, + "loss": 0.9648, + "step": 4177 + }, + { + "epoch": 0.7084357778719796, + "grad_norm": 0.9530315918562487, + "learning_rate": 4.135485529478618e-06, + "loss": 0.9318, + "step": 4178 + }, + { + "epoch": 0.7086053412462908, + "grad_norm": 0.9359676306992604, + "learning_rate": 4.131037741173597e-06, + "loss": 0.9474, + "step": 4179 + }, + { + "epoch": 0.708774904620602, + "grad_norm": 0.9746231410435219, + "learning_rate": 4.126591723258763e-06, + "loss": 0.9373, + "step": 4180 + }, + { + "epoch": 0.7089444679949131, + "grad_norm": 0.97016522873043, + "learning_rate": 4.12214747707527e-06, + "loss": 0.9626, + "step": 4181 + }, + { + "epoch": 0.7091140313692242, + "grad_norm": 0.9463430623400254, + "learning_rate": 4.117705003963739e-06, + "loss": 0.9213, + "step": 4182 + }, + { + "epoch": 0.7092835947435354, + "grad_norm": 0.9994084124631185, + "learning_rate": 4.113264305264254e-06, + "loss": 0.951, + "step": 4183 + }, + { + "epoch": 0.7094531581178466, + "grad_norm": 0.9683313041858357, + "learning_rate": 4.108825382316368e-06, + "loss": 0.9449, + "step": 4184 + }, + { + "epoch": 0.7096227214921577, + "grad_norm": 0.9685371183679886, + "learning_rate": 4.1043882364590895e-06, + "loss": 0.8995, + "step": 4185 + }, + { + "epoch": 0.7097922848664688, + "grad_norm": 0.9210548612970206, + "learning_rate": 4.099952869030905e-06, + "loss": 0.9282, + "step": 4186 + }, + { + "epoch": 0.70996184824078, + "grad_norm": 0.9859523775194875, + "learning_rate": 4.095519281369752e-06, + "loss": 0.9412, + "step": 4187 + }, + { + "epoch": 0.7101314116150912, + "grad_norm": 0.954357388180877, + "learning_rate": 4.091087474813037e-06, + "loss": 0.9674, + "step": 4188 + }, + { + "epoch": 0.7103009749894023, + "grad_norm": 0.9756277134438451, + "learning_rate": 4.086657450697623e-06, + "loss": 0.937, + "step": 4189 + }, + { + "epoch": 0.7104705383637134, + "grad_norm": 0.946612553294586, + "learning_rate": 4.082229210359848e-06, + "loss": 0.9051, + "step": 4190 + }, + { + "epoch": 0.7106401017380246, + "grad_norm": 1.001286330392442, + "learning_rate": 4.077802755135501e-06, + "loss": 0.9782, + "step": 4191 + }, + { + "epoch": 0.7108096651123358, + "grad_norm": 0.9896694019979584, + "learning_rate": 4.073378086359834e-06, + "loss": 0.9254, + "step": 4192 + }, + { + "epoch": 0.7109792284866469, + "grad_norm": 1.028395901818606, + "learning_rate": 4.068955205367559e-06, + "loss": 0.9152, + "step": 4193 + }, + { + "epoch": 0.711148791860958, + "grad_norm": 0.9484471453245332, + "learning_rate": 4.064534113492861e-06, + "loss": 0.9222, + "step": 4194 + }, + { + "epoch": 0.7113183552352692, + "grad_norm": 0.9737297671516091, + "learning_rate": 4.060114812069367e-06, + "loss": 0.9466, + "step": 4195 + }, + { + "epoch": 0.7114879186095804, + "grad_norm": 0.9269985021001064, + "learning_rate": 4.055697302430173e-06, + "loss": 0.9091, + "step": 4196 + }, + { + "epoch": 0.7116574819838914, + "grad_norm": 0.9585716454557712, + "learning_rate": 4.051281585907841e-06, + "loss": 0.9549, + "step": 4197 + }, + { + "epoch": 0.7118270453582026, + "grad_norm": 0.9870018512553761, + "learning_rate": 4.0468676638343786e-06, + "loss": 0.914, + "step": 4198 + }, + { + "epoch": 0.7119966087325138, + "grad_norm": 0.9602709908372734, + "learning_rate": 4.0424555375412615e-06, + "loss": 0.9166, + "step": 4199 + }, + { + "epoch": 0.712166172106825, + "grad_norm": 0.9879011009436877, + "learning_rate": 4.038045208359421e-06, + "loss": 0.9411, + "step": 4200 + }, + { + "epoch": 0.712335735481136, + "grad_norm": 1.0062119932861266, + "learning_rate": 4.033636677619242e-06, + "loss": 0.9554, + "step": 4201 + }, + { + "epoch": 0.7125052988554472, + "grad_norm": 0.981331343143804, + "learning_rate": 4.029229946650577e-06, + "loss": 0.9276, + "step": 4202 + }, + { + "epoch": 0.7126748622297584, + "grad_norm": 0.9623811537878895, + "learning_rate": 4.024825016782727e-06, + "loss": 0.9332, + "step": 4203 + }, + { + "epoch": 0.7128444256040696, + "grad_norm": 0.9574647682292474, + "learning_rate": 4.020421889344455e-06, + "loss": 0.9175, + "step": 4204 + }, + { + "epoch": 0.7130139889783806, + "grad_norm": 0.9694909974894934, + "learning_rate": 4.016020565663974e-06, + "loss": 0.9166, + "step": 4205 + }, + { + "epoch": 0.7131835523526918, + "grad_norm": 1.0140401340404703, + "learning_rate": 4.0116210470689574e-06, + "loss": 0.9369, + "step": 4206 + }, + { + "epoch": 0.713353115727003, + "grad_norm": 0.9447651807114108, + "learning_rate": 4.007223334886531e-06, + "loss": 0.9214, + "step": 4207 + }, + { + "epoch": 0.7135226791013141, + "grad_norm": 0.9646063531442947, + "learning_rate": 4.002827430443284e-06, + "loss": 0.9394, + "step": 4208 + }, + { + "epoch": 0.7136922424756252, + "grad_norm": 0.9181795079791082, + "learning_rate": 3.998433335065251e-06, + "loss": 0.9082, + "step": 4209 + }, + { + "epoch": 0.7138618058499364, + "grad_norm": 1.015424388106665, + "learning_rate": 3.994041050077925e-06, + "loss": 0.9399, + "step": 4210 + }, + { + "epoch": 0.7140313692242476, + "grad_norm": 0.9686567633449988, + "learning_rate": 3.989650576806246e-06, + "loss": 0.9188, + "step": 4211 + }, + { + "epoch": 0.7142009325985587, + "grad_norm": 0.9561042222925232, + "learning_rate": 3.985261916574624e-06, + "loss": 0.9397, + "step": 4212 + }, + { + "epoch": 0.7143704959728698, + "grad_norm": 0.9817965420791707, + "learning_rate": 3.980875070706906e-06, + "loss": 0.9324, + "step": 4213 + }, + { + "epoch": 0.714540059347181, + "grad_norm": 0.9603639806927207, + "learning_rate": 3.976490040526394e-06, + "loss": 0.8832, + "step": 4214 + }, + { + "epoch": 0.7147096227214922, + "grad_norm": 1.006585295375615, + "learning_rate": 3.972106827355852e-06, + "loss": 0.9144, + "step": 4215 + }, + { + "epoch": 0.7148791860958033, + "grad_norm": 0.9635161966734009, + "learning_rate": 3.967725432517487e-06, + "loss": 0.9352, + "step": 4216 + }, + { + "epoch": 0.7150487494701144, + "grad_norm": 0.9530313424107987, + "learning_rate": 3.96334585733296e-06, + "loss": 0.9361, + "step": 4217 + }, + { + "epoch": 0.7152183128444256, + "grad_norm": 0.9541843397745873, + "learning_rate": 3.958968103123379e-06, + "loss": 0.9343, + "step": 4218 + }, + { + "epoch": 0.7153878762187368, + "grad_norm": 1.0136209810855443, + "learning_rate": 3.954592171209314e-06, + "loss": 0.9332, + "step": 4219 + }, + { + "epoch": 0.7155574395930479, + "grad_norm": 0.9610256292743388, + "learning_rate": 3.950218062910776e-06, + "loss": 0.9273, + "step": 4220 + }, + { + "epoch": 0.715727002967359, + "grad_norm": 0.9281779762621568, + "learning_rate": 3.9458457795472245e-06, + "loss": 0.9004, + "step": 4221 + }, + { + "epoch": 0.7158965663416702, + "grad_norm": 0.968142929073176, + "learning_rate": 3.941475322437574e-06, + "loss": 0.9198, + "step": 4222 + }, + { + "epoch": 0.7160661297159814, + "grad_norm": 1.0029503939820588, + "learning_rate": 3.937106692900188e-06, + "loss": 0.9197, + "step": 4223 + }, + { + "epoch": 0.7162356930902924, + "grad_norm": 0.9302128805116311, + "learning_rate": 3.932739892252875e-06, + "loss": 0.9122, + "step": 4224 + }, + { + "epoch": 0.7164052564646036, + "grad_norm": 0.978723942302488, + "learning_rate": 3.9283749218128885e-06, + "loss": 0.9306, + "step": 4225 + }, + { + "epoch": 0.7165748198389148, + "grad_norm": 1.01041538109347, + "learning_rate": 3.924011782896944e-06, + "loss": 0.993, + "step": 4226 + }, + { + "epoch": 0.716744383213226, + "grad_norm": 0.955139527247398, + "learning_rate": 3.919650476821192e-06, + "loss": 0.9292, + "step": 4227 + }, + { + "epoch": 0.716913946587537, + "grad_norm": 0.95086886340547, + "learning_rate": 3.9152910049012325e-06, + "loss": 0.9398, + "step": 4228 + }, + { + "epoch": 0.7170835099618482, + "grad_norm": 0.9678073358056275, + "learning_rate": 3.910933368452112e-06, + "loss": 0.9143, + "step": 4229 + }, + { + "epoch": 0.7172530733361594, + "grad_norm": 1.0046590240206252, + "learning_rate": 3.906577568788329e-06, + "loss": 0.943, + "step": 4230 + }, + { + "epoch": 0.7174226367104706, + "grad_norm": 0.9517738882754091, + "learning_rate": 3.902223607223822e-06, + "loss": 0.9481, + "step": 4231 + }, + { + "epoch": 0.7175922000847816, + "grad_norm": 0.9879663326320745, + "learning_rate": 3.897871485071973e-06, + "loss": 0.948, + "step": 4232 + }, + { + "epoch": 0.7177617634590928, + "grad_norm": 0.9564776765823783, + "learning_rate": 3.893521203645618e-06, + "loss": 0.9234, + "step": 4233 + }, + { + "epoch": 0.717931326833404, + "grad_norm": 0.90095485209741, + "learning_rate": 3.889172764257032e-06, + "loss": 0.9339, + "step": 4234 + }, + { + "epoch": 0.7181008902077152, + "grad_norm": 0.9469264043542964, + "learning_rate": 3.884826168217932e-06, + "loss": 0.9494, + "step": 4235 + }, + { + "epoch": 0.7182704535820262, + "grad_norm": 0.9930893579923585, + "learning_rate": 3.88048141683948e-06, + "loss": 0.9401, + "step": 4236 + }, + { + "epoch": 0.7184400169563374, + "grad_norm": 0.9515428483684868, + "learning_rate": 3.8761385114322905e-06, + "loss": 0.9515, + "step": 4237 + }, + { + "epoch": 0.7186095803306486, + "grad_norm": 1.013039131286697, + "learning_rate": 3.87179745330641e-06, + "loss": 0.9402, + "step": 4238 + }, + { + "epoch": 0.7187791437049598, + "grad_norm": 0.9325257726817211, + "learning_rate": 3.86745824377133e-06, + "loss": 0.9082, + "step": 4239 + }, + { + "epoch": 0.7189487070792708, + "grad_norm": 1.0064843016241667, + "learning_rate": 3.8631208841359906e-06, + "loss": 0.9828, + "step": 4240 + }, + { + "epoch": 0.719118270453582, + "grad_norm": 0.9456348616024043, + "learning_rate": 3.858785375708764e-06, + "loss": 0.9031, + "step": 4241 + }, + { + "epoch": 0.7192878338278932, + "grad_norm": 0.9586906255157827, + "learning_rate": 3.854451719797474e-06, + "loss": 0.9074, + "step": 4242 + }, + { + "epoch": 0.7194573972022044, + "grad_norm": 0.9476220560140591, + "learning_rate": 3.850119917709375e-06, + "loss": 0.961, + "step": 4243 + }, + { + "epoch": 0.7196269605765154, + "grad_norm": 0.9793403632134153, + "learning_rate": 3.845789970751177e-06, + "loss": 0.9512, + "step": 4244 + }, + { + "epoch": 0.7197965239508266, + "grad_norm": 1.0151291146686052, + "learning_rate": 3.841461880229016e-06, + "loss": 0.9393, + "step": 4245 + }, + { + "epoch": 0.7199660873251378, + "grad_norm": 1.0175045411665666, + "learning_rate": 3.837135647448475e-06, + "loss": 0.9264, + "step": 4246 + }, + { + "epoch": 0.720135650699449, + "grad_norm": 0.9579048523983019, + "learning_rate": 3.832811273714569e-06, + "loss": 0.9403, + "step": 4247 + }, + { + "epoch": 0.72030521407376, + "grad_norm": 0.9307016658230239, + "learning_rate": 3.82848876033177e-06, + "loss": 0.886, + "step": 4248 + }, + { + "epoch": 0.7204747774480712, + "grad_norm": 0.9564840745789503, + "learning_rate": 3.824168108603971e-06, + "loss": 0.9466, + "step": 4249 + }, + { + "epoch": 0.7206443408223824, + "grad_norm": 1.0285704958044026, + "learning_rate": 3.8198493198345054e-06, + "loss": 0.9437, + "step": 4250 + }, + { + "epoch": 0.7208139041966936, + "grad_norm": 0.9742554495708574, + "learning_rate": 3.815532395326157e-06, + "loss": 0.9629, + "step": 4251 + }, + { + "epoch": 0.7209834675710046, + "grad_norm": 0.9520880060031638, + "learning_rate": 3.8112173363811367e-06, + "loss": 0.9216, + "step": 4252 + }, + { + "epoch": 0.7211530309453158, + "grad_norm": 1.0469652022247202, + "learning_rate": 3.8069041443010924e-06, + "loss": 0.9702, + "step": 4253 + }, + { + "epoch": 0.721322594319627, + "grad_norm": 0.9560958643027575, + "learning_rate": 3.802592820387111e-06, + "loss": 0.9403, + "step": 4254 + }, + { + "epoch": 0.7214921576939382, + "grad_norm": 0.9735100429242013, + "learning_rate": 3.79828336593972e-06, + "loss": 0.931, + "step": 4255 + }, + { + "epoch": 0.7216617210682492, + "grad_norm": 0.9420872246822782, + "learning_rate": 3.7939757822588796e-06, + "loss": 0.9566, + "step": 4256 + }, + { + "epoch": 0.7218312844425604, + "grad_norm": 0.9563350353440808, + "learning_rate": 3.7896700706439826e-06, + "loss": 0.9049, + "step": 4257 + }, + { + "epoch": 0.7220008478168716, + "grad_norm": 0.9483657701006762, + "learning_rate": 3.785366232393861e-06, + "loss": 0.9272, + "step": 4258 + }, + { + "epoch": 0.7221704111911827, + "grad_norm": 0.9590795564816833, + "learning_rate": 3.78106426880678e-06, + "loss": 0.9078, + "step": 4259 + }, + { + "epoch": 0.7223399745654938, + "grad_norm": 0.9680953571066286, + "learning_rate": 3.7767641811804413e-06, + "loss": 0.9159, + "step": 4260 + }, + { + "epoch": 0.722509537939805, + "grad_norm": 0.937566607576362, + "learning_rate": 3.7724659708119737e-06, + "loss": 0.9183, + "step": 4261 + }, + { + "epoch": 0.7226791013141162, + "grad_norm": 0.9755092747352829, + "learning_rate": 3.768169638997954e-06, + "loss": 0.9327, + "step": 4262 + }, + { + "epoch": 0.7228486646884273, + "grad_norm": 1.0074563321805, + "learning_rate": 3.76387518703438e-06, + "loss": 0.9743, + "step": 4263 + }, + { + "epoch": 0.7230182280627384, + "grad_norm": 0.9590026076275152, + "learning_rate": 3.7595826162166816e-06, + "loss": 0.9225, + "step": 4264 + }, + { + "epoch": 0.7231877914370496, + "grad_norm": 0.9419879959385101, + "learning_rate": 3.7552919278397335e-06, + "loss": 0.9102, + "step": 4265 + }, + { + "epoch": 0.7233573548113608, + "grad_norm": 0.9767160352881189, + "learning_rate": 3.7510031231978328e-06, + "loss": 0.9281, + "step": 4266 + }, + { + "epoch": 0.7235269181856719, + "grad_norm": 0.9686765520598654, + "learning_rate": 3.746716203584707e-06, + "loss": 0.9289, + "step": 4267 + }, + { + "epoch": 0.723696481559983, + "grad_norm": 0.9943695123512349, + "learning_rate": 3.742431170293517e-06, + "loss": 0.9585, + "step": 4268 + }, + { + "epoch": 0.7238660449342942, + "grad_norm": 1.0322719004446468, + "learning_rate": 3.738148024616863e-06, + "loss": 0.9282, + "step": 4269 + }, + { + "epoch": 0.7240356083086054, + "grad_norm": 0.9785186357851025, + "learning_rate": 3.7338667678467642e-06, + "loss": 0.904, + "step": 4270 + }, + { + "epoch": 0.7242051716829165, + "grad_norm": 0.918256418346633, + "learning_rate": 3.729587401274677e-06, + "loss": 0.9125, + "step": 4271 + }, + { + "epoch": 0.7243747350572276, + "grad_norm": 0.9468923985214425, + "learning_rate": 3.7253099261914794e-06, + "loss": 0.9429, + "step": 4272 + }, + { + "epoch": 0.7245442984315388, + "grad_norm": 0.948710373599677, + "learning_rate": 3.7210343438874917e-06, + "loss": 0.9176, + "step": 4273 + }, + { + "epoch": 0.72471386180585, + "grad_norm": 0.9867316650313858, + "learning_rate": 3.7167606556524536e-06, + "loss": 0.9773, + "step": 4274 + }, + { + "epoch": 0.7248834251801611, + "grad_norm": 1.005742161312537, + "learning_rate": 3.7124888627755375e-06, + "loss": 0.9656, + "step": 4275 + }, + { + "epoch": 0.7250529885544722, + "grad_norm": 1.0428600549960652, + "learning_rate": 3.7082189665453396e-06, + "loss": 0.9733, + "step": 4276 + }, + { + "epoch": 0.7252225519287834, + "grad_norm": 0.9329611709680067, + "learning_rate": 3.7039509682498887e-06, + "loss": 0.8863, + "step": 4277 + }, + { + "epoch": 0.7253921153030946, + "grad_norm": 0.9162859928138347, + "learning_rate": 3.69968486917664e-06, + "loss": 0.9022, + "step": 4278 + }, + { + "epoch": 0.7255616786774057, + "grad_norm": 0.9246200974283728, + "learning_rate": 3.6954206706124697e-06, + "loss": 0.9187, + "step": 4279 + }, + { + "epoch": 0.7257312420517168, + "grad_norm": 0.9409091019944441, + "learning_rate": 3.691158373843694e-06, + "loss": 0.9192, + "step": 4280 + }, + { + "epoch": 0.725900805426028, + "grad_norm": 0.9543625791461442, + "learning_rate": 3.6868979801560443e-06, + "loss": 0.9281, + "step": 4281 + }, + { + "epoch": 0.7260703688003391, + "grad_norm": 0.9635825528069123, + "learning_rate": 3.6826394908346786e-06, + "loss": 0.9177, + "step": 4282 + }, + { + "epoch": 0.7262399321746503, + "grad_norm": 0.9549329906780929, + "learning_rate": 3.6783829071641886e-06, + "loss": 0.945, + "step": 4283 + }, + { + "epoch": 0.7264094955489614, + "grad_norm": 0.9698765301267126, + "learning_rate": 3.674128230428583e-06, + "loss": 0.9583, + "step": 4284 + }, + { + "epoch": 0.7265790589232726, + "grad_norm": 0.9890742037360992, + "learning_rate": 3.6698754619112974e-06, + "loss": 0.9066, + "step": 4285 + }, + { + "epoch": 0.7267486222975837, + "grad_norm": 0.6041378698757675, + "learning_rate": 3.6656246028951904e-06, + "loss": 0.728, + "step": 4286 + }, + { + "epoch": 0.7269181856718949, + "grad_norm": 0.9745715840706757, + "learning_rate": 3.6613756546625502e-06, + "loss": 0.9257, + "step": 4287 + }, + { + "epoch": 0.727087749046206, + "grad_norm": 0.963416366150616, + "learning_rate": 3.657128618495084e-06, + "loss": 0.9704, + "step": 4288 + }, + { + "epoch": 0.7272573124205172, + "grad_norm": 0.9454908742405014, + "learning_rate": 3.6528834956739224e-06, + "loss": 0.9664, + "step": 4289 + }, + { + "epoch": 0.7274268757948283, + "grad_norm": 0.9834697268432052, + "learning_rate": 3.6486402874796157e-06, + "loss": 0.9351, + "step": 4290 + }, + { + "epoch": 0.7275964391691395, + "grad_norm": 0.9418158876763204, + "learning_rate": 3.6443989951921478e-06, + "loss": 0.9346, + "step": 4291 + }, + { + "epoch": 0.7277660025434506, + "grad_norm": 0.9822541764143594, + "learning_rate": 3.640159620090913e-06, + "loss": 0.9398, + "step": 4292 + }, + { + "epoch": 0.7279355659177618, + "grad_norm": 0.9846264360581913, + "learning_rate": 3.6359221634547324e-06, + "loss": 0.9181, + "step": 4293 + }, + { + "epoch": 0.7281051292920729, + "grad_norm": 0.9562120488176881, + "learning_rate": 3.631686626561849e-06, + "loss": 0.9292, + "step": 4294 + }, + { + "epoch": 0.7282746926663841, + "grad_norm": 1.0002723048846975, + "learning_rate": 3.627453010689922e-06, + "loss": 0.943, + "step": 4295 + }, + { + "epoch": 0.7284442560406952, + "grad_norm": 0.9766585566785402, + "learning_rate": 3.6232213171160368e-06, + "loss": 0.9081, + "step": 4296 + }, + { + "epoch": 0.7286138194150064, + "grad_norm": 0.9863184771995902, + "learning_rate": 3.6189915471166927e-06, + "loss": 0.9567, + "step": 4297 + }, + { + "epoch": 0.7287833827893175, + "grad_norm": 0.9549767986374746, + "learning_rate": 3.6147637019678195e-06, + "loss": 0.9122, + "step": 4298 + }, + { + "epoch": 0.7289529461636286, + "grad_norm": 0.9023533776943351, + "learning_rate": 3.610537782944755e-06, + "loss": 0.9252, + "step": 4299 + }, + { + "epoch": 0.7291225095379398, + "grad_norm": 0.9435239300913615, + "learning_rate": 3.6063137913222578e-06, + "loss": 0.9172, + "step": 4300 + }, + { + "epoch": 0.729292072912251, + "grad_norm": 1.0021638431132271, + "learning_rate": 3.602091728374515e-06, + "loss": 0.9109, + "step": 4301 + }, + { + "epoch": 0.7294616362865621, + "grad_norm": 0.9681790969081856, + "learning_rate": 3.5978715953751207e-06, + "loss": 0.9167, + "step": 4302 + }, + { + "epoch": 0.7296311996608732, + "grad_norm": 0.9625817052513054, + "learning_rate": 3.5936533935970907e-06, + "loss": 0.9039, + "step": 4303 + }, + { + "epoch": 0.7298007630351844, + "grad_norm": 1.007769359857645, + "learning_rate": 3.5894371243128557e-06, + "loss": 0.9442, + "step": 4304 + }, + { + "epoch": 0.7299703264094956, + "grad_norm": 1.002781608466404, + "learning_rate": 3.5852227887942713e-06, + "loss": 0.8953, + "step": 4305 + }, + { + "epoch": 0.7301398897838067, + "grad_norm": 0.9180157018003722, + "learning_rate": 3.5810103883126023e-06, + "loss": 0.8952, + "step": 4306 + }, + { + "epoch": 0.7303094531581178, + "grad_norm": 1.017691205164924, + "learning_rate": 3.576799924138532e-06, + "loss": 0.8931, + "step": 4307 + }, + { + "epoch": 0.730479016532429, + "grad_norm": 0.9558936845782625, + "learning_rate": 3.5725913975421565e-06, + "loss": 0.8853, + "step": 4308 + }, + { + "epoch": 0.7306485799067401, + "grad_norm": 0.9217020223487714, + "learning_rate": 3.5683848097929963e-06, + "loss": 0.906, + "step": 4309 + }, + { + "epoch": 0.7308181432810513, + "grad_norm": 0.9721446511017972, + "learning_rate": 3.564180162159978e-06, + "loss": 0.9314, + "step": 4310 + }, + { + "epoch": 0.7309877066553624, + "grad_norm": 0.9749751793561413, + "learning_rate": 3.5599774559114475e-06, + "loss": 0.9332, + "step": 4311 + }, + { + "epoch": 0.7311572700296736, + "grad_norm": 0.9594564658418394, + "learning_rate": 3.555776692315163e-06, + "loss": 0.9207, + "step": 4312 + }, + { + "epoch": 0.7313268334039847, + "grad_norm": 0.9883754917667471, + "learning_rate": 3.5515778726382967e-06, + "loss": 0.9352, + "step": 4313 + }, + { + "epoch": 0.7314963967782959, + "grad_norm": 0.9671078267484942, + "learning_rate": 3.5473809981474363e-06, + "loss": 0.9198, + "step": 4314 + }, + { + "epoch": 0.731665960152607, + "grad_norm": 0.9401087319210902, + "learning_rate": 3.5431860701085785e-06, + "loss": 0.9106, + "step": 4315 + }, + { + "epoch": 0.7318355235269182, + "grad_norm": 0.6677505396187453, + "learning_rate": 3.5389930897871415e-06, + "loss": 0.7912, + "step": 4316 + }, + { + "epoch": 0.7320050869012293, + "grad_norm": 0.9978618635551892, + "learning_rate": 3.5348020584479492e-06, + "loss": 0.9603, + "step": 4317 + }, + { + "epoch": 0.7321746502755405, + "grad_norm": 0.9597117769790502, + "learning_rate": 3.5306129773552334e-06, + "loss": 0.9308, + "step": 4318 + }, + { + "epoch": 0.7323442136498516, + "grad_norm": 0.9794325778962216, + "learning_rate": 3.526425847772651e-06, + "loss": 0.9511, + "step": 4319 + }, + { + "epoch": 0.7325137770241628, + "grad_norm": 1.0091142931885073, + "learning_rate": 3.5222406709632584e-06, + "loss": 0.9708, + "step": 4320 + }, + { + "epoch": 0.7326833403984739, + "grad_norm": 0.9970407515158157, + "learning_rate": 3.518057448189527e-06, + "loss": 0.9608, + "step": 4321 + }, + { + "epoch": 0.7328529037727851, + "grad_norm": 0.9819706828917782, + "learning_rate": 3.5138761807133346e-06, + "loss": 0.9251, + "step": 4322 + }, + { + "epoch": 0.7330224671470962, + "grad_norm": 0.9785024398292148, + "learning_rate": 3.509696869795981e-06, + "loss": 0.9398, + "step": 4323 + }, + { + "epoch": 0.7331920305214074, + "grad_norm": 0.9769641943557719, + "learning_rate": 3.505519516698165e-06, + "loss": 0.9329, + "step": 4324 + }, + { + "epoch": 0.7333615938957185, + "grad_norm": 0.6443691569076802, + "learning_rate": 3.501344122679995e-06, + "loss": 0.7745, + "step": 4325 + }, + { + "epoch": 0.7335311572700297, + "grad_norm": 0.928221652945086, + "learning_rate": 3.4971706890009906e-06, + "loss": 0.9174, + "step": 4326 + }, + { + "epoch": 0.7337007206443408, + "grad_norm": 1.0141043117888362, + "learning_rate": 3.4929992169200865e-06, + "loss": 0.9342, + "step": 4327 + }, + { + "epoch": 0.733870284018652, + "grad_norm": 1.041849345447714, + "learning_rate": 3.4888297076956167e-06, + "loss": 0.9412, + "step": 4328 + }, + { + "epoch": 0.7340398473929631, + "grad_norm": 0.9647566157432904, + "learning_rate": 3.4846621625853248e-06, + "loss": 0.9232, + "step": 4329 + }, + { + "epoch": 0.7342094107672743, + "grad_norm": 0.9397124380689265, + "learning_rate": 3.4804965828463655e-06, + "loss": 0.9143, + "step": 4330 + }, + { + "epoch": 0.7343789741415854, + "grad_norm": 0.9468661092411789, + "learning_rate": 3.4763329697352976e-06, + "loss": 0.9143, + "step": 4331 + }, + { + "epoch": 0.7345485375158965, + "grad_norm": 0.9595394256736666, + "learning_rate": 3.4721713245080878e-06, + "loss": 0.9203, + "step": 4332 + }, + { + "epoch": 0.7347181008902077, + "grad_norm": 0.9419547132964262, + "learning_rate": 3.4680116484201055e-06, + "loss": 0.9067, + "step": 4333 + }, + { + "epoch": 0.7348876642645189, + "grad_norm": 0.9881132869036805, + "learning_rate": 3.463853942726135e-06, + "loss": 0.9498, + "step": 4334 + }, + { + "epoch": 0.73505722763883, + "grad_norm": 0.9687558685183632, + "learning_rate": 3.459698208680359e-06, + "loss": 0.9306, + "step": 4335 + }, + { + "epoch": 0.7352267910131411, + "grad_norm": 1.009164323084463, + "learning_rate": 3.4555444475363643e-06, + "loss": 0.9037, + "step": 4336 + }, + { + "epoch": 0.7353963543874523, + "grad_norm": 0.9548587415234554, + "learning_rate": 3.4513926605471504e-06, + "loss": 0.9455, + "step": 4337 + }, + { + "epoch": 0.7355659177617635, + "grad_norm": 1.0005783358612659, + "learning_rate": 3.4472428489651134e-06, + "loss": 0.8921, + "step": 4338 + }, + { + "epoch": 0.7357354811360746, + "grad_norm": 0.954040912121246, + "learning_rate": 3.443095014042058e-06, + "loss": 0.9332, + "step": 4339 + }, + { + "epoch": 0.7359050445103857, + "grad_norm": 0.9717284546618798, + "learning_rate": 3.4389491570291868e-06, + "loss": 0.9616, + "step": 4340 + }, + { + "epoch": 0.7360746078846969, + "grad_norm": 0.9043929797325935, + "learning_rate": 3.4348052791771158e-06, + "loss": 0.8906, + "step": 4341 + }, + { + "epoch": 0.7362441712590081, + "grad_norm": 0.952304884315069, + "learning_rate": 3.430663381735857e-06, + "loss": 0.8885, + "step": 4342 + }, + { + "epoch": 0.7364137346333192, + "grad_norm": 0.9803815163570732, + "learning_rate": 3.4265234659548243e-06, + "loss": 0.9525, + "step": 4343 + }, + { + "epoch": 0.7365832980076303, + "grad_norm": 1.0006934965146514, + "learning_rate": 3.422385533082834e-06, + "loss": 0.9686, + "step": 4344 + }, + { + "epoch": 0.7367528613819415, + "grad_norm": 0.9640683932132267, + "learning_rate": 3.4182495843681117e-06, + "loss": 0.9003, + "step": 4345 + }, + { + "epoch": 0.7369224247562527, + "grad_norm": 0.969118230461764, + "learning_rate": 3.414115621058276e-06, + "loss": 0.9339, + "step": 4346 + }, + { + "epoch": 0.7370919881305638, + "grad_norm": 0.9792486919277618, + "learning_rate": 3.4099836444003488e-06, + "loss": 0.962, + "step": 4347 + }, + { + "epoch": 0.7372615515048749, + "grad_norm": 1.0165065595790632, + "learning_rate": 3.405853655640754e-06, + "loss": 0.934, + "step": 4348 + }, + { + "epoch": 0.7374311148791861, + "grad_norm": 0.9976014783003594, + "learning_rate": 3.401725656025315e-06, + "loss": 0.9625, + "step": 4349 + }, + { + "epoch": 0.7376006782534973, + "grad_norm": 0.9967626834634267, + "learning_rate": 3.3975996467992557e-06, + "loss": 0.9482, + "step": 4350 + }, + { + "epoch": 0.7377702416278084, + "grad_norm": 0.9764694595798615, + "learning_rate": 3.3934756292071946e-06, + "loss": 0.9516, + "step": 4351 + }, + { + "epoch": 0.7379398050021195, + "grad_norm": 0.92378917130692, + "learning_rate": 3.389353604493163e-06, + "loss": 0.9097, + "step": 4352 + }, + { + "epoch": 0.7381093683764307, + "grad_norm": 0.9907911076246924, + "learning_rate": 3.385233573900576e-06, + "loss": 0.9098, + "step": 4353 + }, + { + "epoch": 0.7382789317507419, + "grad_norm": 1.0121982766756803, + "learning_rate": 3.3811155386722527e-06, + "loss": 0.9287, + "step": 4354 + }, + { + "epoch": 0.738448495125053, + "grad_norm": 0.9582858963980068, + "learning_rate": 3.3769995000504153e-06, + "loss": 0.9245, + "step": 4355 + }, + { + "epoch": 0.7386180584993641, + "grad_norm": 0.9269626281939402, + "learning_rate": 3.3728854592766768e-06, + "loss": 0.9178, + "step": 4356 + }, + { + "epoch": 0.7387876218736753, + "grad_norm": 0.9667980736317202, + "learning_rate": 3.3687734175920505e-06, + "loss": 0.9325, + "step": 4357 + }, + { + "epoch": 0.7389571852479865, + "grad_norm": 1.0229485136851653, + "learning_rate": 3.3646633762369417e-06, + "loss": 0.9542, + "step": 4358 + }, + { + "epoch": 0.7391267486222975, + "grad_norm": 0.9638447929120565, + "learning_rate": 3.3605553364511643e-06, + "loss": 0.9127, + "step": 4359 + }, + { + "epoch": 0.7392963119966087, + "grad_norm": 1.0390701949317438, + "learning_rate": 3.3564492994739183e-06, + "loss": 0.96, + "step": 4360 + }, + { + "epoch": 0.7394658753709199, + "grad_norm": 0.9798055057484563, + "learning_rate": 3.3523452665438004e-06, + "loss": 0.9894, + "step": 4361 + }, + { + "epoch": 0.7396354387452311, + "grad_norm": 0.9632728259207255, + "learning_rate": 3.348243238898802e-06, + "loss": 0.9473, + "step": 4362 + }, + { + "epoch": 0.7398050021195421, + "grad_norm": 0.9442079991953367, + "learning_rate": 3.344143217776319e-06, + "loss": 0.9505, + "step": 4363 + }, + { + "epoch": 0.7399745654938533, + "grad_norm": 0.9589125261233337, + "learning_rate": 3.3400452044131326e-06, + "loss": 0.926, + "step": 4364 + }, + { + "epoch": 0.7401441288681645, + "grad_norm": 0.9355900636455357, + "learning_rate": 3.3359492000454186e-06, + "loss": 0.9535, + "step": 4365 + }, + { + "epoch": 0.7403136922424757, + "grad_norm": 0.9867424295982432, + "learning_rate": 3.331855205908752e-06, + "loss": 0.9673, + "step": 4366 + }, + { + "epoch": 0.7404832556167867, + "grad_norm": 0.9679867092196275, + "learning_rate": 3.3277632232380953e-06, + "loss": 0.9046, + "step": 4367 + }, + { + "epoch": 0.7406528189910979, + "grad_norm": 0.9672549105450803, + "learning_rate": 3.3236732532678097e-06, + "loss": 0.9501, + "step": 4368 + }, + { + "epoch": 0.7408223823654091, + "grad_norm": 1.004712217809272, + "learning_rate": 3.3195852972316435e-06, + "loss": 0.9737, + "step": 4369 + }, + { + "epoch": 0.7409919457397203, + "grad_norm": 1.0124554558994783, + "learning_rate": 3.315499356362747e-06, + "loss": 0.9587, + "step": 4370 + }, + { + "epoch": 0.7411615091140313, + "grad_norm": 0.9850451662913168, + "learning_rate": 3.311415431893653e-06, + "loss": 0.8806, + "step": 4371 + }, + { + "epoch": 0.7413310724883425, + "grad_norm": 1.0021604429015305, + "learning_rate": 3.3073335250562866e-06, + "loss": 0.9516, + "step": 4372 + }, + { + "epoch": 0.7415006358626537, + "grad_norm": 0.9576189598378064, + "learning_rate": 3.3032536370819746e-06, + "loss": 0.9344, + "step": 4373 + }, + { + "epoch": 0.7416701992369649, + "grad_norm": 0.9641699289310747, + "learning_rate": 3.2991757692014238e-06, + "loss": 0.9244, + "step": 4374 + }, + { + "epoch": 0.7418397626112759, + "grad_norm": 0.9692703716764848, + "learning_rate": 3.2950999226447356e-06, + "loss": 0.9303, + "step": 4375 + }, + { + "epoch": 0.7420093259855871, + "grad_norm": 0.9768981911422421, + "learning_rate": 3.291026098641398e-06, + "loss": 0.927, + "step": 4376 + }, + { + "epoch": 0.7421788893598983, + "grad_norm": 0.9833521428431294, + "learning_rate": 3.2869542984202974e-06, + "loss": 0.903, + "step": 4377 + }, + { + "epoch": 0.7423484527342095, + "grad_norm": 1.039675443732567, + "learning_rate": 3.282884523209704e-06, + "loss": 0.9444, + "step": 4378 + }, + { + "epoch": 0.7425180161085205, + "grad_norm": 0.9851995301371924, + "learning_rate": 3.2788167742372725e-06, + "loss": 0.9246, + "step": 4379 + }, + { + "epoch": 0.7426875794828317, + "grad_norm": 1.0170174640694796, + "learning_rate": 3.2747510527300597e-06, + "loss": 0.9549, + "step": 4380 + }, + { + "epoch": 0.7428571428571429, + "grad_norm": 1.0032140548812238, + "learning_rate": 3.2706873599144973e-06, + "loss": 0.9599, + "step": 4381 + }, + { + "epoch": 0.7430267062314541, + "grad_norm": 0.937123527410746, + "learning_rate": 3.2666256970164135e-06, + "loss": 0.9447, + "step": 4382 + }, + { + "epoch": 0.7431962696057651, + "grad_norm": 0.9618582315698139, + "learning_rate": 3.262566065261015e-06, + "loss": 0.9006, + "step": 4383 + }, + { + "epoch": 0.7433658329800763, + "grad_norm": 0.9928462118917903, + "learning_rate": 3.2585084658729106e-06, + "loss": 0.9398, + "step": 4384 + }, + { + "epoch": 0.7435353963543875, + "grad_norm": 0.9365770827527956, + "learning_rate": 3.254452900076083e-06, + "loss": 0.9341, + "step": 4385 + }, + { + "epoch": 0.7437049597286987, + "grad_norm": 1.006633233963137, + "learning_rate": 3.2503993690939063e-06, + "loss": 0.9507, + "step": 4386 + }, + { + "epoch": 0.7438745231030097, + "grad_norm": 1.0046099838946616, + "learning_rate": 3.2463478741491404e-06, + "loss": 0.9409, + "step": 4387 + }, + { + "epoch": 0.7440440864773209, + "grad_norm": 1.0149787876367944, + "learning_rate": 3.2422984164639306e-06, + "loss": 0.9654, + "step": 4388 + }, + { + "epoch": 0.7442136498516321, + "grad_norm": 0.958802583541351, + "learning_rate": 3.2382509972598087e-06, + "loss": 0.9548, + "step": 4389 + }, + { + "epoch": 0.7443832132259431, + "grad_norm": 0.983243057761316, + "learning_rate": 3.2342056177576865e-06, + "loss": 0.922, + "step": 4390 + }, + { + "epoch": 0.7445527766002543, + "grad_norm": 0.9265538820349005, + "learning_rate": 3.230162279177873e-06, + "loss": 0.9133, + "step": 4391 + }, + { + "epoch": 0.7447223399745655, + "grad_norm": 0.9571164085635515, + "learning_rate": 3.2261209827400497e-06, + "loss": 0.911, + "step": 4392 + }, + { + "epoch": 0.7448919033488767, + "grad_norm": 0.9154398132947973, + "learning_rate": 3.2220817296632845e-06, + "loss": 0.8631, + "step": 4393 + }, + { + "epoch": 0.7450614667231877, + "grad_norm": 0.932430633889728, + "learning_rate": 3.2180445211660294e-06, + "loss": 0.8779, + "step": 4394 + }, + { + "epoch": 0.7452310300974989, + "grad_norm": 0.9597318615952581, + "learning_rate": 3.2140093584661247e-06, + "loss": 0.8876, + "step": 4395 + }, + { + "epoch": 0.7454005934718101, + "grad_norm": 0.958011630252295, + "learning_rate": 3.209976242780788e-06, + "loss": 0.9348, + "step": 4396 + }, + { + "epoch": 0.7455701568461213, + "grad_norm": 0.990831197118341, + "learning_rate": 3.205945175326617e-06, + "loss": 0.9455, + "step": 4397 + }, + { + "epoch": 0.7457397202204323, + "grad_norm": 0.9555986078041983, + "learning_rate": 3.201916157319601e-06, + "loss": 0.9378, + "step": 4398 + }, + { + "epoch": 0.7459092835947435, + "grad_norm": 0.9795793845504193, + "learning_rate": 3.197889189975103e-06, + "loss": 0.8949, + "step": 4399 + }, + { + "epoch": 0.7460788469690547, + "grad_norm": 0.988551161645154, + "learning_rate": 3.1938642745078706e-06, + "loss": 0.9074, + "step": 4400 + }, + { + "epoch": 0.7462484103433659, + "grad_norm": 1.0076510925634532, + "learning_rate": 3.1898414121320277e-06, + "loss": 0.9442, + "step": 4401 + }, + { + "epoch": 0.7464179737176769, + "grad_norm": 0.9890738866660633, + "learning_rate": 3.1858206040610883e-06, + "loss": 0.941, + "step": 4402 + }, + { + "epoch": 0.7465875370919881, + "grad_norm": 1.0312736010415136, + "learning_rate": 3.1818018515079396e-06, + "loss": 0.9422, + "step": 4403 + }, + { + "epoch": 0.7467571004662993, + "grad_norm": 1.0082089707531834, + "learning_rate": 3.1777851556848494e-06, + "loss": 0.9879, + "step": 4404 + }, + { + "epoch": 0.7469266638406105, + "grad_norm": 0.959407068923717, + "learning_rate": 3.173770517803467e-06, + "loss": 0.9382, + "step": 4405 + }, + { + "epoch": 0.7470962272149215, + "grad_norm": 0.9820399788517585, + "learning_rate": 3.1697579390748202e-06, + "loss": 0.9691, + "step": 4406 + }, + { + "epoch": 0.7472657905892327, + "grad_norm": 0.9653437690097237, + "learning_rate": 3.1657474207093144e-06, + "loss": 0.951, + "step": 4407 + }, + { + "epoch": 0.7474353539635439, + "grad_norm": 0.9852611448743962, + "learning_rate": 3.1617389639167316e-06, + "loss": 0.9477, + "step": 4408 + }, + { + "epoch": 0.7476049173378551, + "grad_norm": 1.0209220148025913, + "learning_rate": 3.1577325699062424e-06, + "loss": 0.9281, + "step": 4409 + }, + { + "epoch": 0.7477744807121661, + "grad_norm": 1.027427834225856, + "learning_rate": 3.1537282398863823e-06, + "loss": 0.9731, + "step": 4410 + }, + { + "epoch": 0.7479440440864773, + "grad_norm": 0.9876472299520029, + "learning_rate": 3.149725975065072e-06, + "loss": 0.9392, + "step": 4411 + }, + { + "epoch": 0.7481136074607885, + "grad_norm": 0.975123127162456, + "learning_rate": 3.145725776649602e-06, + "loss": 0.9467, + "step": 4412 + }, + { + "epoch": 0.7482831708350997, + "grad_norm": 1.0081454770458542, + "learning_rate": 3.1417276458466515e-06, + "loss": 0.9249, + "step": 4413 + }, + { + "epoch": 0.7484527342094107, + "grad_norm": 0.9467673110976009, + "learning_rate": 3.137731583862266e-06, + "loss": 0.9097, + "step": 4414 + }, + { + "epoch": 0.7486222975837219, + "grad_norm": 0.979814013393843, + "learning_rate": 3.133737591901864e-06, + "loss": 0.943, + "step": 4415 + }, + { + "epoch": 0.7487918609580331, + "grad_norm": 0.9629232998427483, + "learning_rate": 3.1297456711702532e-06, + "loss": 0.9821, + "step": 4416 + }, + { + "epoch": 0.7489614243323442, + "grad_norm": 0.9658686505250396, + "learning_rate": 3.125755822871607e-06, + "loss": 0.9692, + "step": 4417 + }, + { + "epoch": 0.7491309877066553, + "grad_norm": 1.0121199834544206, + "learning_rate": 3.1217680482094726e-06, + "loss": 0.9376, + "step": 4418 + }, + { + "epoch": 0.7493005510809665, + "grad_norm": 0.981257171121615, + "learning_rate": 3.117782348386772e-06, + "loss": 0.9243, + "step": 4419 + }, + { + "epoch": 0.7494701144552777, + "grad_norm": 0.9032061969602382, + "learning_rate": 3.11379872460581e-06, + "loss": 0.8745, + "step": 4420 + }, + { + "epoch": 0.7496396778295888, + "grad_norm": 0.9944265657051083, + "learning_rate": 3.1098171780682553e-06, + "loss": 0.9549, + "step": 4421 + }, + { + "epoch": 0.7498092412038999, + "grad_norm": 1.0039228848355548, + "learning_rate": 3.1058377099751537e-06, + "loss": 0.9578, + "step": 4422 + }, + { + "epoch": 0.7499788045782111, + "grad_norm": 1.0090429182119505, + "learning_rate": 3.101860321526924e-06, + "loss": 0.9465, + "step": 4423 + }, + { + "epoch": 0.7501483679525223, + "grad_norm": 1.0280748581628925, + "learning_rate": 3.0978850139233576e-06, + "loss": 0.934, + "step": 4424 + }, + { + "epoch": 0.7503179313268334, + "grad_norm": 1.013815393650934, + "learning_rate": 3.093911788363617e-06, + "loss": 0.9513, + "step": 4425 + }, + { + "epoch": 0.7504874947011445, + "grad_norm": 0.9234252456657739, + "learning_rate": 3.0899406460462354e-06, + "loss": 0.8779, + "step": 4426 + }, + { + "epoch": 0.7506570580754557, + "grad_norm": 0.9971617101171185, + "learning_rate": 3.0859715881691267e-06, + "loss": 0.9288, + "step": 4427 + }, + { + "epoch": 0.7508266214497669, + "grad_norm": 0.9951462303327652, + "learning_rate": 3.0820046159295647e-06, + "loss": 0.9497, + "step": 4428 + }, + { + "epoch": 0.750996184824078, + "grad_norm": 0.9692073969744446, + "learning_rate": 3.078039730524198e-06, + "loss": 0.9156, + "step": 4429 + }, + { + "epoch": 0.7511657481983891, + "grad_norm": 0.9639410091774172, + "learning_rate": 3.074076933149046e-06, + "loss": 0.9253, + "step": 4430 + }, + { + "epoch": 0.7513353115727003, + "grad_norm": 0.9357625059899958, + "learning_rate": 3.070116224999502e-06, + "loss": 0.8836, + "step": 4431 + }, + { + "epoch": 0.7515048749470115, + "grad_norm": 0.9746292810118539, + "learning_rate": 3.0661576072703247e-06, + "loss": 0.9517, + "step": 4432 + }, + { + "epoch": 0.7516744383213226, + "grad_norm": 0.9669739131779993, + "learning_rate": 3.062201081155637e-06, + "loss": 0.9173, + "step": 4433 + }, + { + "epoch": 0.7518440016956337, + "grad_norm": 1.0015131390982068, + "learning_rate": 3.0582466478489457e-06, + "loss": 0.9287, + "step": 4434 + }, + { + "epoch": 0.7520135650699449, + "grad_norm": 0.9830748487213814, + "learning_rate": 3.0542943085431144e-06, + "loss": 0.9329, + "step": 4435 + }, + { + "epoch": 0.752183128444256, + "grad_norm": 0.9613027760069434, + "learning_rate": 3.050344064430377e-06, + "loss": 0.9228, + "step": 4436 + }, + { + "epoch": 0.7523526918185672, + "grad_norm": 0.9626850124706385, + "learning_rate": 3.0463959167023336e-06, + "loss": 0.926, + "step": 4437 + }, + { + "epoch": 0.7525222551928783, + "grad_norm": 0.9394719976511217, + "learning_rate": 3.0424498665499613e-06, + "loss": 0.9251, + "step": 4438 + }, + { + "epoch": 0.7526918185671895, + "grad_norm": 1.0143082597357949, + "learning_rate": 3.0385059151635953e-06, + "loss": 0.9216, + "step": 4439 + }, + { + "epoch": 0.7528613819415007, + "grad_norm": 0.9556521437289349, + "learning_rate": 3.03456406373294e-06, + "loss": 0.9072, + "step": 4440 + }, + { + "epoch": 0.7530309453158118, + "grad_norm": 0.9923594703104305, + "learning_rate": 3.0306243134470668e-06, + "loss": 0.9492, + "step": 4441 + }, + { + "epoch": 0.7532005086901229, + "grad_norm": 1.0023781787885582, + "learning_rate": 3.026686665494414e-06, + "loss": 0.9592, + "step": 4442 + }, + { + "epoch": 0.7533700720644341, + "grad_norm": 1.0161754933638931, + "learning_rate": 3.0227511210627835e-06, + "loss": 0.9585, + "step": 4443 + }, + { + "epoch": 0.7535396354387452, + "grad_norm": 0.6646062478073941, + "learning_rate": 3.0188176813393433e-06, + "loss": 0.7528, + "step": 4444 + }, + { + "epoch": 0.7537091988130564, + "grad_norm": 0.9896821998226273, + "learning_rate": 3.0148863475106315e-06, + "loss": 0.946, + "step": 4445 + }, + { + "epoch": 0.7538787621873675, + "grad_norm": 0.9628167268998511, + "learning_rate": 3.0109571207625443e-06, + "loss": 0.9431, + "step": 4446 + }, + { + "epoch": 0.7540483255616787, + "grad_norm": 0.9820762405663909, + "learning_rate": 3.0070300022803454e-06, + "loss": 0.927, + "step": 4447 + }, + { + "epoch": 0.7542178889359898, + "grad_norm": 0.9994330736429512, + "learning_rate": 3.003104993248658e-06, + "loss": 0.9911, + "step": 4448 + }, + { + "epoch": 0.754387452310301, + "grad_norm": 0.981153701673931, + "learning_rate": 2.9991820948514795e-06, + "loss": 0.9124, + "step": 4449 + }, + { + "epoch": 0.7545570156846121, + "grad_norm": 0.9291673390802457, + "learning_rate": 2.9952613082721616e-06, + "loss": 0.9062, + "step": 4450 + }, + { + "epoch": 0.7547265790589233, + "grad_norm": 0.9346902629200073, + "learning_rate": 2.991342634693417e-06, + "loss": 0.9149, + "step": 4451 + }, + { + "epoch": 0.7548961424332344, + "grad_norm": 0.9422028183335565, + "learning_rate": 2.987426075297333e-06, + "loss": 0.9322, + "step": 4452 + }, + { + "epoch": 0.7550657058075456, + "grad_norm": 0.9339476139004957, + "learning_rate": 2.9835116312653477e-06, + "loss": 0.9297, + "step": 4453 + }, + { + "epoch": 0.7552352691818567, + "grad_norm": 0.9522436010519485, + "learning_rate": 2.9795993037782657e-06, + "loss": 0.9324, + "step": 4454 + }, + { + "epoch": 0.7554048325561679, + "grad_norm": 0.9685702584475706, + "learning_rate": 2.9756890940162476e-06, + "loss": 0.9415, + "step": 4455 + }, + { + "epoch": 0.755574395930479, + "grad_norm": 0.9460152176136803, + "learning_rate": 2.971781003158828e-06, + "loss": 0.9229, + "step": 4456 + }, + { + "epoch": 0.7557439593047902, + "grad_norm": 0.9239955569796248, + "learning_rate": 2.9678750323848893e-06, + "loss": 0.9053, + "step": 4457 + }, + { + "epoch": 0.7559135226791013, + "grad_norm": 0.9553112039641406, + "learning_rate": 2.9639711828726813e-06, + "loss": 0.9176, + "step": 4458 + }, + { + "epoch": 0.7560830860534125, + "grad_norm": 1.032655611523563, + "learning_rate": 2.960069455799811e-06, + "loss": 0.9821, + "step": 4459 + }, + { + "epoch": 0.7562526494277236, + "grad_norm": 0.929450690151674, + "learning_rate": 2.956169852343247e-06, + "loss": 0.8992, + "step": 4460 + }, + { + "epoch": 0.7564222128020348, + "grad_norm": 0.9461703354141494, + "learning_rate": 2.952272373679316e-06, + "loss": 0.9086, + "step": 4461 + }, + { + "epoch": 0.7565917761763459, + "grad_norm": 0.9472856658745393, + "learning_rate": 2.9483770209836993e-06, + "loss": 0.908, + "step": 4462 + }, + { + "epoch": 0.756761339550657, + "grad_norm": 0.9719929041670496, + "learning_rate": 2.9444837954314508e-06, + "loss": 0.9422, + "step": 4463 + }, + { + "epoch": 0.7569309029249682, + "grad_norm": 1.003345120438624, + "learning_rate": 2.94059269819697e-06, + "loss": 0.9498, + "step": 4464 + }, + { + "epoch": 0.7571004662992794, + "grad_norm": 0.9955550428003392, + "learning_rate": 2.936703730454017e-06, + "loss": 0.9364, + "step": 4465 + }, + { + "epoch": 0.7572700296735905, + "grad_norm": 0.9937816179963287, + "learning_rate": 2.9328168933757085e-06, + "loss": 0.9035, + "step": 4466 + }, + { + "epoch": 0.7574395930479016, + "grad_norm": 0.9583681910524312, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.9307, + "step": 4467 + }, + { + "epoch": 0.7576091564222128, + "grad_norm": 0.9303540279982168, + "learning_rate": 2.9250496159023e-06, + "loss": 0.9211, + "step": 4468 + }, + { + "epoch": 0.757778719796524, + "grad_norm": 0.9731788428952017, + "learning_rate": 2.9211691778502173e-06, + "loss": 0.9674, + "step": 4469 + }, + { + "epoch": 0.7579482831708351, + "grad_norm": 0.9984079188959125, + "learning_rate": 2.9172908751488292e-06, + "loss": 0.9464, + "step": 4470 + }, + { + "epoch": 0.7581178465451462, + "grad_norm": 0.9555255131406994, + "learning_rate": 2.9134147089680353e-06, + "loss": 0.9372, + "step": 4471 + }, + { + "epoch": 0.7582874099194574, + "grad_norm": 1.0057650558792657, + "learning_rate": 2.909540680477092e-06, + "loss": 0.955, + "step": 4472 + }, + { + "epoch": 0.7584569732937686, + "grad_norm": 0.9732212980932544, + "learning_rate": 2.90566879084461e-06, + "loss": 0.9337, + "step": 4473 + }, + { + "epoch": 0.7586265366680797, + "grad_norm": 0.9647999807636803, + "learning_rate": 2.901799041238561e-06, + "loss": 0.918, + "step": 4474 + }, + { + "epoch": 0.7587961000423908, + "grad_norm": 0.9664013894980109, + "learning_rate": 2.897931432826263e-06, + "loss": 0.9644, + "step": 4475 + }, + { + "epoch": 0.758965663416702, + "grad_norm": 0.9401404986411364, + "learning_rate": 2.8940659667743943e-06, + "loss": 0.9109, + "step": 4476 + }, + { + "epoch": 0.7591352267910132, + "grad_norm": 0.9673802829810737, + "learning_rate": 2.890202644248983e-06, + "loss": 0.9372, + "step": 4477 + }, + { + "epoch": 0.7593047901653243, + "grad_norm": 0.9411434275538323, + "learning_rate": 2.886341466415412e-06, + "loss": 0.9395, + "step": 4478 + }, + { + "epoch": 0.7594743535396354, + "grad_norm": 0.9611038043973206, + "learning_rate": 2.8824824344384174e-06, + "loss": 0.9175, + "step": 4479 + }, + { + "epoch": 0.7596439169139466, + "grad_norm": 0.943049025701545, + "learning_rate": 2.878625549482084e-06, + "loss": 0.9417, + "step": 4480 + }, + { + "epoch": 0.7598134802882577, + "grad_norm": 0.9665486758200093, + "learning_rate": 2.8747708127098593e-06, + "loss": 0.9615, + "step": 4481 + }, + { + "epoch": 0.7599830436625689, + "grad_norm": 1.01981147336692, + "learning_rate": 2.8709182252845347e-06, + "loss": 0.9763, + "step": 4482 + }, + { + "epoch": 0.76015260703688, + "grad_norm": 0.9835568993674455, + "learning_rate": 2.8670677883682527e-06, + "loss": 0.9281, + "step": 4483 + }, + { + "epoch": 0.7603221704111912, + "grad_norm": 0.9280438886337239, + "learning_rate": 2.8632195031225073e-06, + "loss": 0.8887, + "step": 4484 + }, + { + "epoch": 0.7604917337855023, + "grad_norm": 0.9704729739095292, + "learning_rate": 2.8593733707081516e-06, + "loss": 0.9188, + "step": 4485 + }, + { + "epoch": 0.7606612971598135, + "grad_norm": 0.9792503927390757, + "learning_rate": 2.85552939228538e-06, + "loss": 0.9045, + "step": 4486 + }, + { + "epoch": 0.7608308605341246, + "grad_norm": 0.9791749164475829, + "learning_rate": 2.851687569013737e-06, + "loss": 0.9218, + "step": 4487 + }, + { + "epoch": 0.7610004239084358, + "grad_norm": 0.9591203111043733, + "learning_rate": 2.8478479020521255e-06, + "loss": 0.9319, + "step": 4488 + }, + { + "epoch": 0.7611699872827469, + "grad_norm": 0.9752287848324088, + "learning_rate": 2.8440103925587904e-06, + "loss": 0.9497, + "step": 4489 + }, + { + "epoch": 0.761339550657058, + "grad_norm": 0.9973850629474763, + "learning_rate": 2.8401750416913275e-06, + "loss": 0.9488, + "step": 4490 + }, + { + "epoch": 0.7615091140313692, + "grad_norm": 0.9677755079847618, + "learning_rate": 2.83634185060668e-06, + "loss": 0.9063, + "step": 4491 + }, + { + "epoch": 0.7616786774056804, + "grad_norm": 0.5965392855131872, + "learning_rate": 2.832510820461146e-06, + "loss": 0.7169, + "step": 4492 + }, + { + "epoch": 0.7618482407799915, + "grad_norm": 0.9377319170330488, + "learning_rate": 2.8286819524103657e-06, + "loss": 0.8801, + "step": 4493 + }, + { + "epoch": 0.7620178041543026, + "grad_norm": 0.9706974634840667, + "learning_rate": 2.824855247609328e-06, + "loss": 0.9354, + "step": 4494 + }, + { + "epoch": 0.7621873675286138, + "grad_norm": 0.9938959904393707, + "learning_rate": 2.82103070721237e-06, + "loss": 0.9341, + "step": 4495 + }, + { + "epoch": 0.762356930902925, + "grad_norm": 0.9635217182667041, + "learning_rate": 2.817208332373177e-06, + "loss": 0.9471, + "step": 4496 + }, + { + "epoch": 0.7625264942772361, + "grad_norm": 0.9607075257366691, + "learning_rate": 2.813388124244778e-06, + "loss": 0.981, + "step": 4497 + }, + { + "epoch": 0.7626960576515472, + "grad_norm": 1.0329341316859268, + "learning_rate": 2.809570083979548e-06, + "loss": 0.9629, + "step": 4498 + }, + { + "epoch": 0.7628656210258584, + "grad_norm": 0.9719064531587033, + "learning_rate": 2.805754212729218e-06, + "loss": 0.9281, + "step": 4499 + }, + { + "epoch": 0.7630351844001696, + "grad_norm": 1.0690122655341157, + "learning_rate": 2.8019405116448516e-06, + "loss": 0.9309, + "step": 4500 + }, + { + "epoch": 0.7632047477744807, + "grad_norm": 1.0124808581623521, + "learning_rate": 2.798128981876864e-06, + "loss": 0.9331, + "step": 4501 + }, + { + "epoch": 0.7633743111487918, + "grad_norm": 0.9711883662935386, + "learning_rate": 2.7943196245750127e-06, + "loss": 0.8796, + "step": 4502 + }, + { + "epoch": 0.763543874523103, + "grad_norm": 0.961905607833146, + "learning_rate": 2.7905124408884076e-06, + "loss": 0.9203, + "step": 4503 + }, + { + "epoch": 0.7637134378974142, + "grad_norm": 1.0261005248638975, + "learning_rate": 2.786707431965493e-06, + "loss": 0.9263, + "step": 4504 + }, + { + "epoch": 0.7638830012717253, + "grad_norm": 0.944283608863925, + "learning_rate": 2.7829045989540594e-06, + "loss": 0.9547, + "step": 4505 + }, + { + "epoch": 0.7640525646460364, + "grad_norm": 0.9921116543325004, + "learning_rate": 2.779103943001248e-06, + "loss": 0.9296, + "step": 4506 + }, + { + "epoch": 0.7642221280203476, + "grad_norm": 0.9915669348823127, + "learning_rate": 2.775305465253536e-06, + "loss": 0.9147, + "step": 4507 + }, + { + "epoch": 0.7643916913946588, + "grad_norm": 1.0228704800265054, + "learning_rate": 2.771509166856745e-06, + "loss": 0.9081, + "step": 4508 + }, + { + "epoch": 0.7645612547689699, + "grad_norm": 0.9755221705155481, + "learning_rate": 2.7677150489560378e-06, + "loss": 0.9193, + "step": 4509 + }, + { + "epoch": 0.764730818143281, + "grad_norm": 0.9961639819622458, + "learning_rate": 2.7639231126959264e-06, + "loss": 0.937, + "step": 4510 + }, + { + "epoch": 0.7649003815175922, + "grad_norm": 1.0185472703407354, + "learning_rate": 2.7601333592202583e-06, + "loss": 0.9195, + "step": 4511 + }, + { + "epoch": 0.7650699448919034, + "grad_norm": 1.0135888117345444, + "learning_rate": 2.7563457896722225e-06, + "loss": 0.9562, + "step": 4512 + }, + { + "epoch": 0.7652395082662145, + "grad_norm": 0.9179687678155602, + "learning_rate": 2.7525604051943512e-06, + "loss": 0.9355, + "step": 4513 + }, + { + "epoch": 0.7654090716405256, + "grad_norm": 1.0091615158550655, + "learning_rate": 2.7487772069285166e-06, + "loss": 0.9503, + "step": 4514 + }, + { + "epoch": 0.7655786350148368, + "grad_norm": 0.9669593360718143, + "learning_rate": 2.7449961960159333e-06, + "loss": 0.9814, + "step": 4515 + }, + { + "epoch": 0.765748198389148, + "grad_norm": 0.9424567334266846, + "learning_rate": 2.7412173735971514e-06, + "loss": 0.9465, + "step": 4516 + }, + { + "epoch": 0.765917761763459, + "grad_norm": 0.9589192516992819, + "learning_rate": 2.7374407408120685e-06, + "loss": 0.9188, + "step": 4517 + }, + { + "epoch": 0.7660873251377702, + "grad_norm": 0.9973797361005567, + "learning_rate": 2.7336662987999164e-06, + "loss": 0.9261, + "step": 4518 + }, + { + "epoch": 0.7662568885120814, + "grad_norm": 0.9772477975643391, + "learning_rate": 2.7298940486992654e-06, + "loss": 0.9753, + "step": 4519 + }, + { + "epoch": 0.7664264518863926, + "grad_norm": 0.5890005498228047, + "learning_rate": 2.726123991648024e-06, + "loss": 0.7457, + "step": 4520 + }, + { + "epoch": 0.7665960152607036, + "grad_norm": 0.9896026548986693, + "learning_rate": 2.7223561287834467e-06, + "loss": 0.9302, + "step": 4521 + }, + { + "epoch": 0.7667655786350148, + "grad_norm": 1.0089883936550648, + "learning_rate": 2.7185904612421177e-06, + "loss": 0.973, + "step": 4522 + }, + { + "epoch": 0.766935142009326, + "grad_norm": 0.9334218459707342, + "learning_rate": 2.714826990159959e-06, + "loss": 0.9246, + "step": 4523 + }, + { + "epoch": 0.7671047053836372, + "grad_norm": 1.0114123598581302, + "learning_rate": 2.71106571667224e-06, + "loss": 0.9409, + "step": 4524 + }, + { + "epoch": 0.7672742687579482, + "grad_norm": 0.9654753150562002, + "learning_rate": 2.707306641913556e-06, + "loss": 0.9256, + "step": 4525 + }, + { + "epoch": 0.7674438321322594, + "grad_norm": 0.9730472901939335, + "learning_rate": 2.7035497670178447e-06, + "loss": 0.9659, + "step": 4526 + }, + { + "epoch": 0.7676133955065706, + "grad_norm": 0.9439084239167176, + "learning_rate": 2.6997950931183736e-06, + "loss": 0.9327, + "step": 4527 + }, + { + "epoch": 0.7677829588808818, + "grad_norm": 0.9694370175820768, + "learning_rate": 2.6960426213477587e-06, + "loss": 0.89, + "step": 4528 + }, + { + "epoch": 0.7679525222551928, + "grad_norm": 1.0420301073793403, + "learning_rate": 2.692292352837942e-06, + "loss": 0.9326, + "step": 4529 + }, + { + "epoch": 0.768122085629504, + "grad_norm": 0.9770183369404557, + "learning_rate": 2.688544288720202e-06, + "loss": 0.9303, + "step": 4530 + }, + { + "epoch": 0.7682916490038152, + "grad_norm": 0.9626644375355055, + "learning_rate": 2.684798430125154e-06, + "loss": 0.8655, + "step": 4531 + }, + { + "epoch": 0.7684612123781264, + "grad_norm": 0.949320915318544, + "learning_rate": 2.681054778182748e-06, + "loss": 0.9181, + "step": 4532 + }, + { + "epoch": 0.7686307757524374, + "grad_norm": 0.9806287327379162, + "learning_rate": 2.6773133340222677e-06, + "loss": 0.9476, + "step": 4533 + }, + { + "epoch": 0.7688003391267486, + "grad_norm": 1.0003800675508434, + "learning_rate": 2.673574098772328e-06, + "loss": 0.9088, + "step": 4534 + }, + { + "epoch": 0.7689699025010598, + "grad_norm": 1.0669664629692015, + "learning_rate": 2.669837073560887e-06, + "loss": 0.9375, + "step": 4535 + }, + { + "epoch": 0.769139465875371, + "grad_norm": 0.9483483359865884, + "learning_rate": 2.666102259515225e-06, + "loss": 0.9027, + "step": 4536 + }, + { + "epoch": 0.769309029249682, + "grad_norm": 0.9969605376018489, + "learning_rate": 2.662369657761963e-06, + "loss": 0.9525, + "step": 4537 + }, + { + "epoch": 0.7694785926239932, + "grad_norm": 0.9484319240986899, + "learning_rate": 2.6586392694270447e-06, + "loss": 0.8892, + "step": 4538 + }, + { + "epoch": 0.7696481559983044, + "grad_norm": 1.029377012576498, + "learning_rate": 2.6549110956357616e-06, + "loss": 0.954, + "step": 4539 + }, + { + "epoch": 0.7698177193726156, + "grad_norm": 0.9626292983425511, + "learning_rate": 2.651185137512725e-06, + "loss": 0.9356, + "step": 4540 + }, + { + "epoch": 0.7699872827469266, + "grad_norm": 0.9764107692421136, + "learning_rate": 2.6474613961818785e-06, + "loss": 0.9079, + "step": 4541 + }, + { + "epoch": 0.7701568461212378, + "grad_norm": 0.992007228976807, + "learning_rate": 2.6437398727665064e-06, + "loss": 0.9408, + "step": 4542 + }, + { + "epoch": 0.770326409495549, + "grad_norm": 1.0237555288161735, + "learning_rate": 2.640020568389213e-06, + "loss": 0.9277, + "step": 4543 + }, + { + "epoch": 0.7704959728698602, + "grad_norm": 1.0109892166818706, + "learning_rate": 2.6363034841719392e-06, + "loss": 0.9203, + "step": 4544 + }, + { + "epoch": 0.7706655362441712, + "grad_norm": 1.0086757517778953, + "learning_rate": 2.6325886212359496e-06, + "loss": 0.9718, + "step": 4545 + }, + { + "epoch": 0.7708350996184824, + "grad_norm": 0.955116575538711, + "learning_rate": 2.628875980701853e-06, + "loss": 0.9119, + "step": 4546 + }, + { + "epoch": 0.7710046629927936, + "grad_norm": 1.0343277032677225, + "learning_rate": 2.6251655636895725e-06, + "loss": 0.9409, + "step": 4547 + }, + { + "epoch": 0.7711742263671048, + "grad_norm": 0.963963943291546, + "learning_rate": 2.621457371318369e-06, + "loss": 0.9218, + "step": 4548 + }, + { + "epoch": 0.7713437897414158, + "grad_norm": 0.9467965919780134, + "learning_rate": 2.6177514047068287e-06, + "loss": 0.9324, + "step": 4549 + }, + { + "epoch": 0.771513353115727, + "grad_norm": 0.9867982116701469, + "learning_rate": 2.6140476649728673e-06, + "loss": 0.9205, + "step": 4550 + }, + { + "epoch": 0.7716829164900382, + "grad_norm": 1.0021998373541978, + "learning_rate": 2.6103461532337305e-06, + "loss": 0.9331, + "step": 4551 + }, + { + "epoch": 0.7718524798643494, + "grad_norm": 0.9548073436803785, + "learning_rate": 2.6066468706059857e-06, + "loss": 0.9201, + "step": 4552 + }, + { + "epoch": 0.7720220432386604, + "grad_norm": 0.9898353928826127, + "learning_rate": 2.602949818205539e-06, + "loss": 0.9256, + "step": 4553 + }, + { + "epoch": 0.7721916066129716, + "grad_norm": 0.9937073711476034, + "learning_rate": 2.5992549971476166e-06, + "loss": 0.9263, + "step": 4554 + }, + { + "epoch": 0.7723611699872828, + "grad_norm": 1.010621585589431, + "learning_rate": 2.59556240854677e-06, + "loss": 0.9341, + "step": 4555 + }, + { + "epoch": 0.772530733361594, + "grad_norm": 0.9866826861082622, + "learning_rate": 2.591872053516877e-06, + "loss": 0.9242, + "step": 4556 + }, + { + "epoch": 0.772700296735905, + "grad_norm": 0.9555609347168634, + "learning_rate": 2.5881839331711524e-06, + "loss": 0.8964, + "step": 4557 + }, + { + "epoch": 0.7728698601102162, + "grad_norm": 0.95903536597104, + "learning_rate": 2.5844980486221225e-06, + "loss": 0.9262, + "step": 4558 + }, + { + "epoch": 0.7730394234845274, + "grad_norm": 0.9461852995439718, + "learning_rate": 2.5808144009816448e-06, + "loss": 0.9071, + "step": 4559 + }, + { + "epoch": 0.7732089868588385, + "grad_norm": 1.0127071503564309, + "learning_rate": 2.577132991360909e-06, + "loss": 0.9434, + "step": 4560 + }, + { + "epoch": 0.7733785502331496, + "grad_norm": 0.9565675524156988, + "learning_rate": 2.5734538208704197e-06, + "loss": 0.9141, + "step": 4561 + }, + { + "epoch": 0.7735481136074608, + "grad_norm": 0.9886000095923463, + "learning_rate": 2.5697768906200084e-06, + "loss": 0.9459, + "step": 4562 + }, + { + "epoch": 0.773717676981772, + "grad_norm": 0.9601201041518646, + "learning_rate": 2.566102201718832e-06, + "loss": 0.9088, + "step": 4563 + }, + { + "epoch": 0.7738872403560831, + "grad_norm": 0.9595414947304067, + "learning_rate": 2.5624297552753753e-06, + "loss": 0.9159, + "step": 4564 + }, + { + "epoch": 0.7740568037303942, + "grad_norm": 1.0067506355137104, + "learning_rate": 2.5587595523974408e-06, + "loss": 0.9404, + "step": 4565 + }, + { + "epoch": 0.7742263671047054, + "grad_norm": 0.9872154514545771, + "learning_rate": 2.555091594192153e-06, + "loss": 0.9012, + "step": 4566 + }, + { + "epoch": 0.7743959304790166, + "grad_norm": 0.8960540726249263, + "learning_rate": 2.5514258817659685e-06, + "loss": 0.8986, + "step": 4567 + }, + { + "epoch": 0.7745654938533277, + "grad_norm": 1.0134530080426263, + "learning_rate": 2.5477624162246573e-06, + "loss": 0.9416, + "step": 4568 + }, + { + "epoch": 0.7747350572276388, + "grad_norm": 0.9570592798958361, + "learning_rate": 2.5441011986733165e-06, + "loss": 0.9485, + "step": 4569 + }, + { + "epoch": 0.77490462060195, + "grad_norm": 1.0309560713032377, + "learning_rate": 2.540442230216361e-06, + "loss": 0.955, + "step": 4570 + }, + { + "epoch": 0.7750741839762612, + "grad_norm": 0.9899029076169737, + "learning_rate": 2.5367855119575314e-06, + "loss": 0.9393, + "step": 4571 + }, + { + "epoch": 0.7752437473505722, + "grad_norm": 0.9401622652468724, + "learning_rate": 2.533131044999887e-06, + "loss": 0.9084, + "step": 4572 + }, + { + "epoch": 0.7754133107248834, + "grad_norm": 0.9131179357665576, + "learning_rate": 2.5294788304458063e-06, + "loss": 0.9234, + "step": 4573 + }, + { + "epoch": 0.7755828740991946, + "grad_norm": 0.9316857143154513, + "learning_rate": 2.5258288693969968e-06, + "loss": 0.9284, + "step": 4574 + }, + { + "epoch": 0.7757524374735058, + "grad_norm": 0.9999576473093155, + "learning_rate": 2.5221811629544768e-06, + "loss": 0.9458, + "step": 4575 + }, + { + "epoch": 0.7759220008478168, + "grad_norm": 0.9955456412044222, + "learning_rate": 2.518535712218587e-06, + "loss": 0.9525, + "step": 4576 + }, + { + "epoch": 0.776091564222128, + "grad_norm": 0.9364793313670482, + "learning_rate": 2.514892518288988e-06, + "loss": 0.9411, + "step": 4577 + }, + { + "epoch": 0.7762611275964392, + "grad_norm": 0.9498691294792266, + "learning_rate": 2.5112515822646655e-06, + "loss": 0.9439, + "step": 4578 + }, + { + "epoch": 0.7764306909707503, + "grad_norm": 0.9742256271138886, + "learning_rate": 2.507612905243916e-06, + "loss": 0.9391, + "step": 4579 + }, + { + "epoch": 0.7766002543450614, + "grad_norm": 0.9867988929491583, + "learning_rate": 2.5039764883243555e-06, + "loss": 0.9258, + "step": 4580 + }, + { + "epoch": 0.7767698177193726, + "grad_norm": 0.9779290388337014, + "learning_rate": 2.5003423326029187e-06, + "loss": 0.936, + "step": 4581 + }, + { + "epoch": 0.7769393810936838, + "grad_norm": 0.9969522311928066, + "learning_rate": 2.4967104391758657e-06, + "loss": 0.9302, + "step": 4582 + }, + { + "epoch": 0.7771089444679949, + "grad_norm": 0.9804579517930131, + "learning_rate": 2.493080809138765e-06, + "loss": 0.9545, + "step": 4583 + }, + { + "epoch": 0.777278507842306, + "grad_norm": 0.9576352820090382, + "learning_rate": 2.4894534435865015e-06, + "loss": 0.9273, + "step": 4584 + }, + { + "epoch": 0.7774480712166172, + "grad_norm": 0.9416018054001984, + "learning_rate": 2.485828343613288e-06, + "loss": 0.9347, + "step": 4585 + }, + { + "epoch": 0.7776176345909284, + "grad_norm": 1.0175922324651776, + "learning_rate": 2.482205510312644e-06, + "loss": 0.9214, + "step": 4586 + }, + { + "epoch": 0.7777871979652395, + "grad_norm": 0.9605266022301537, + "learning_rate": 2.478584944777408e-06, + "loss": 0.9166, + "step": 4587 + }, + { + "epoch": 0.7779567613395506, + "grad_norm": 0.9771414522952164, + "learning_rate": 2.4749666480997336e-06, + "loss": 0.9306, + "step": 4588 + }, + { + "epoch": 0.7781263247138618, + "grad_norm": 0.9972343904399632, + "learning_rate": 2.4713506213710924e-06, + "loss": 0.8713, + "step": 4589 + }, + { + "epoch": 0.778295888088173, + "grad_norm": 0.9888592912550048, + "learning_rate": 2.467736865682269e-06, + "loss": 0.9205, + "step": 4590 + }, + { + "epoch": 0.7784654514624841, + "grad_norm": 0.9194702433723356, + "learning_rate": 2.46412538212336e-06, + "loss": 0.9308, + "step": 4591 + }, + { + "epoch": 0.7786350148367952, + "grad_norm": 0.9609175606956006, + "learning_rate": 2.4605161717837866e-06, + "loss": 0.8881, + "step": 4592 + }, + { + "epoch": 0.7788045782111064, + "grad_norm": 0.9563015470368535, + "learning_rate": 2.456909235752276e-06, + "loss": 0.8982, + "step": 4593 + }, + { + "epoch": 0.7789741415854176, + "grad_norm": 0.9356709164135482, + "learning_rate": 2.4533045751168703e-06, + "loss": 0.8836, + "step": 4594 + }, + { + "epoch": 0.7791437049597287, + "grad_norm": 0.5796414686874024, + "learning_rate": 2.4497021909649252e-06, + "loss": 0.7278, + "step": 4595 + }, + { + "epoch": 0.7793132683340398, + "grad_norm": 0.9677676567011475, + "learning_rate": 2.446102084383114e-06, + "loss": 0.941, + "step": 4596 + }, + { + "epoch": 0.779482831708351, + "grad_norm": 0.6698494995233439, + "learning_rate": 2.4425042564574186e-06, + "loss": 0.7981, + "step": 4597 + }, + { + "epoch": 0.7796523950826622, + "grad_norm": 0.9949907503026933, + "learning_rate": 2.4389087082731333e-06, + "loss": 0.8962, + "step": 4598 + }, + { + "epoch": 0.7798219584569733, + "grad_norm": 1.0029170594196486, + "learning_rate": 2.4353154409148637e-06, + "loss": 0.9486, + "step": 4599 + }, + { + "epoch": 0.7799915218312844, + "grad_norm": 0.9477840888037887, + "learning_rate": 2.4317244554665363e-06, + "loss": 0.8876, + "step": 4600 + }, + { + "epoch": 0.7801610852055956, + "grad_norm": 1.01575366322494, + "learning_rate": 2.4281357530113804e-06, + "loss": 0.953, + "step": 4601 + }, + { + "epoch": 0.7803306485799067, + "grad_norm": 0.9585513184495699, + "learning_rate": 2.424549334631934e-06, + "loss": 0.8769, + "step": 4602 + }, + { + "epoch": 0.7805002119542179, + "grad_norm": 1.0138760329834557, + "learning_rate": 2.420965201410057e-06, + "loss": 0.9835, + "step": 4603 + }, + { + "epoch": 0.780669775328529, + "grad_norm": 0.9400887669248903, + "learning_rate": 2.417383354426912e-06, + "loss": 0.8921, + "step": 4604 + }, + { + "epoch": 0.7808393387028402, + "grad_norm": 0.9405632267543659, + "learning_rate": 2.4138037947629743e-06, + "loss": 0.9733, + "step": 4605 + }, + { + "epoch": 0.7810089020771513, + "grad_norm": 1.011495831662797, + "learning_rate": 2.4102265234980283e-06, + "loss": 0.918, + "step": 4606 + }, + { + "epoch": 0.7811784654514625, + "grad_norm": 0.9186774967759711, + "learning_rate": 2.406651541711169e-06, + "loss": 0.9043, + "step": 4607 + }, + { + "epoch": 0.7813480288257736, + "grad_norm": 1.0173055488937643, + "learning_rate": 2.4030788504808e-06, + "loss": 0.9523, + "step": 4608 + }, + { + "epoch": 0.7815175922000848, + "grad_norm": 0.9678865161540681, + "learning_rate": 2.399508450884631e-06, + "loss": 0.9137, + "step": 4609 + }, + { + "epoch": 0.7816871555743959, + "grad_norm": 0.9614206121502811, + "learning_rate": 2.395940343999691e-06, + "loss": 0.9252, + "step": 4610 + }, + { + "epoch": 0.7818567189487071, + "grad_norm": 0.95624753773201, + "learning_rate": 2.3923745309023072e-06, + "loss": 0.9219, + "step": 4611 + }, + { + "epoch": 0.7820262823230182, + "grad_norm": 0.9569151513549119, + "learning_rate": 2.3888110126681163e-06, + "loss": 0.9238, + "step": 4612 + }, + { + "epoch": 0.7821958456973294, + "grad_norm": 1.009470111931034, + "learning_rate": 2.3852497903720626e-06, + "loss": 0.9833, + "step": 4613 + }, + { + "epoch": 0.7823654090716405, + "grad_norm": 1.0000534269331767, + "learning_rate": 2.3816908650884063e-06, + "loss": 0.9607, + "step": 4614 + }, + { + "epoch": 0.7825349724459517, + "grad_norm": 1.0048695176894948, + "learning_rate": 2.3781342378907023e-06, + "loss": 0.9327, + "step": 4615 + }, + { + "epoch": 0.7827045358202628, + "grad_norm": 0.9429319987981376, + "learning_rate": 2.3745799098518208e-06, + "loss": 0.9316, + "step": 4616 + }, + { + "epoch": 0.782874099194574, + "grad_norm": 0.9660451250704443, + "learning_rate": 2.3710278820439313e-06, + "loss": 0.9452, + "step": 4617 + }, + { + "epoch": 0.7830436625688851, + "grad_norm": 0.9371194277097062, + "learning_rate": 2.3674781555385197e-06, + "loss": 0.9498, + "step": 4618 + }, + { + "epoch": 0.7832132259431963, + "grad_norm": 1.0346455180338319, + "learning_rate": 2.363930731406369e-06, + "loss": 0.9112, + "step": 4619 + }, + { + "epoch": 0.7833827893175074, + "grad_norm": 0.9541504666714702, + "learning_rate": 2.360385610717567e-06, + "loss": 0.914, + "step": 4620 + }, + { + "epoch": 0.7835523526918186, + "grad_norm": 0.6617943696480774, + "learning_rate": 2.3568427945415163e-06, + "loss": 0.7922, + "step": 4621 + }, + { + "epoch": 0.7837219160661297, + "grad_norm": 1.0097916238720612, + "learning_rate": 2.3533022839469154e-06, + "loss": 0.9438, + "step": 4622 + }, + { + "epoch": 0.7838914794404409, + "grad_norm": 1.0156756044780897, + "learning_rate": 2.3497640800017687e-06, + "loss": 0.9333, + "step": 4623 + }, + { + "epoch": 0.784061042814752, + "grad_norm": 1.0161261544899474, + "learning_rate": 2.346228183773388e-06, + "loss": 0.9657, + "step": 4624 + }, + { + "epoch": 0.7842306061890632, + "grad_norm": 0.9804343650680064, + "learning_rate": 2.3426945963283853e-06, + "loss": 0.934, + "step": 4625 + }, + { + "epoch": 0.7844001695633743, + "grad_norm": 0.9721135344404941, + "learning_rate": 2.3391633187326802e-06, + "loss": 0.9113, + "step": 4626 + }, + { + "epoch": 0.7845697329376855, + "grad_norm": 1.0070824174582935, + "learning_rate": 2.335634352051488e-06, + "loss": 0.9481, + "step": 4627 + }, + { + "epoch": 0.7847392963119966, + "grad_norm": 0.9145311520420111, + "learning_rate": 2.3321076973493396e-06, + "loss": 0.9041, + "step": 4628 + }, + { + "epoch": 0.7849088596863077, + "grad_norm": 0.9527946683105675, + "learning_rate": 2.328583355690056e-06, + "loss": 0.949, + "step": 4629 + }, + { + "epoch": 0.7850784230606189, + "grad_norm": 0.973671720757159, + "learning_rate": 2.3250613281367686e-06, + "loss": 0.9477, + "step": 4630 + }, + { + "epoch": 0.7852479864349301, + "grad_norm": 0.6159806016999128, + "learning_rate": 2.3215416157519023e-06, + "loss": 0.7851, + "step": 4631 + }, + { + "epoch": 0.7854175498092412, + "grad_norm": 0.9906897486481537, + "learning_rate": 2.318024219597196e-06, + "loss": 0.9333, + "step": 4632 + }, + { + "epoch": 0.7855871131835523, + "grad_norm": 0.9689762304004497, + "learning_rate": 2.3145091407336785e-06, + "loss": 0.9326, + "step": 4633 + }, + { + "epoch": 0.7857566765578635, + "grad_norm": 0.978384954222861, + "learning_rate": 2.3109963802216863e-06, + "loss": 0.9614, + "step": 4634 + }, + { + "epoch": 0.7859262399321747, + "grad_norm": 1.0009916482629015, + "learning_rate": 2.3074859391208494e-06, + "loss": 0.9314, + "step": 4635 + }, + { + "epoch": 0.7860958033064858, + "grad_norm": 1.0297972356344058, + "learning_rate": 2.3039778184901086e-06, + "loss": 0.9782, + "step": 4636 + }, + { + "epoch": 0.7862653666807969, + "grad_norm": 0.9280285053923906, + "learning_rate": 2.3004720193876972e-06, + "loss": 0.9071, + "step": 4637 + }, + { + "epoch": 0.7864349300551081, + "grad_norm": 0.9941700642047782, + "learning_rate": 2.2969685428711474e-06, + "loss": 0.9241, + "step": 4638 + }, + { + "epoch": 0.7866044934294193, + "grad_norm": 0.6647785117908844, + "learning_rate": 2.293467389997299e-06, + "loss": 0.7841, + "step": 4639 + }, + { + "epoch": 0.7867740568037304, + "grad_norm": 0.9862723693615132, + "learning_rate": 2.289968561822282e-06, + "loss": 0.9258, + "step": 4640 + }, + { + "epoch": 0.7869436201780415, + "grad_norm": 0.9484606916472316, + "learning_rate": 2.2864720594015288e-06, + "loss": 0.8873, + "step": 4641 + }, + { + "epoch": 0.7871131835523527, + "grad_norm": 0.9335256089712528, + "learning_rate": 2.2829778837897696e-06, + "loss": 0.8744, + "step": 4642 + }, + { + "epoch": 0.7872827469266639, + "grad_norm": 0.5691102090424409, + "learning_rate": 2.279486036041034e-06, + "loss": 0.7239, + "step": 4643 + }, + { + "epoch": 0.787452310300975, + "grad_norm": 0.9825968589176427, + "learning_rate": 2.2759965172086474e-06, + "loss": 0.9306, + "step": 4644 + }, + { + "epoch": 0.7876218736752861, + "grad_norm": 0.9564621383240213, + "learning_rate": 2.2725093283452305e-06, + "loss": 0.9129, + "step": 4645 + }, + { + "epoch": 0.7877914370495973, + "grad_norm": 0.9601210859865337, + "learning_rate": 2.269024470502711e-06, + "loss": 0.9283, + "step": 4646 + }, + { + "epoch": 0.7879610004239085, + "grad_norm": 0.9687480256909964, + "learning_rate": 2.2655419447323035e-06, + "loss": 0.9035, + "step": 4647 + }, + { + "epoch": 0.7881305637982196, + "grad_norm": 0.9753461758185377, + "learning_rate": 2.262061752084522e-06, + "loss": 0.9414, + "step": 4648 + }, + { + "epoch": 0.7883001271725307, + "grad_norm": 0.92653080541803, + "learning_rate": 2.2585838936091753e-06, + "loss": 0.9551, + "step": 4649 + }, + { + "epoch": 0.7884696905468419, + "grad_norm": 0.9624997063709831, + "learning_rate": 2.2551083703553755e-06, + "loss": 0.918, + "step": 4650 + }, + { + "epoch": 0.7886392539211531, + "grad_norm": 0.9665142475004171, + "learning_rate": 2.251635183371521e-06, + "loss": 0.9023, + "step": 4651 + }, + { + "epoch": 0.7888088172954641, + "grad_norm": 0.610728222213766, + "learning_rate": 2.2481643337053095e-06, + "loss": 0.7961, + "step": 4652 + }, + { + "epoch": 0.7889783806697753, + "grad_norm": 1.0218472449110727, + "learning_rate": 2.244695822403731e-06, + "loss": 0.9384, + "step": 4653 + }, + { + "epoch": 0.7891479440440865, + "grad_norm": 0.9498022811356979, + "learning_rate": 2.241229650513077e-06, + "loss": 0.9219, + "step": 4654 + }, + { + "epoch": 0.7893175074183977, + "grad_norm": 0.9450072519810307, + "learning_rate": 2.2377658190789263e-06, + "loss": 0.9128, + "step": 4655 + }, + { + "epoch": 0.7894870707927087, + "grad_norm": 1.049688709567374, + "learning_rate": 2.234304329146152e-06, + "loss": 0.9233, + "step": 4656 + }, + { + "epoch": 0.7896566341670199, + "grad_norm": 0.9796988891329942, + "learning_rate": 2.230845181758928e-06, + "loss": 0.9518, + "step": 4657 + }, + { + "epoch": 0.7898261975413311, + "grad_norm": 0.9809919913716446, + "learning_rate": 2.2273883779607142e-06, + "loss": 0.9276, + "step": 4658 + }, + { + "epoch": 0.7899957609156423, + "grad_norm": 0.968919751988224, + "learning_rate": 2.2239339187942653e-06, + "loss": 0.8738, + "step": 4659 + }, + { + "epoch": 0.7901653242899533, + "grad_norm": 0.9516042657289775, + "learning_rate": 2.2204818053016286e-06, + "loss": 0.9047, + "step": 4660 + }, + { + "epoch": 0.7903348876642645, + "grad_norm": 0.9369762346753585, + "learning_rate": 2.2170320385241475e-06, + "loss": 0.8771, + "step": 4661 + }, + { + "epoch": 0.7905044510385757, + "grad_norm": 0.9743103585735804, + "learning_rate": 2.213584619502451e-06, + "loss": 0.9268, + "step": 4662 + }, + { + "epoch": 0.7906740144128869, + "grad_norm": 0.9820394981217843, + "learning_rate": 2.2101395492764623e-06, + "loss": 0.9273, + "step": 4663 + }, + { + "epoch": 0.7908435777871979, + "grad_norm": 0.9461391037416478, + "learning_rate": 2.206696828885403e-06, + "loss": 0.929, + "step": 4664 + }, + { + "epoch": 0.7910131411615091, + "grad_norm": 0.9271677872468416, + "learning_rate": 2.2032564593677773e-06, + "loss": 0.9155, + "step": 4665 + }, + { + "epoch": 0.7911827045358203, + "grad_norm": 0.9655526067282698, + "learning_rate": 2.199818441761383e-06, + "loss": 0.9449, + "step": 4666 + }, + { + "epoch": 0.7913522679101314, + "grad_norm": 0.670667135717453, + "learning_rate": 2.1963827771033053e-06, + "loss": 0.7807, + "step": 4667 + }, + { + "epoch": 0.7915218312844425, + "grad_norm": 0.9724236998091764, + "learning_rate": 2.192949466429929e-06, + "loss": 0.9066, + "step": 4668 + }, + { + "epoch": 0.7916913946587537, + "grad_norm": 0.9945015401439281, + "learning_rate": 2.189518510776919e-06, + "loss": 0.9376, + "step": 4669 + }, + { + "epoch": 0.7918609580330649, + "grad_norm": 0.9856264100769789, + "learning_rate": 2.1860899111792343e-06, + "loss": 0.9589, + "step": 4670 + }, + { + "epoch": 0.792030521407376, + "grad_norm": 0.9663913871456851, + "learning_rate": 2.182663668671119e-06, + "loss": 0.9129, + "step": 4671 + }, + { + "epoch": 0.7922000847816871, + "grad_norm": 0.9462425753041646, + "learning_rate": 2.1792397842861156e-06, + "loss": 0.933, + "step": 4672 + }, + { + "epoch": 0.7923696481559983, + "grad_norm": 0.9540240836225714, + "learning_rate": 2.1758182590570454e-06, + "loss": 0.9208, + "step": 4673 + }, + { + "epoch": 0.7925392115303095, + "grad_norm": 0.9665364142248658, + "learning_rate": 2.17239909401602e-06, + "loss": 0.8776, + "step": 4674 + }, + { + "epoch": 0.7927087749046206, + "grad_norm": 0.9333832088508213, + "learning_rate": 2.1689822901944456e-06, + "loss": 0.9418, + "step": 4675 + }, + { + "epoch": 0.7928783382789317, + "grad_norm": 0.9586033569905082, + "learning_rate": 2.165567848623009e-06, + "loss": 0.9451, + "step": 4676 + }, + { + "epoch": 0.7930479016532429, + "grad_norm": 0.9597056144881574, + "learning_rate": 2.1621557703316876e-06, + "loss": 0.9595, + "step": 4677 + }, + { + "epoch": 0.7932174650275541, + "grad_norm": 0.9555872140892432, + "learning_rate": 2.158746056349744e-06, + "loss": 0.9402, + "step": 4678 + }, + { + "epoch": 0.7933870284018651, + "grad_norm": 0.9679237813005439, + "learning_rate": 2.15533870770573e-06, + "loss": 0.9331, + "step": 4679 + }, + { + "epoch": 0.7935565917761763, + "grad_norm": 0.9779881149786491, + "learning_rate": 2.151933725427481e-06, + "loss": 0.9351, + "step": 4680 + }, + { + "epoch": 0.7937261551504875, + "grad_norm": 0.9734316327419267, + "learning_rate": 2.148531110542118e-06, + "loss": 0.8757, + "step": 4681 + }, + { + "epoch": 0.7938957185247987, + "grad_norm": 0.9719291624253216, + "learning_rate": 2.145130864076055e-06, + "loss": 0.9153, + "step": 4682 + }, + { + "epoch": 0.7940652818991097, + "grad_norm": 0.9830559712044682, + "learning_rate": 2.1417329870549852e-06, + "loss": 0.8693, + "step": 4683 + }, + { + "epoch": 0.7942348452734209, + "grad_norm": 0.9401441818180429, + "learning_rate": 2.138337480503888e-06, + "loss": 0.9559, + "step": 4684 + }, + { + "epoch": 0.7944044086477321, + "grad_norm": 0.9726940934553796, + "learning_rate": 2.1349443454470254e-06, + "loss": 0.9369, + "step": 4685 + }, + { + "epoch": 0.7945739720220433, + "grad_norm": 0.9510115605898624, + "learning_rate": 2.1315535829079524e-06, + "loss": 0.8917, + "step": 4686 + }, + { + "epoch": 0.7947435353963543, + "grad_norm": 0.9400314193832459, + "learning_rate": 2.1281651939094996e-06, + "loss": 0.9215, + "step": 4687 + }, + { + "epoch": 0.7949130987706655, + "grad_norm": 0.9653972117441191, + "learning_rate": 2.1247791794737827e-06, + "loss": 0.9233, + "step": 4688 + }, + { + "epoch": 0.7950826621449767, + "grad_norm": 0.9789152637556073, + "learning_rate": 2.1213955406222076e-06, + "loss": 0.9419, + "step": 4689 + }, + { + "epoch": 0.7952522255192879, + "grad_norm": 0.9351063773477509, + "learning_rate": 2.1180142783754565e-06, + "loss": 0.8875, + "step": 4690 + }, + { + "epoch": 0.7954217888935989, + "grad_norm": 0.9876075494532389, + "learning_rate": 2.1146353937534993e-06, + "loss": 0.9494, + "step": 4691 + }, + { + "epoch": 0.7955913522679101, + "grad_norm": 1.0366653502184795, + "learning_rate": 2.111258887775581e-06, + "loss": 0.9417, + "step": 4692 + }, + { + "epoch": 0.7957609156422213, + "grad_norm": 0.9608788663441238, + "learning_rate": 2.1078847614602437e-06, + "loss": 0.926, + "step": 4693 + }, + { + "epoch": 0.7959304790165325, + "grad_norm": 1.0094349104826035, + "learning_rate": 2.104513015825297e-06, + "loss": 0.8996, + "step": 4694 + }, + { + "epoch": 0.7961000423908435, + "grad_norm": 0.9748103477058154, + "learning_rate": 2.10114365188784e-06, + "loss": 0.932, + "step": 4695 + }, + { + "epoch": 0.7962696057651547, + "grad_norm": 0.9584480408794203, + "learning_rate": 2.097776670664251e-06, + "loss": 0.9142, + "step": 4696 + }, + { + "epoch": 0.7964391691394659, + "grad_norm": 0.9587778951120278, + "learning_rate": 2.09441207317019e-06, + "loss": 0.9519, + "step": 4697 + }, + { + "epoch": 0.7966087325137771, + "grad_norm": 0.9352151520045621, + "learning_rate": 2.091049860420599e-06, + "loss": 0.8854, + "step": 4698 + }, + { + "epoch": 0.7967782958880881, + "grad_norm": 1.0030555159339016, + "learning_rate": 2.0876900334296936e-06, + "loss": 0.9789, + "step": 4699 + }, + { + "epoch": 0.7969478592623993, + "grad_norm": 0.9419132146557154, + "learning_rate": 2.084332593210985e-06, + "loss": 0.9296, + "step": 4700 + }, + { + "epoch": 0.7971174226367105, + "grad_norm": 0.9692116882960738, + "learning_rate": 2.0809775407772505e-06, + "loss": 0.9232, + "step": 4701 + }, + { + "epoch": 0.7972869860110217, + "grad_norm": 0.9249619308759223, + "learning_rate": 2.0776248771405526e-06, + "loss": 0.8713, + "step": 4702 + }, + { + "epoch": 0.7974565493853327, + "grad_norm": 0.973262325677416, + "learning_rate": 2.0742746033122296e-06, + "loss": 0.9292, + "step": 4703 + }, + { + "epoch": 0.7976261127596439, + "grad_norm": 1.0177147650953429, + "learning_rate": 2.070926720302906e-06, + "loss": 0.9113, + "step": 4704 + }, + { + "epoch": 0.7977956761339551, + "grad_norm": 0.9649081215528689, + "learning_rate": 2.0675812291224796e-06, + "loss": 0.9009, + "step": 4705 + }, + { + "epoch": 0.7979652395082663, + "grad_norm": 0.9810061966011068, + "learning_rate": 2.064238130780125e-06, + "loss": 0.9119, + "step": 4706 + }, + { + "epoch": 0.7981348028825773, + "grad_norm": 0.9282221780095217, + "learning_rate": 2.0608974262843018e-06, + "loss": 0.8843, + "step": 4707 + }, + { + "epoch": 0.7983043662568885, + "grad_norm": 1.0638705566041549, + "learning_rate": 2.0575591166427433e-06, + "loss": 0.9466, + "step": 4708 + }, + { + "epoch": 0.7984739296311997, + "grad_norm": 0.9701446145620254, + "learning_rate": 2.0542232028624585e-06, + "loss": 0.9167, + "step": 4709 + }, + { + "epoch": 0.7986434930055109, + "grad_norm": 0.9869482864301755, + "learning_rate": 2.050889685949734e-06, + "loss": 0.9224, + "step": 4710 + }, + { + "epoch": 0.7988130563798219, + "grad_norm": 1.0060000680420302, + "learning_rate": 2.0475585669101415e-06, + "loss": 0.9188, + "step": 4711 + }, + { + "epoch": 0.7989826197541331, + "grad_norm": 0.9718750049975254, + "learning_rate": 2.0442298467485187e-06, + "loss": 0.9269, + "step": 4712 + }, + { + "epoch": 0.7991521831284443, + "grad_norm": 1.0189104140211012, + "learning_rate": 2.0409035264689857e-06, + "loss": 0.947, + "step": 4713 + }, + { + "epoch": 0.7993217465027554, + "grad_norm": 0.9612282930283035, + "learning_rate": 2.0375796070749366e-06, + "loss": 0.9229, + "step": 4714 + }, + { + "epoch": 0.7994913098770665, + "grad_norm": 0.9855196614464193, + "learning_rate": 2.034258089569041e-06, + "loss": 0.9431, + "step": 4715 + }, + { + "epoch": 0.7996608732513777, + "grad_norm": 1.0454491186056527, + "learning_rate": 2.030938974953245e-06, + "loss": 0.9624, + "step": 4716 + }, + { + "epoch": 0.7998304366256889, + "grad_norm": 1.0116574038257442, + "learning_rate": 2.027622264228768e-06, + "loss": 0.9288, + "step": 4717 + }, + { + "epoch": 0.8, + "grad_norm": 0.999263751826078, + "learning_rate": 2.024307958396109e-06, + "loss": 0.9356, + "step": 4718 + }, + { + "epoch": 0.8001695633743111, + "grad_norm": 0.9843516105114978, + "learning_rate": 2.020996058455038e-06, + "loss": 0.9114, + "step": 4719 + }, + { + "epoch": 0.8003391267486223, + "grad_norm": 0.9625613349652331, + "learning_rate": 2.017686565404597e-06, + "loss": 0.9177, + "step": 4720 + }, + { + "epoch": 0.8005086901229335, + "grad_norm": 0.9667283576725946, + "learning_rate": 2.014379480243105e-06, + "loss": 0.9356, + "step": 4721 + }, + { + "epoch": 0.8006782534972446, + "grad_norm": 1.0103221123897892, + "learning_rate": 2.0110748039681573e-06, + "loss": 0.909, + "step": 4722 + }, + { + "epoch": 0.8008478168715557, + "grad_norm": 0.970450649918271, + "learning_rate": 2.0077725375766175e-06, + "loss": 0.9373, + "step": 4723 + }, + { + "epoch": 0.8010173802458669, + "grad_norm": 0.9651817633746508, + "learning_rate": 2.004472682064622e-06, + "loss": 0.8974, + "step": 4724 + }, + { + "epoch": 0.8011869436201781, + "grad_norm": 0.981208472608705, + "learning_rate": 2.0011752384275862e-06, + "loss": 0.9323, + "step": 4725 + }, + { + "epoch": 0.8013565069944892, + "grad_norm": 0.9601056681389586, + "learning_rate": 1.9978802076601934e-06, + "loss": 0.9059, + "step": 4726 + }, + { + "epoch": 0.8015260703688003, + "grad_norm": 0.9487048034399439, + "learning_rate": 1.994587590756397e-06, + "loss": 0.9184, + "step": 4727 + }, + { + "epoch": 0.8016956337431115, + "grad_norm": 0.9769453971356413, + "learning_rate": 1.9912973887094246e-06, + "loss": 0.954, + "step": 4728 + }, + { + "epoch": 0.8018651971174227, + "grad_norm": 0.9165536290195544, + "learning_rate": 1.988009602511779e-06, + "loss": 0.9094, + "step": 4729 + }, + { + "epoch": 0.8020347604917338, + "grad_norm": 0.9633130725762168, + "learning_rate": 1.9847242331552285e-06, + "loss": 0.9183, + "step": 4730 + }, + { + "epoch": 0.8022043238660449, + "grad_norm": 0.9963984385591905, + "learning_rate": 1.981441281630816e-06, + "loss": 0.9423, + "step": 4731 + }, + { + "epoch": 0.8023738872403561, + "grad_norm": 1.0049660232386082, + "learning_rate": 1.9781607489288524e-06, + "loss": 0.9349, + "step": 4732 + }, + { + "epoch": 0.8025434506146673, + "grad_norm": 0.9843477907829624, + "learning_rate": 1.9748826360389216e-06, + "loss": 0.9509, + "step": 4733 + }, + { + "epoch": 0.8027130139889784, + "grad_norm": 1.0141658707565955, + "learning_rate": 1.971606943949872e-06, + "loss": 0.9263, + "step": 4734 + }, + { + "epoch": 0.8028825773632895, + "grad_norm": 0.9649612059454581, + "learning_rate": 1.9683336736498326e-06, + "loss": 0.883, + "step": 4735 + }, + { + "epoch": 0.8030521407376007, + "grad_norm": 0.9498566631837178, + "learning_rate": 1.965062826126192e-06, + "loss": 0.9296, + "step": 4736 + }, + { + "epoch": 0.8032217041119118, + "grad_norm": 0.9287207625370278, + "learning_rate": 1.961794402365611e-06, + "loss": 0.9139, + "step": 4737 + }, + { + "epoch": 0.803391267486223, + "grad_norm": 0.9950462719813227, + "learning_rate": 1.9585284033540197e-06, + "loss": 0.9031, + "step": 4738 + }, + { + "epoch": 0.8035608308605341, + "grad_norm": 0.9944065331843382, + "learning_rate": 1.955264830076614e-06, + "loss": 0.9566, + "step": 4739 + }, + { + "epoch": 0.8037303942348453, + "grad_norm": 1.0612649194383519, + "learning_rate": 1.9520036835178667e-06, + "loss": 0.9035, + "step": 4740 + }, + { + "epoch": 0.8038999576091564, + "grad_norm": 1.0672855654527342, + "learning_rate": 1.9487449646615087e-06, + "loss": 0.9329, + "step": 4741 + }, + { + "epoch": 0.8040695209834676, + "grad_norm": 0.9931615533220247, + "learning_rate": 1.94548867449054e-06, + "loss": 0.9488, + "step": 4742 + }, + { + "epoch": 0.8042390843577787, + "grad_norm": 0.9803644811657122, + "learning_rate": 1.942234813987236e-06, + "loss": 0.9415, + "step": 4743 + }, + { + "epoch": 0.8044086477320899, + "grad_norm": 1.010334356645439, + "learning_rate": 1.9389833841331306e-06, + "loss": 0.9499, + "step": 4744 + }, + { + "epoch": 0.804578211106401, + "grad_norm": 0.9647480940687069, + "learning_rate": 1.935734385909028e-06, + "loss": 0.911, + "step": 4745 + }, + { + "epoch": 0.8047477744807122, + "grad_norm": 0.9807356795805566, + "learning_rate": 1.932487820294995e-06, + "loss": 0.9495, + "step": 4746 + }, + { + "epoch": 0.8049173378550233, + "grad_norm": 0.9542263453922717, + "learning_rate": 1.9292436882703735e-06, + "loss": 0.8983, + "step": 4747 + }, + { + "epoch": 0.8050869012293345, + "grad_norm": 0.98025254757285, + "learning_rate": 1.926001990813763e-06, + "loss": 0.9196, + "step": 4748 + }, + { + "epoch": 0.8052564646036456, + "grad_norm": 0.9753260204686327, + "learning_rate": 1.9227627289030315e-06, + "loss": 0.899, + "step": 4749 + }, + { + "epoch": 0.8054260279779568, + "grad_norm": 0.9779000833260253, + "learning_rate": 1.919525903515309e-06, + "loss": 0.9204, + "step": 4750 + }, + { + "epoch": 0.8055955913522679, + "grad_norm": 0.9326980957721716, + "learning_rate": 1.916291515626999e-06, + "loss": 0.9753, + "step": 4751 + }, + { + "epoch": 0.8057651547265791, + "grad_norm": 0.9469597139423694, + "learning_rate": 1.913059566213763e-06, + "loss": 0.9442, + "step": 4752 + }, + { + "epoch": 0.8059347181008902, + "grad_norm": 0.9747722745405499, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.9507, + "step": 4753 + }, + { + "epoch": 0.8061042814752014, + "grad_norm": 0.9461085853542899, + "learning_rate": 1.9066029867114822e-06, + "loss": 0.9341, + "step": 4754 + }, + { + "epoch": 0.8062738448495125, + "grad_norm": 1.0286523524797848, + "learning_rate": 1.9033783585700848e-06, + "loss": 0.9422, + "step": 4755 + }, + { + "epoch": 0.8064434082238237, + "grad_norm": 0.988493867489724, + "learning_rate": 1.9001561727990524e-06, + "loss": 0.9463, + "step": 4756 + }, + { + "epoch": 0.8066129715981348, + "grad_norm": 0.972605324692901, + "learning_rate": 1.8969364303703664e-06, + "loss": 0.9704, + "step": 4757 + }, + { + "epoch": 0.8067825349724459, + "grad_norm": 1.026823343133025, + "learning_rate": 1.8937191322552762e-06, + "loss": 0.9029, + "step": 4758 + }, + { + "epoch": 0.8069520983467571, + "grad_norm": 0.9411587249968623, + "learning_rate": 1.8905042794242857e-06, + "loss": 0.8702, + "step": 4759 + }, + { + "epoch": 0.8071216617210683, + "grad_norm": 0.9589204462788418, + "learning_rate": 1.8872918728471635e-06, + "loss": 0.9565, + "step": 4760 + }, + { + "epoch": 0.8072912250953794, + "grad_norm": 0.9650962709510975, + "learning_rate": 1.8840819134929467e-06, + "loss": 0.9305, + "step": 4761 + }, + { + "epoch": 0.8074607884696905, + "grad_norm": 0.986957929967007, + "learning_rate": 1.8808744023299263e-06, + "loss": 0.9608, + "step": 4762 + }, + { + "epoch": 0.8076303518440017, + "grad_norm": 0.951733607074917, + "learning_rate": 1.8776693403256585e-06, + "loss": 0.9168, + "step": 4763 + }, + { + "epoch": 0.8077999152183128, + "grad_norm": 0.9427834536601994, + "learning_rate": 1.8744667284469575e-06, + "loss": 0.9309, + "step": 4764 + }, + { + "epoch": 0.807969478592624, + "grad_norm": 0.95219258966223, + "learning_rate": 1.871266567659905e-06, + "loss": 0.895, + "step": 4765 + }, + { + "epoch": 0.8081390419669351, + "grad_norm": 1.0150858001959628, + "learning_rate": 1.8680688589298368e-06, + "loss": 0.9384, + "step": 4766 + }, + { + "epoch": 0.8083086053412463, + "grad_norm": 0.9661334003359936, + "learning_rate": 1.8648736032213521e-06, + "loss": 0.9022, + "step": 4767 + }, + { + "epoch": 0.8084781687155574, + "grad_norm": 0.9918694887957046, + "learning_rate": 1.8616808014983057e-06, + "loss": 0.9254, + "step": 4768 + }, + { + "epoch": 0.8086477320898686, + "grad_norm": 0.9400378081980244, + "learning_rate": 1.8584904547238214e-06, + "loss": 0.9168, + "step": 4769 + }, + { + "epoch": 0.8088172954641797, + "grad_norm": 0.9592460430470862, + "learning_rate": 1.8553025638602762e-06, + "loss": 0.9496, + "step": 4770 + }, + { + "epoch": 0.8089868588384909, + "grad_norm": 0.6852958235590761, + "learning_rate": 1.8521171298693042e-06, + "loss": 0.7791, + "step": 4771 + }, + { + "epoch": 0.809156422212802, + "grad_norm": 0.9953318296084129, + "learning_rate": 1.8489341537118021e-06, + "loss": 0.9639, + "step": 4772 + }, + { + "epoch": 0.8093259855871132, + "grad_norm": 0.5902330545525112, + "learning_rate": 1.8457536363479257e-06, + "loss": 0.7947, + "step": 4773 + }, + { + "epoch": 0.8094955489614243, + "grad_norm": 0.9500980248028972, + "learning_rate": 1.8425755787370869e-06, + "loss": 0.9524, + "step": 4774 + }, + { + "epoch": 0.8096651123357355, + "grad_norm": 0.9712167050616632, + "learning_rate": 1.8393999818379527e-06, + "loss": 0.9226, + "step": 4775 + }, + { + "epoch": 0.8098346757100466, + "grad_norm": 0.9957401480069289, + "learning_rate": 1.8362268466084577e-06, + "loss": 0.93, + "step": 4776 + }, + { + "epoch": 0.8100042390843578, + "grad_norm": 0.9904257801419981, + "learning_rate": 1.8330561740057839e-06, + "loss": 0.9347, + "step": 4777 + }, + { + "epoch": 0.8101738024586689, + "grad_norm": 0.9789113847761655, + "learning_rate": 1.8298879649863733e-06, + "loss": 0.9264, + "step": 4778 + }, + { + "epoch": 0.8103433658329801, + "grad_norm": 1.0034344426941375, + "learning_rate": 1.826722220505931e-06, + "loss": 0.9204, + "step": 4779 + }, + { + "epoch": 0.8105129292072912, + "grad_norm": 0.9749030656390125, + "learning_rate": 1.8235589415194089e-06, + "loss": 0.9125, + "step": 4780 + }, + { + "epoch": 0.8106824925816024, + "grad_norm": 0.985667145582083, + "learning_rate": 1.8203981289810212e-06, + "loss": 0.9293, + "step": 4781 + }, + { + "epoch": 0.8108520559559135, + "grad_norm": 0.9814106765867971, + "learning_rate": 1.8172397838442345e-06, + "loss": 0.9305, + "step": 4782 + }, + { + "epoch": 0.8110216193302247, + "grad_norm": 0.9468807243533414, + "learning_rate": 1.8140839070617765e-06, + "loss": 0.9227, + "step": 4783 + }, + { + "epoch": 0.8111911827045358, + "grad_norm": 0.994909327743116, + "learning_rate": 1.8109304995856247e-06, + "loss": 0.906, + "step": 4784 + }, + { + "epoch": 0.811360746078847, + "grad_norm": 0.9923423759762594, + "learning_rate": 1.8077795623670135e-06, + "loss": 0.9412, + "step": 4785 + }, + { + "epoch": 0.8115303094531581, + "grad_norm": 0.9658799119450953, + "learning_rate": 1.804631096356435e-06, + "loss": 0.9174, + "step": 4786 + }, + { + "epoch": 0.8116998728274692, + "grad_norm": 0.6558538099505516, + "learning_rate": 1.8014851025036329e-06, + "loss": 0.7908, + "step": 4787 + }, + { + "epoch": 0.8118694362017804, + "grad_norm": 0.9436291218934865, + "learning_rate": 1.7983415817576044e-06, + "loss": 0.915, + "step": 4788 + }, + { + "epoch": 0.8120389995760916, + "grad_norm": 0.9418942397454481, + "learning_rate": 1.7952005350666023e-06, + "loss": 0.9018, + "step": 4789 + }, + { + "epoch": 0.8122085629504027, + "grad_norm": 0.9110827827944578, + "learning_rate": 1.7920619633781332e-06, + "loss": 0.9273, + "step": 4790 + }, + { + "epoch": 0.8123781263247138, + "grad_norm": 0.9870548105748547, + "learning_rate": 1.7889258676389577e-06, + "loss": 0.9644, + "step": 4791 + }, + { + "epoch": 0.812547689699025, + "grad_norm": 0.9755421965284303, + "learning_rate": 1.7857922487950873e-06, + "loss": 0.8938, + "step": 4792 + }, + { + "epoch": 0.8127172530733362, + "grad_norm": 1.0061508546765001, + "learning_rate": 1.7826611077917843e-06, + "loss": 0.9369, + "step": 4793 + }, + { + "epoch": 0.8128868164476473, + "grad_norm": 0.9684683002304985, + "learning_rate": 1.779532445573574e-06, + "loss": 0.9117, + "step": 4794 + }, + { + "epoch": 0.8130563798219584, + "grad_norm": 0.9962328589287732, + "learning_rate": 1.7764062630842226e-06, + "loss": 0.9251, + "step": 4795 + }, + { + "epoch": 0.8132259431962696, + "grad_norm": 0.9540049900629931, + "learning_rate": 1.7732825612667503e-06, + "loss": 0.9302, + "step": 4796 + }, + { + "epoch": 0.8133955065705808, + "grad_norm": 0.9863235761950255, + "learning_rate": 1.7701613410634367e-06, + "loss": 0.9395, + "step": 4797 + }, + { + "epoch": 0.8135650699448919, + "grad_norm": 0.9886242855691526, + "learning_rate": 1.7670426034158039e-06, + "loss": 0.9406, + "step": 4798 + }, + { + "epoch": 0.813734633319203, + "grad_norm": 1.0281674291172385, + "learning_rate": 1.7639263492646298e-06, + "loss": 0.939, + "step": 4799 + }, + { + "epoch": 0.8139041966935142, + "grad_norm": 0.9823049664506998, + "learning_rate": 1.7608125795499386e-06, + "loss": 0.924, + "step": 4800 + }, + { + "epoch": 0.8140737600678254, + "grad_norm": 0.9733385794078995, + "learning_rate": 1.757701295211014e-06, + "loss": 0.9427, + "step": 4801 + }, + { + "epoch": 0.8142433234421365, + "grad_norm": 0.9894128000419272, + "learning_rate": 1.7545924971863804e-06, + "loss": 0.9011, + "step": 4802 + }, + { + "epoch": 0.8144128868164476, + "grad_norm": 1.0059160916454422, + "learning_rate": 1.7514861864138145e-06, + "loss": 0.9457, + "step": 4803 + }, + { + "epoch": 0.8145824501907588, + "grad_norm": 1.0159541314761331, + "learning_rate": 1.74838236383035e-06, + "loss": 0.9381, + "step": 4804 + }, + { + "epoch": 0.81475201356507, + "grad_norm": 0.9960947205135507, + "learning_rate": 1.74528103037226e-06, + "loss": 0.9156, + "step": 4805 + }, + { + "epoch": 0.814921576939381, + "grad_norm": 0.9622416591995683, + "learning_rate": 1.7421821869750732e-06, + "loss": 0.8923, + "step": 4806 + }, + { + "epoch": 0.8150911403136922, + "grad_norm": 0.9354823927744681, + "learning_rate": 1.739085834573564e-06, + "loss": 0.9126, + "step": 4807 + }, + { + "epoch": 0.8152607036880034, + "grad_norm": 1.0496368876410247, + "learning_rate": 1.735991974101756e-06, + "loss": 0.9701, + "step": 4808 + }, + { + "epoch": 0.8154302670623146, + "grad_norm": 0.9449191610223884, + "learning_rate": 1.7329006064929232e-06, + "loss": 0.9316, + "step": 4809 + }, + { + "epoch": 0.8155998304366257, + "grad_norm": 1.0275039639697148, + "learning_rate": 1.7298117326795838e-06, + "loss": 0.9138, + "step": 4810 + }, + { + "epoch": 0.8157693938109368, + "grad_norm": 0.981654738732103, + "learning_rate": 1.7267253535935057e-06, + "loss": 0.9087, + "step": 4811 + }, + { + "epoch": 0.815938957185248, + "grad_norm": 1.0400078485669713, + "learning_rate": 1.7236414701657067e-06, + "loss": 0.9751, + "step": 4812 + }, + { + "epoch": 0.8161085205595592, + "grad_norm": 0.9003217428781333, + "learning_rate": 1.7205600833264501e-06, + "loss": 0.8723, + "step": 4813 + }, + { + "epoch": 0.8162780839338702, + "grad_norm": 0.989271252893948, + "learning_rate": 1.7174811940052404e-06, + "loss": 0.9158, + "step": 4814 + }, + { + "epoch": 0.8164476473081814, + "grad_norm": 0.9098563778099721, + "learning_rate": 1.7144048031308414e-06, + "loss": 0.9235, + "step": 4815 + }, + { + "epoch": 0.8166172106824926, + "grad_norm": 0.9782634007564286, + "learning_rate": 1.7113309116312505e-06, + "loss": 0.9032, + "step": 4816 + }, + { + "epoch": 0.8167867740568038, + "grad_norm": 0.6349325824962954, + "learning_rate": 1.7082595204337183e-06, + "loss": 0.742, + "step": 4817 + }, + { + "epoch": 0.8169563374311148, + "grad_norm": 0.9747100475429629, + "learning_rate": 1.705190630464737e-06, + "loss": 0.9107, + "step": 4818 + }, + { + "epoch": 0.817125900805426, + "grad_norm": 0.9754332236128412, + "learning_rate": 1.7021242426500495e-06, + "loss": 0.9203, + "step": 4819 + }, + { + "epoch": 0.8172954641797372, + "grad_norm": 0.9812566911680318, + "learning_rate": 1.6990603579146391e-06, + "loss": 0.9023, + "step": 4820 + }, + { + "epoch": 0.8174650275540484, + "grad_norm": 0.958057701484152, + "learning_rate": 1.6959989771827346e-06, + "loss": 0.9066, + "step": 4821 + }, + { + "epoch": 0.8176345909283594, + "grad_norm": 0.9346876372465223, + "learning_rate": 1.6929401013778157e-06, + "loss": 0.925, + "step": 4822 + }, + { + "epoch": 0.8178041543026706, + "grad_norm": 1.0067900945594015, + "learning_rate": 1.6898837314225969e-06, + "loss": 0.9004, + "step": 4823 + }, + { + "epoch": 0.8179737176769818, + "grad_norm": 1.005496920176738, + "learning_rate": 1.6868298682390437e-06, + "loss": 0.9438, + "step": 4824 + }, + { + "epoch": 0.818143281051293, + "grad_norm": 1.0196362997833366, + "learning_rate": 1.683778512748362e-06, + "loss": 0.9382, + "step": 4825 + }, + { + "epoch": 0.818312844425604, + "grad_norm": 0.6867945510298027, + "learning_rate": 1.6807296658710038e-06, + "loss": 0.837, + "step": 4826 + }, + { + "epoch": 0.8184824077999152, + "grad_norm": 0.968737511817879, + "learning_rate": 1.6776833285266602e-06, + "loss": 0.895, + "step": 4827 + }, + { + "epoch": 0.8186519711742264, + "grad_norm": 0.964416935501097, + "learning_rate": 1.6746395016342708e-06, + "loss": 0.92, + "step": 4828 + }, + { + "epoch": 0.8188215345485376, + "grad_norm": 0.9796298690479828, + "learning_rate": 1.6715981861120112e-06, + "loss": 0.9131, + "step": 4829 + }, + { + "epoch": 0.8189910979228486, + "grad_norm": 0.9522268016283855, + "learning_rate": 1.6685593828773095e-06, + "loss": 0.9081, + "step": 4830 + }, + { + "epoch": 0.8191606612971598, + "grad_norm": 0.9845229378868322, + "learning_rate": 1.6655230928468257e-06, + "loss": 0.9231, + "step": 4831 + }, + { + "epoch": 0.819330224671471, + "grad_norm": 0.9597390804072871, + "learning_rate": 1.6624893169364641e-06, + "loss": 0.9333, + "step": 4832 + }, + { + "epoch": 0.8194997880457822, + "grad_norm": 0.9487069875424946, + "learning_rate": 1.6594580560613782e-06, + "loss": 0.9391, + "step": 4833 + }, + { + "epoch": 0.8196693514200932, + "grad_norm": 0.6473279012223865, + "learning_rate": 1.6564293111359541e-06, + "loss": 0.77, + "step": 4834 + }, + { + "epoch": 0.8198389147944044, + "grad_norm": 0.9996786144045007, + "learning_rate": 1.6534030830738223e-06, + "loss": 0.933, + "step": 4835 + }, + { + "epoch": 0.8200084781687156, + "grad_norm": 0.9351334734665407, + "learning_rate": 1.6503793727878493e-06, + "loss": 0.9011, + "step": 4836 + }, + { + "epoch": 0.8201780415430268, + "grad_norm": 0.9819576205406819, + "learning_rate": 1.6473581811901529e-06, + "loss": 0.934, + "step": 4837 + }, + { + "epoch": 0.8203476049173378, + "grad_norm": 0.9776310942287822, + "learning_rate": 1.6443395091920822e-06, + "loss": 0.9199, + "step": 4838 + }, + { + "epoch": 0.820517168291649, + "grad_norm": 0.9536046115177798, + "learning_rate": 1.6413233577042253e-06, + "loss": 0.9294, + "step": 4839 + }, + { + "epoch": 0.8206867316659602, + "grad_norm": 0.9680768397474379, + "learning_rate": 1.6383097276364202e-06, + "loss": 0.9061, + "step": 4840 + }, + { + "epoch": 0.8208562950402714, + "grad_norm": 0.609376517722487, + "learning_rate": 1.6352986198977327e-06, + "loss": 0.7629, + "step": 4841 + }, + { + "epoch": 0.8210258584145824, + "grad_norm": 1.009160569244628, + "learning_rate": 1.6322900353964732e-06, + "loss": 0.9107, + "step": 4842 + }, + { + "epoch": 0.8211954217888936, + "grad_norm": 0.6463859033489396, + "learning_rate": 1.6292839750401924e-06, + "loss": 0.7836, + "step": 4843 + }, + { + "epoch": 0.8213649851632048, + "grad_norm": 0.9703655318167864, + "learning_rate": 1.6262804397356747e-06, + "loss": 0.9364, + "step": 4844 + }, + { + "epoch": 0.821534548537516, + "grad_norm": 1.010602545527202, + "learning_rate": 1.6232794303889466e-06, + "loss": 0.9356, + "step": 4845 + }, + { + "epoch": 0.821704111911827, + "grad_norm": 0.5898944731230332, + "learning_rate": 1.6202809479052728e-06, + "loss": 0.7388, + "step": 4846 + }, + { + "epoch": 0.8218736752861382, + "grad_norm": 1.01095244106849, + "learning_rate": 1.617284993189151e-06, + "loss": 0.9488, + "step": 4847 + }, + { + "epoch": 0.8220432386604494, + "grad_norm": 0.9628062432037369, + "learning_rate": 1.6142915671443238e-06, + "loss": 0.9035, + "step": 4848 + }, + { + "epoch": 0.8222128020347604, + "grad_norm": 0.9694490357189731, + "learning_rate": 1.6113006706737667e-06, + "loss": 0.8903, + "step": 4849 + }, + { + "epoch": 0.8223823654090716, + "grad_norm": 1.056685661018095, + "learning_rate": 1.60831230467969e-06, + "loss": 0.9387, + "step": 4850 + }, + { + "epoch": 0.8225519287833828, + "grad_norm": 0.9836485925737894, + "learning_rate": 1.6053264700635474e-06, + "loss": 0.9324, + "step": 4851 + }, + { + "epoch": 0.822721492157694, + "grad_norm": 0.9616794407709552, + "learning_rate": 1.6023431677260215e-06, + "loss": 0.8904, + "step": 4852 + }, + { + "epoch": 0.822891055532005, + "grad_norm": 0.9889852040409204, + "learning_rate": 1.599362398567037e-06, + "loss": 0.9208, + "step": 4853 + }, + { + "epoch": 0.8230606189063162, + "grad_norm": 0.9709834492704014, + "learning_rate": 1.596384163485748e-06, + "loss": 0.9187, + "step": 4854 + }, + { + "epoch": 0.8232301822806274, + "grad_norm": 0.9874173204574763, + "learning_rate": 1.5934084633805536e-06, + "loss": 0.9121, + "step": 4855 + }, + { + "epoch": 0.8233997456549386, + "grad_norm": 1.023751431287385, + "learning_rate": 1.590435299149079e-06, + "loss": 0.9184, + "step": 4856 + }, + { + "epoch": 0.8235693090292496, + "grad_norm": 0.9556901891046856, + "learning_rate": 1.587464671688187e-06, + "loss": 0.8759, + "step": 4857 + }, + { + "epoch": 0.8237388724035608, + "grad_norm": 0.9480987982580044, + "learning_rate": 1.5844965818939806e-06, + "loss": 0.8633, + "step": 4858 + }, + { + "epoch": 0.823908435777872, + "grad_norm": 0.9637821483082929, + "learning_rate": 1.5815310306617914e-06, + "loss": 0.8703, + "step": 4859 + }, + { + "epoch": 0.8240779991521832, + "grad_norm": 0.9318774036591555, + "learning_rate": 1.5785680188861862e-06, + "loss": 0.8983, + "step": 4860 + }, + { + "epoch": 0.8242475625264942, + "grad_norm": 0.9716611389266541, + "learning_rate": 1.5756075474609667e-06, + "loss": 0.9194, + "step": 4861 + }, + { + "epoch": 0.8244171259008054, + "grad_norm": 0.9924323162563781, + "learning_rate": 1.5726496172791671e-06, + "loss": 0.919, + "step": 4862 + }, + { + "epoch": 0.8245866892751166, + "grad_norm": 0.9361311174503837, + "learning_rate": 1.5696942292330574e-06, + "loss": 0.9405, + "step": 4863 + }, + { + "epoch": 0.8247562526494278, + "grad_norm": 1.0355391555405462, + "learning_rate": 1.5667413842141377e-06, + "loss": 0.9002, + "step": 4864 + }, + { + "epoch": 0.8249258160237388, + "grad_norm": 0.9786538489118394, + "learning_rate": 1.563791083113142e-06, + "loss": 0.9266, + "step": 4865 + }, + { + "epoch": 0.82509537939805, + "grad_norm": 1.0118083016574306, + "learning_rate": 1.5608433268200418e-06, + "loss": 0.9399, + "step": 4866 + }, + { + "epoch": 0.8252649427723612, + "grad_norm": 0.9535172859201975, + "learning_rate": 1.5578981162240337e-06, + "loss": 0.9558, + "step": 4867 + }, + { + "epoch": 0.8254345061466724, + "grad_norm": 0.9893033245905223, + "learning_rate": 1.554955452213548e-06, + "loss": 0.9359, + "step": 4868 + }, + { + "epoch": 0.8256040695209834, + "grad_norm": 0.9570735510047722, + "learning_rate": 1.5520153356762514e-06, + "loss": 0.9077, + "step": 4869 + }, + { + "epoch": 0.8257736328952946, + "grad_norm": 1.0246930199545017, + "learning_rate": 1.5490777674990376e-06, + "loss": 0.9371, + "step": 4870 + }, + { + "epoch": 0.8259431962696058, + "grad_norm": 0.9431737353875405, + "learning_rate": 1.5461427485680336e-06, + "loss": 0.9469, + "step": 4871 + }, + { + "epoch": 0.826112759643917, + "grad_norm": 1.0339554508298139, + "learning_rate": 1.5432102797685922e-06, + "loss": 0.9553, + "step": 4872 + }, + { + "epoch": 0.826282323018228, + "grad_norm": 0.967358767980075, + "learning_rate": 1.540280361985308e-06, + "loss": 0.8975, + "step": 4873 + }, + { + "epoch": 0.8264518863925392, + "grad_norm": 1.0003183024050315, + "learning_rate": 1.5373529961019972e-06, + "loss": 0.9638, + "step": 4874 + }, + { + "epoch": 0.8266214497668504, + "grad_norm": 0.9159703449042248, + "learning_rate": 1.534428183001705e-06, + "loss": 0.9161, + "step": 4875 + }, + { + "epoch": 0.8267910131411615, + "grad_norm": 0.9694259077321321, + "learning_rate": 1.5315059235667161e-06, + "loss": 0.9299, + "step": 4876 + }, + { + "epoch": 0.8269605765154726, + "grad_norm": 0.9385417863702417, + "learning_rate": 1.528586218678535e-06, + "loss": 0.9158, + "step": 4877 + }, + { + "epoch": 0.8271301398897838, + "grad_norm": 0.9847733491247092, + "learning_rate": 1.5256690692179011e-06, + "loss": 0.9137, + "step": 4878 + }, + { + "epoch": 0.827299703264095, + "grad_norm": 1.0301076951117067, + "learning_rate": 1.5227544760647805e-06, + "loss": 0.9422, + "step": 4879 + }, + { + "epoch": 0.8274692666384061, + "grad_norm": 0.9518676501888206, + "learning_rate": 1.5198424400983692e-06, + "loss": 0.91, + "step": 4880 + }, + { + "epoch": 0.8276388300127172, + "grad_norm": 1.019618675263682, + "learning_rate": 1.5169329621970918e-06, + "loss": 0.9485, + "step": 4881 + }, + { + "epoch": 0.8278083933870284, + "grad_norm": 0.9521386133644539, + "learning_rate": 1.514026043238598e-06, + "loss": 0.957, + "step": 4882 + }, + { + "epoch": 0.8279779567613396, + "grad_norm": 0.9921188141718126, + "learning_rate": 1.5111216840997745e-06, + "loss": 0.9399, + "step": 4883 + }, + { + "epoch": 0.8281475201356507, + "grad_norm": 0.9563047570235461, + "learning_rate": 1.5082198856567265e-06, + "loss": 0.8918, + "step": 4884 + }, + { + "epoch": 0.8283170835099618, + "grad_norm": 0.702179174404355, + "learning_rate": 1.5053206487847916e-06, + "loss": 0.8378, + "step": 4885 + }, + { + "epoch": 0.828486646884273, + "grad_norm": 0.9808465524386978, + "learning_rate": 1.5024239743585301e-06, + "loss": 0.9276, + "step": 4886 + }, + { + "epoch": 0.8286562102585842, + "grad_norm": 0.974430369560909, + "learning_rate": 1.4995298632517374e-06, + "loss": 0.9074, + "step": 4887 + }, + { + "epoch": 0.8288257736328953, + "grad_norm": 0.9524635652761271, + "learning_rate": 1.4966383163374288e-06, + "loss": 0.9249, + "step": 4888 + }, + { + "epoch": 0.8289953370072064, + "grad_norm": 0.9603500943050038, + "learning_rate": 1.4937493344878474e-06, + "loss": 0.8669, + "step": 4889 + }, + { + "epoch": 0.8291649003815176, + "grad_norm": 0.952715669585621, + "learning_rate": 1.4908629185744617e-06, + "loss": 0.8949, + "step": 4890 + }, + { + "epoch": 0.8293344637558288, + "grad_norm": 0.9670199946840378, + "learning_rate": 1.487979069467972e-06, + "loss": 0.8651, + "step": 4891 + }, + { + "epoch": 0.8295040271301399, + "grad_norm": 0.5795050208616122, + "learning_rate": 1.4850977880382977e-06, + "loss": 0.7673, + "step": 4892 + }, + { + "epoch": 0.829673590504451, + "grad_norm": 0.9768997812905504, + "learning_rate": 1.482219075154585e-06, + "loss": 0.9557, + "step": 4893 + }, + { + "epoch": 0.8298431538787622, + "grad_norm": 0.9920070573106412, + "learning_rate": 1.4793429316852092e-06, + "loss": 0.9376, + "step": 4894 + }, + { + "epoch": 0.8300127172530734, + "grad_norm": 0.9563862719736099, + "learning_rate": 1.4764693584977663e-06, + "loss": 0.9583, + "step": 4895 + }, + { + "epoch": 0.8301822806273845, + "grad_norm": 0.9962689462156771, + "learning_rate": 1.4735983564590784e-06, + "loss": 0.9428, + "step": 4896 + }, + { + "epoch": 0.8303518440016956, + "grad_norm": 1.0030443357978136, + "learning_rate": 1.4707299264351914e-06, + "loss": 0.9446, + "step": 4897 + }, + { + "epoch": 0.8305214073760068, + "grad_norm": 1.0019083955092671, + "learning_rate": 1.467864069291376e-06, + "loss": 0.9058, + "step": 4898 + }, + { + "epoch": 0.830690970750318, + "grad_norm": 0.9380533336191916, + "learning_rate": 1.4650007858921279e-06, + "loss": 0.9111, + "step": 4899 + }, + { + "epoch": 0.8308605341246291, + "grad_norm": 0.9841045475935494, + "learning_rate": 1.4621400771011607e-06, + "loss": 0.9398, + "step": 4900 + }, + { + "epoch": 0.8310300974989402, + "grad_norm": 0.9983828334245135, + "learning_rate": 1.459281943781422e-06, + "loss": 0.9174, + "step": 4901 + }, + { + "epoch": 0.8311996608732514, + "grad_norm": 1.0282451440043119, + "learning_rate": 1.4564263867950733e-06, + "loss": 0.9406, + "step": 4902 + }, + { + "epoch": 0.8313692242475625, + "grad_norm": 0.9264131165546978, + "learning_rate": 1.4535734070035024e-06, + "loss": 0.8977, + "step": 4903 + }, + { + "epoch": 0.8315387876218737, + "grad_norm": 0.9696854041122743, + "learning_rate": 1.450723005267317e-06, + "loss": 0.9192, + "step": 4904 + }, + { + "epoch": 0.8317083509961848, + "grad_norm": 1.016881718587947, + "learning_rate": 1.4478751824463543e-06, + "loss": 0.9723, + "step": 4905 + }, + { + "epoch": 0.831877914370496, + "grad_norm": 0.9699394048370444, + "learning_rate": 1.4450299393996647e-06, + "loss": 0.9263, + "step": 4906 + }, + { + "epoch": 0.8320474777448071, + "grad_norm": 0.9559044181349066, + "learning_rate": 1.4421872769855262e-06, + "loss": 0.8792, + "step": 4907 + }, + { + "epoch": 0.8322170411191183, + "grad_norm": 0.9834967046558155, + "learning_rate": 1.4393471960614336e-06, + "loss": 0.9072, + "step": 4908 + }, + { + "epoch": 0.8323866044934294, + "grad_norm": 0.9586249063145813, + "learning_rate": 1.436509697484111e-06, + "loss": 0.9072, + "step": 4909 + }, + { + "epoch": 0.8325561678677406, + "grad_norm": 0.9479908156064399, + "learning_rate": 1.4336747821094942e-06, + "loss": 0.9268, + "step": 4910 + }, + { + "epoch": 0.8327257312420517, + "grad_norm": 1.013493554150115, + "learning_rate": 1.4308424507927442e-06, + "loss": 0.9574, + "step": 4911 + }, + { + "epoch": 0.8328952946163629, + "grad_norm": 0.9874573443113831, + "learning_rate": 1.4280127043882452e-06, + "loss": 0.9287, + "step": 4912 + }, + { + "epoch": 0.833064857990674, + "grad_norm": 0.9521073735529032, + "learning_rate": 1.4251855437495976e-06, + "loss": 0.9297, + "step": 4913 + }, + { + "epoch": 0.8332344213649852, + "grad_norm": 1.0392229429675954, + "learning_rate": 1.4223609697296214e-06, + "loss": 0.9652, + "step": 4914 + }, + { + "epoch": 0.8334039847392963, + "grad_norm": 0.9995407037311419, + "learning_rate": 1.4195389831803596e-06, + "loss": 0.9432, + "step": 4915 + }, + { + "epoch": 0.8335735481136075, + "grad_norm": 0.9640359212398271, + "learning_rate": 1.416719584953069e-06, + "loss": 0.8951, + "step": 4916 + }, + { + "epoch": 0.8337431114879186, + "grad_norm": 0.999230910967728, + "learning_rate": 1.413902775898236e-06, + "loss": 0.9116, + "step": 4917 + }, + { + "epoch": 0.8339126748622298, + "grad_norm": 0.6102136717163895, + "learning_rate": 1.4110885568655564e-06, + "loss": 0.7349, + "step": 4918 + }, + { + "epoch": 0.8340822382365409, + "grad_norm": 0.9604053185813455, + "learning_rate": 1.4082769287039465e-06, + "loss": 0.9449, + "step": 4919 + }, + { + "epoch": 0.8342518016108521, + "grad_norm": 0.9571973842234587, + "learning_rate": 1.405467892261545e-06, + "loss": 0.8934, + "step": 4920 + }, + { + "epoch": 0.8344213649851632, + "grad_norm": 1.0191331904116319, + "learning_rate": 1.4026614483857037e-06, + "loss": 0.9309, + "step": 4921 + }, + { + "epoch": 0.8345909283594743, + "grad_norm": 1.006271078738936, + "learning_rate": 1.3998575979229944e-06, + "loss": 0.8823, + "step": 4922 + }, + { + "epoch": 0.8347604917337855, + "grad_norm": 0.9863906821835449, + "learning_rate": 1.3970563417192117e-06, + "loss": 0.8679, + "step": 4923 + }, + { + "epoch": 0.8349300551080967, + "grad_norm": 0.9768584301538424, + "learning_rate": 1.3942576806193597e-06, + "loss": 0.9066, + "step": 4924 + }, + { + "epoch": 0.8350996184824078, + "grad_norm": 0.9796722255798455, + "learning_rate": 1.391461615467663e-06, + "loss": 0.9128, + "step": 4925 + }, + { + "epoch": 0.835269181856719, + "grad_norm": 0.96682017045703, + "learning_rate": 1.3886681471075614e-06, + "loss": 0.9174, + "step": 4926 + }, + { + "epoch": 0.8354387452310301, + "grad_norm": 0.9884606016088618, + "learning_rate": 1.3858772763817174e-06, + "loss": 0.9356, + "step": 4927 + }, + { + "epoch": 0.8356083086053413, + "grad_norm": 1.0281187927452053, + "learning_rate": 1.3830890041320034e-06, + "loss": 0.9438, + "step": 4928 + }, + { + "epoch": 0.8357778719796524, + "grad_norm": 1.0082471503138797, + "learning_rate": 1.3803033311995072e-06, + "loss": 0.9299, + "step": 4929 + }, + { + "epoch": 0.8359474353539635, + "grad_norm": 1.0139703466929773, + "learning_rate": 1.3775202584245407e-06, + "loss": 0.9123, + "step": 4930 + }, + { + "epoch": 0.8361169987282747, + "grad_norm": 0.9668120722955628, + "learning_rate": 1.374739786646624e-06, + "loss": 0.9238, + "step": 4931 + }, + { + "epoch": 0.8362865621025859, + "grad_norm": 0.9804851628398163, + "learning_rate": 1.371961916704494e-06, + "loss": 0.9197, + "step": 4932 + }, + { + "epoch": 0.836456125476897, + "grad_norm": 1.0597314863763998, + "learning_rate": 1.3691866494361029e-06, + "loss": 0.9174, + "step": 4933 + }, + { + "epoch": 0.8366256888512081, + "grad_norm": 0.9233662667103956, + "learning_rate": 1.3664139856786207e-06, + "loss": 0.9025, + "step": 4934 + }, + { + "epoch": 0.8367952522255193, + "grad_norm": 0.9428850174027956, + "learning_rate": 1.3636439262684299e-06, + "loss": 0.9014, + "step": 4935 + }, + { + "epoch": 0.8369648155998305, + "grad_norm": 0.901665153353305, + "learning_rate": 1.3608764720411249e-06, + "loss": 0.8708, + "step": 4936 + }, + { + "epoch": 0.8371343789741416, + "grad_norm": 1.0132756975610824, + "learning_rate": 1.3581116238315194e-06, + "loss": 0.9266, + "step": 4937 + }, + { + "epoch": 0.8373039423484527, + "grad_norm": 1.0121643889877587, + "learning_rate": 1.3553493824736352e-06, + "loss": 0.9319, + "step": 4938 + }, + { + "epoch": 0.8374735057227639, + "grad_norm": 0.9851845851018192, + "learning_rate": 1.3525897488007134e-06, + "loss": 0.9114, + "step": 4939 + }, + { + "epoch": 0.837643069097075, + "grad_norm": 0.9729350990858207, + "learning_rate": 1.3498327236452013e-06, + "loss": 0.9263, + "step": 4940 + }, + { + "epoch": 0.8378126324713862, + "grad_norm": 0.9393086894425914, + "learning_rate": 1.3470783078387705e-06, + "loss": 0.9441, + "step": 4941 + }, + { + "epoch": 0.8379821958456973, + "grad_norm": 0.9414341135615347, + "learning_rate": 1.3443265022122952e-06, + "loss": 0.9392, + "step": 4942 + }, + { + "epoch": 0.8381517592200085, + "grad_norm": 0.9431857808933181, + "learning_rate": 1.341577307595867e-06, + "loss": 0.9128, + "step": 4943 + }, + { + "epoch": 0.8383213225943196, + "grad_norm": 1.0252996872904543, + "learning_rate": 1.3388307248187849e-06, + "loss": 0.9408, + "step": 4944 + }, + { + "epoch": 0.8384908859686308, + "grad_norm": 1.0260337618074395, + "learning_rate": 1.336086754709569e-06, + "loss": 0.9224, + "step": 4945 + }, + { + "epoch": 0.8386604493429419, + "grad_norm": 0.9901928693582054, + "learning_rate": 1.3333453980959455e-06, + "loss": 0.9395, + "step": 4946 + }, + { + "epoch": 0.8388300127172531, + "grad_norm": 0.978192593910684, + "learning_rate": 1.330606655804848e-06, + "loss": 0.9192, + "step": 4947 + }, + { + "epoch": 0.8389995760915642, + "grad_norm": 1.003724616434047, + "learning_rate": 1.3278705286624328e-06, + "loss": 0.9197, + "step": 4948 + }, + { + "epoch": 0.8391691394658753, + "grad_norm": 0.9725596652530581, + "learning_rate": 1.3251370174940582e-06, + "loss": 0.8885, + "step": 4949 + }, + { + "epoch": 0.8393387028401865, + "grad_norm": 0.9704560395108309, + "learning_rate": 1.3224061231242946e-06, + "loss": 0.9078, + "step": 4950 + }, + { + "epoch": 0.8395082662144977, + "grad_norm": 1.0143817881479997, + "learning_rate": 1.3196778463769256e-06, + "loss": 0.9369, + "step": 4951 + }, + { + "epoch": 0.8396778295888088, + "grad_norm": 1.0195787508483363, + "learning_rate": 1.316952188074946e-06, + "loss": 0.9061, + "step": 4952 + }, + { + "epoch": 0.8398473929631199, + "grad_norm": 0.9991372709615308, + "learning_rate": 1.3142291490405568e-06, + "loss": 0.9062, + "step": 4953 + }, + { + "epoch": 0.8400169563374311, + "grad_norm": 0.9486706313893987, + "learning_rate": 1.3115087300951711e-06, + "loss": 0.8833, + "step": 4954 + }, + { + "epoch": 0.8401865197117423, + "grad_norm": 0.9933496215537114, + "learning_rate": 1.3087909320594128e-06, + "loss": 0.9239, + "step": 4955 + }, + { + "epoch": 0.8403560830860534, + "grad_norm": 0.9948770760026168, + "learning_rate": 1.3060757557531124e-06, + "loss": 0.9095, + "step": 4956 + }, + { + "epoch": 0.8405256464603645, + "grad_norm": 1.0165086880868526, + "learning_rate": 1.3033632019953113e-06, + "loss": 0.9435, + "step": 4957 + }, + { + "epoch": 0.8406952098346757, + "grad_norm": 0.9825165424599075, + "learning_rate": 1.3006532716042575e-06, + "loss": 0.8781, + "step": 4958 + }, + { + "epoch": 0.8408647732089869, + "grad_norm": 0.97443457630941, + "learning_rate": 1.2979459653974146e-06, + "loss": 0.9411, + "step": 4959 + }, + { + "epoch": 0.841034336583298, + "grad_norm": 0.9889189897518945, + "learning_rate": 1.2952412841914474e-06, + "loss": 0.9378, + "step": 4960 + }, + { + "epoch": 0.8412038999576091, + "grad_norm": 1.0208831118651156, + "learning_rate": 1.2925392288022299e-06, + "loss": 0.941, + "step": 4961 + }, + { + "epoch": 0.8413734633319203, + "grad_norm": 0.9943332742831625, + "learning_rate": 1.2898398000448441e-06, + "loss": 0.9514, + "step": 4962 + }, + { + "epoch": 0.8415430267062315, + "grad_norm": 0.94668515723054, + "learning_rate": 1.2871429987335858e-06, + "loss": 0.9004, + "step": 4963 + }, + { + "epoch": 0.8417125900805426, + "grad_norm": 0.9558742788248252, + "learning_rate": 1.2844488256819497e-06, + "loss": 0.9058, + "step": 4964 + }, + { + "epoch": 0.8418821534548537, + "grad_norm": 0.973229122560861, + "learning_rate": 1.2817572817026402e-06, + "loss": 0.9315, + "step": 4965 + }, + { + "epoch": 0.8420517168291649, + "grad_norm": 1.0854281586788086, + "learning_rate": 1.2790683676075732e-06, + "loss": 0.9091, + "step": 4966 + }, + { + "epoch": 0.8422212802034761, + "grad_norm": 1.0249099048024033, + "learning_rate": 1.2763820842078657e-06, + "loss": 0.9531, + "step": 4967 + }, + { + "epoch": 0.8423908435777872, + "grad_norm": 0.9648933247195882, + "learning_rate": 1.2736984323138435e-06, + "loss": 0.9544, + "step": 4968 + }, + { + "epoch": 0.8425604069520983, + "grad_norm": 0.9168651190565301, + "learning_rate": 1.2710174127350362e-06, + "loss": 0.9141, + "step": 4969 + }, + { + "epoch": 0.8427299703264095, + "grad_norm": 0.9651421591225124, + "learning_rate": 1.2683390262801853e-06, + "loss": 0.8777, + "step": 4970 + }, + { + "epoch": 0.8428995337007207, + "grad_norm": 0.9875249691452004, + "learning_rate": 1.2656632737572327e-06, + "loss": 0.9144, + "step": 4971 + }, + { + "epoch": 0.8430690970750317, + "grad_norm": 0.6142240460768515, + "learning_rate": 1.262990155973327e-06, + "loss": 0.7395, + "step": 4972 + }, + { + "epoch": 0.8432386604493429, + "grad_norm": 0.9512273713004991, + "learning_rate": 1.2603196737348211e-06, + "loss": 0.9254, + "step": 4973 + }, + { + "epoch": 0.8434082238236541, + "grad_norm": 1.0182518950407085, + "learning_rate": 1.257651827847276e-06, + "loss": 0.9026, + "step": 4974 + }, + { + "epoch": 0.8435777871979653, + "grad_norm": 1.0303991261657348, + "learning_rate": 1.2549866191154547e-06, + "loss": 0.8844, + "step": 4975 + }, + { + "epoch": 0.8437473505722763, + "grad_norm": 1.0234561465497893, + "learning_rate": 1.252324048343322e-06, + "loss": 0.8939, + "step": 4976 + }, + { + "epoch": 0.8439169139465875, + "grad_norm": 0.9713062809278666, + "learning_rate": 1.2496641163340562e-06, + "loss": 0.9026, + "step": 4977 + }, + { + "epoch": 0.8440864773208987, + "grad_norm": 0.986330491605456, + "learning_rate": 1.2470068238900323e-06, + "loss": 0.9265, + "step": 4978 + }, + { + "epoch": 0.8442560406952099, + "grad_norm": 0.6566579100299678, + "learning_rate": 1.2443521718128259e-06, + "loss": 0.8019, + "step": 4979 + }, + { + "epoch": 0.8444256040695209, + "grad_norm": 0.9870275002071415, + "learning_rate": 1.2417001609032275e-06, + "loss": 0.9497, + "step": 4980 + }, + { + "epoch": 0.8445951674438321, + "grad_norm": 1.0061971871093598, + "learning_rate": 1.2390507919612215e-06, + "loss": 0.9339, + "step": 4981 + }, + { + "epoch": 0.8447647308181433, + "grad_norm": 0.9775779735701265, + "learning_rate": 1.2364040657859976e-06, + "loss": 0.9211, + "step": 4982 + }, + { + "epoch": 0.8449342941924545, + "grad_norm": 1.0060622902477008, + "learning_rate": 1.233759983175946e-06, + "loss": 0.9286, + "step": 4983 + }, + { + "epoch": 0.8451038575667655, + "grad_norm": 0.979018507866352, + "learning_rate": 1.2311185449286677e-06, + "loss": 0.9753, + "step": 4984 + }, + { + "epoch": 0.8452734209410767, + "grad_norm": 0.9198706477023796, + "learning_rate": 1.2284797518409575e-06, + "loss": 0.9205, + "step": 4985 + }, + { + "epoch": 0.8454429843153879, + "grad_norm": 1.011939611350744, + "learning_rate": 1.2258436047088162e-06, + "loss": 0.9262, + "step": 4986 + }, + { + "epoch": 0.8456125476896991, + "grad_norm": 0.9394857394643923, + "learning_rate": 1.2232101043274437e-06, + "loss": 0.9058, + "step": 4987 + }, + { + "epoch": 0.8457821110640101, + "grad_norm": 0.9807670870589137, + "learning_rate": 1.220579251491245e-06, + "loss": 0.899, + "step": 4988 + }, + { + "epoch": 0.8459516744383213, + "grad_norm": 0.9710975257113879, + "learning_rate": 1.217951046993826e-06, + "loss": 0.9633, + "step": 4989 + }, + { + "epoch": 0.8461212378126325, + "grad_norm": 1.0472037859390761, + "learning_rate": 1.2153254916279899e-06, + "loss": 0.9604, + "step": 4990 + }, + { + "epoch": 0.8462908011869437, + "grad_norm": 0.9628090000004781, + "learning_rate": 1.2127025861857455e-06, + "loss": 0.9254, + "step": 4991 + }, + { + "epoch": 0.8464603645612547, + "grad_norm": 1.0037713136667632, + "learning_rate": 1.2100823314582989e-06, + "loss": 0.9151, + "step": 4992 + }, + { + "epoch": 0.8466299279355659, + "grad_norm": 1.0077026030991845, + "learning_rate": 1.2074647282360573e-06, + "loss": 0.935, + "step": 4993 + }, + { + "epoch": 0.8467994913098771, + "grad_norm": 0.9834971925879705, + "learning_rate": 1.2048497773086276e-06, + "loss": 0.9636, + "step": 4994 + }, + { + "epoch": 0.8469690546841883, + "grad_norm": 0.9323495899469612, + "learning_rate": 1.2022374794648229e-06, + "loss": 0.8843, + "step": 4995 + }, + { + "epoch": 0.8471386180584993, + "grad_norm": 0.9567302487230924, + "learning_rate": 1.199627835492646e-06, + "loss": 0.923, + "step": 4996 + }, + { + "epoch": 0.8473081814328105, + "grad_norm": 0.9758300805544459, + "learning_rate": 1.197020846179303e-06, + "loss": 0.9108, + "step": 4997 + }, + { + "epoch": 0.8474777448071217, + "grad_norm": 0.9379315815069381, + "learning_rate": 1.1944165123112051e-06, + "loss": 0.9141, + "step": 4998 + }, + { + "epoch": 0.8476473081814329, + "grad_norm": 0.9549278742172951, + "learning_rate": 1.1918148346739545e-06, + "loss": 0.9284, + "step": 4999 + }, + { + "epoch": 0.8478168715557439, + "grad_norm": 0.9691541797084137, + "learning_rate": 1.1892158140523546e-06, + "loss": 0.9101, + "step": 5000 + }, + { + "epoch": 0.8479864349300551, + "grad_norm": 0.9770643814016914, + "learning_rate": 1.1866194512304075e-06, + "loss": 0.9442, + "step": 5001 + }, + { + "epoch": 0.8481559983043663, + "grad_norm": 0.9278702655626347, + "learning_rate": 1.1840257469913163e-06, + "loss": 0.8945, + "step": 5002 + }, + { + "epoch": 0.8483255616786775, + "grad_norm": 0.9737224079464902, + "learning_rate": 1.1814347021174777e-06, + "loss": 0.935, + "step": 5003 + }, + { + "epoch": 0.8484951250529885, + "grad_norm": 1.0111504822565995, + "learning_rate": 1.1788463173904896e-06, + "loss": 0.9521, + "step": 5004 + }, + { + "epoch": 0.8486646884272997, + "grad_norm": 0.9920139851895824, + "learning_rate": 1.1762605935911432e-06, + "loss": 0.9271, + "step": 5005 + }, + { + "epoch": 0.8488342518016109, + "grad_norm": 0.9698127376309369, + "learning_rate": 1.1736775314994341e-06, + "loss": 0.9058, + "step": 5006 + }, + { + "epoch": 0.849003815175922, + "grad_norm": 1.0091131338837878, + "learning_rate": 1.1710971318945485e-06, + "loss": 0.92, + "step": 5007 + }, + { + "epoch": 0.8491733785502331, + "grad_norm": 0.8772068668172168, + "learning_rate": 1.1685193955548712e-06, + "loss": 0.8562, + "step": 5008 + }, + { + "epoch": 0.8493429419245443, + "grad_norm": 0.9707871856364261, + "learning_rate": 1.165944323257986e-06, + "loss": 1.0002, + "step": 5009 + }, + { + "epoch": 0.8495125052988555, + "grad_norm": 0.9973843428172391, + "learning_rate": 1.16337191578067e-06, + "loss": 0.9656, + "step": 5010 + }, + { + "epoch": 0.8496820686731666, + "grad_norm": 1.0352908351051708, + "learning_rate": 1.1608021738988973e-06, + "loss": 0.952, + "step": 5011 + }, + { + "epoch": 0.8498516320474777, + "grad_norm": 0.9539738447034236, + "learning_rate": 1.1582350983878365e-06, + "loss": 0.881, + "step": 5012 + }, + { + "epoch": 0.8500211954217889, + "grad_norm": 0.9609608253415943, + "learning_rate": 1.1556706900218572e-06, + "loss": 0.9386, + "step": 5013 + }, + { + "epoch": 0.8501907587961001, + "grad_norm": 0.9441463880293872, + "learning_rate": 1.1531089495745206e-06, + "loss": 0.8891, + "step": 5014 + }, + { + "epoch": 0.8503603221704112, + "grad_norm": 0.9918451684588193, + "learning_rate": 1.150549877818581e-06, + "loss": 0.9339, + "step": 5015 + }, + { + "epoch": 0.8505298855447223, + "grad_norm": 0.9197835009671675, + "learning_rate": 1.1479934755259924e-06, + "loss": 0.938, + "step": 5016 + }, + { + "epoch": 0.8506994489190335, + "grad_norm": 0.9664962209301132, + "learning_rate": 1.1454397434679022e-06, + "loss": 0.9371, + "step": 5017 + }, + { + "epoch": 0.8508690122933447, + "grad_norm": 0.9460492858808486, + "learning_rate": 1.142888682414648e-06, + "loss": 0.9275, + "step": 5018 + }, + { + "epoch": 0.8510385756676558, + "grad_norm": 0.9530415317084844, + "learning_rate": 1.1403402931357655e-06, + "loss": 0.9105, + "step": 5019 + }, + { + "epoch": 0.8512081390419669, + "grad_norm": 1.0093988497385549, + "learning_rate": 1.1377945763999875e-06, + "loss": 0.9366, + "step": 5020 + }, + { + "epoch": 0.8513777024162781, + "grad_norm": 1.0869680924795317, + "learning_rate": 1.1352515329752345e-06, + "loss": 0.9472, + "step": 5021 + }, + { + "epoch": 0.8515472657905893, + "grad_norm": 0.9968130560263498, + "learning_rate": 1.1327111636286237e-06, + "loss": 0.963, + "step": 5022 + }, + { + "epoch": 0.8517168291649004, + "grad_norm": 0.9836133517952427, + "learning_rate": 1.1301734691264633e-06, + "loss": 0.938, + "step": 5023 + }, + { + "epoch": 0.8518863925392115, + "grad_norm": 0.9775731948677512, + "learning_rate": 1.1276384502342596e-06, + "loss": 0.8817, + "step": 5024 + }, + { + "epoch": 0.8520559559135227, + "grad_norm": 0.94942800199654, + "learning_rate": 1.125106107716708e-06, + "loss": 0.9302, + "step": 5025 + }, + { + "epoch": 0.8522255192878339, + "grad_norm": 0.9059019330196004, + "learning_rate": 1.122576442337696e-06, + "loss": 0.8911, + "step": 5026 + }, + { + "epoch": 0.852395082662145, + "grad_norm": 1.0257272814505092, + "learning_rate": 1.120049454860307e-06, + "loss": 0.907, + "step": 5027 + }, + { + "epoch": 0.8525646460364561, + "grad_norm": 0.9515118956153226, + "learning_rate": 1.1175251460468117e-06, + "loss": 0.9011, + "step": 5028 + }, + { + "epoch": 0.8527342094107673, + "grad_norm": 0.9751532162967015, + "learning_rate": 1.115003516658677e-06, + "loss": 0.9002, + "step": 5029 + }, + { + "epoch": 0.8529037727850785, + "grad_norm": 0.9633003561564547, + "learning_rate": 1.1124845674565577e-06, + "loss": 0.9401, + "step": 5030 + }, + { + "epoch": 0.8530733361593895, + "grad_norm": 0.9865426048591225, + "learning_rate": 1.1099682992003058e-06, + "loss": 0.9379, + "step": 5031 + }, + { + "epoch": 0.8532428995337007, + "grad_norm": 0.948565856806794, + "learning_rate": 1.1074547126489609e-06, + "loss": 0.9288, + "step": 5032 + }, + { + "epoch": 0.8534124629080119, + "grad_norm": 0.9817690244995123, + "learning_rate": 1.10494380856075e-06, + "loss": 0.9174, + "step": 5033 + }, + { + "epoch": 0.853582026282323, + "grad_norm": 0.9593227641583841, + "learning_rate": 1.1024355876931004e-06, + "loss": 0.9362, + "step": 5034 + }, + { + "epoch": 0.8537515896566341, + "grad_norm": 0.914609356229114, + "learning_rate": 1.099930050802621e-06, + "loss": 0.9356, + "step": 5035 + }, + { + "epoch": 0.8539211530309453, + "grad_norm": 0.9536917885637378, + "learning_rate": 1.0974271986451169e-06, + "loss": 0.8942, + "step": 5036 + }, + { + "epoch": 0.8540907164052565, + "grad_norm": 0.9699646014679921, + "learning_rate": 1.0949270319755768e-06, + "loss": 0.9163, + "step": 5037 + }, + { + "epoch": 0.8542602797795676, + "grad_norm": 0.9363367333687395, + "learning_rate": 1.0924295515481886e-06, + "loss": 0.9098, + "step": 5038 + }, + { + "epoch": 0.8544298431538787, + "grad_norm": 0.9553517547818056, + "learning_rate": 1.0899347581163222e-06, + "loss": 0.933, + "step": 5039 + }, + { + "epoch": 0.8545994065281899, + "grad_norm": 0.9329261976014018, + "learning_rate": 1.0874426524325398e-06, + "loss": 0.8527, + "step": 5040 + }, + { + "epoch": 0.8547689699025011, + "grad_norm": 0.9672761525526294, + "learning_rate": 1.0849532352485903e-06, + "loss": 0.9329, + "step": 5041 + }, + { + "epoch": 0.8549385332768122, + "grad_norm": 0.957320375598468, + "learning_rate": 1.0824665073154196e-06, + "loss": 0.9199, + "step": 5042 + }, + { + "epoch": 0.8551080966511233, + "grad_norm": 0.9529171321970592, + "learning_rate": 1.0799824693831529e-06, + "loss": 0.9286, + "step": 5043 + }, + { + "epoch": 0.8552776600254345, + "grad_norm": 0.9558869517362364, + "learning_rate": 1.0775011222011078e-06, + "loss": 0.9204, + "step": 5044 + }, + { + "epoch": 0.8554472233997457, + "grad_norm": 0.6120500743838968, + "learning_rate": 1.075022466517791e-06, + "loss": 0.7522, + "step": 5045 + }, + { + "epoch": 0.8556167867740568, + "grad_norm": 0.9127718671865394, + "learning_rate": 1.0725465030808958e-06, + "loss": 0.8934, + "step": 5046 + }, + { + "epoch": 0.8557863501483679, + "grad_norm": 1.0310084597028693, + "learning_rate": 1.0700732326373042e-06, + "loss": 0.9544, + "step": 5047 + }, + { + "epoch": 0.8559559135226791, + "grad_norm": 0.9894276963012947, + "learning_rate": 1.0676026559330842e-06, + "loss": 0.9015, + "step": 5048 + }, + { + "epoch": 0.8561254768969903, + "grad_norm": 0.9570954490836404, + "learning_rate": 1.0651347737134965e-06, + "loss": 0.9528, + "step": 5049 + }, + { + "epoch": 0.8562950402713014, + "grad_norm": 0.9552637051064249, + "learning_rate": 1.062669586722983e-06, + "loss": 0.9047, + "step": 5050 + }, + { + "epoch": 0.8564646036456125, + "grad_norm": 0.9669956212822644, + "learning_rate": 1.0602070957051725e-06, + "loss": 0.8861, + "step": 5051 + }, + { + "epoch": 0.8566341670199237, + "grad_norm": 0.9956227487211226, + "learning_rate": 1.0577473014028872e-06, + "loss": 0.8881, + "step": 5052 + }, + { + "epoch": 0.8568037303942349, + "grad_norm": 0.9809285225589254, + "learning_rate": 1.0552902045581305e-06, + "loss": 0.9232, + "step": 5053 + }, + { + "epoch": 0.856973293768546, + "grad_norm": 0.993206320743409, + "learning_rate": 1.0528358059120913e-06, + "loss": 0.8922, + "step": 5054 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.9260263257570618, + "learning_rate": 1.0503841062051445e-06, + "loss": 0.9162, + "step": 5055 + }, + { + "epoch": 0.8573124205171683, + "grad_norm": 0.6330074201581573, + "learning_rate": 1.0479351061768584e-06, + "loss": 0.7974, + "step": 5056 + }, + { + "epoch": 0.8574819838914794, + "grad_norm": 0.9988409387609664, + "learning_rate": 1.0454888065659775e-06, + "loss": 0.9143, + "step": 5057 + }, + { + "epoch": 0.8576515472657906, + "grad_norm": 0.9799454972930053, + "learning_rate": 1.0430452081104369e-06, + "loss": 0.9238, + "step": 5058 + }, + { + "epoch": 0.8578211106401017, + "grad_norm": 0.9413935993757301, + "learning_rate": 1.040604311547353e-06, + "loss": 0.9183, + "step": 5059 + }, + { + "epoch": 0.8579906740144129, + "grad_norm": 1.0296562243864147, + "learning_rate": 1.038166117613032e-06, + "loss": 0.931, + "step": 5060 + }, + { + "epoch": 0.858160237388724, + "grad_norm": 1.0144033052136412, + "learning_rate": 1.0357306270429623e-06, + "loss": 0.9642, + "step": 5061 + }, + { + "epoch": 0.8583298007630352, + "grad_norm": 0.953876184537609, + "learning_rate": 1.0332978405718175e-06, + "loss": 0.9419, + "step": 5062 + }, + { + "epoch": 0.8584993641373463, + "grad_norm": 0.9400286309399865, + "learning_rate": 1.0308677589334526e-06, + "loss": 0.9517, + "step": 5063 + }, + { + "epoch": 0.8586689275116575, + "grad_norm": 0.948599186172546, + "learning_rate": 1.0284403828609113e-06, + "loss": 0.9143, + "step": 5064 + }, + { + "epoch": 0.8588384908859686, + "grad_norm": 0.9126977614520076, + "learning_rate": 1.0260157130864178e-06, + "loss": 0.9245, + "step": 5065 + }, + { + "epoch": 0.8590080542602798, + "grad_norm": 1.0104631532358312, + "learning_rate": 1.0235937503413795e-06, + "loss": 0.8879, + "step": 5066 + }, + { + "epoch": 0.8591776176345909, + "grad_norm": 0.9428781928449234, + "learning_rate": 1.021174495356393e-06, + "loss": 0.9164, + "step": 5067 + }, + { + "epoch": 0.8593471810089021, + "grad_norm": 0.989405194899477, + "learning_rate": 1.018757948861231e-06, + "loss": 0.9331, + "step": 5068 + }, + { + "epoch": 0.8595167443832132, + "grad_norm": 0.6407133517194157, + "learning_rate": 1.0163441115848506e-06, + "loss": 0.7512, + "step": 5069 + }, + { + "epoch": 0.8596863077575244, + "grad_norm": 1.0326855127397307, + "learning_rate": 1.013932984255397e-06, + "loss": 0.9031, + "step": 5070 + }, + { + "epoch": 0.8598558711318355, + "grad_norm": 0.932399265546685, + "learning_rate": 1.0115245676001917e-06, + "loss": 0.8816, + "step": 5071 + }, + { + "epoch": 0.8600254345061467, + "grad_norm": 0.9800511945841957, + "learning_rate": 1.0091188623457415e-06, + "loss": 0.9236, + "step": 5072 + }, + { + "epoch": 0.8601949978804578, + "grad_norm": 0.9746683454341022, + "learning_rate": 1.0067158692177325e-06, + "loss": 0.9453, + "step": 5073 + }, + { + "epoch": 0.860364561254769, + "grad_norm": 0.9955081226460942, + "learning_rate": 1.0043155889410382e-06, + "loss": 0.9577, + "step": 5074 + }, + { + "epoch": 0.8605341246290801, + "grad_norm": 0.9583981958171508, + "learning_rate": 1.0019180222397095e-06, + "loss": 0.9084, + "step": 5075 + }, + { + "epoch": 0.8607036880033913, + "grad_norm": 1.0317375977263978, + "learning_rate": 9.995231698369789e-07, + "loss": 0.971, + "step": 5076 + }, + { + "epoch": 0.8608732513777024, + "grad_norm": 0.9806402206391475, + "learning_rate": 9.971310324552597e-07, + "loss": 0.9334, + "step": 5077 + }, + { + "epoch": 0.8610428147520136, + "grad_norm": 0.9443382406010195, + "learning_rate": 9.9474161081615e-07, + "loss": 0.9284, + "step": 5078 + }, + { + "epoch": 0.8612123781263247, + "grad_norm": 0.9666104446729988, + "learning_rate": 9.923549056404247e-07, + "loss": 0.9268, + "step": 5079 + }, + { + "epoch": 0.8613819415006359, + "grad_norm": 0.9467338251715124, + "learning_rate": 9.899709176480398e-07, + "loss": 0.9227, + "step": 5080 + }, + { + "epoch": 0.861551504874947, + "grad_norm": 1.0284768922078236, + "learning_rate": 9.87589647558135e-07, + "loss": 0.9551, + "step": 5081 + }, + { + "epoch": 0.8617210682492582, + "grad_norm": 0.9733707674390222, + "learning_rate": 9.852110960890248e-07, + "loss": 0.9287, + "step": 5082 + }, + { + "epoch": 0.8618906316235693, + "grad_norm": 0.947050357571709, + "learning_rate": 9.828352639582073e-07, + "loss": 0.8879, + "step": 5083 + }, + { + "epoch": 0.8620601949978804, + "grad_norm": 0.9739477936521534, + "learning_rate": 9.804621518823587e-07, + "loss": 0.9124, + "step": 5084 + }, + { + "epoch": 0.8622297583721916, + "grad_norm": 1.0018942714344834, + "learning_rate": 9.780917605773376e-07, + "loss": 0.9376, + "step": 5085 + }, + { + "epoch": 0.8623993217465028, + "grad_norm": 0.9527752592911204, + "learning_rate": 9.75724090758179e-07, + "loss": 0.9055, + "step": 5086 + }, + { + "epoch": 0.8625688851208139, + "grad_norm": 0.9535126539181107, + "learning_rate": 9.733591431390955e-07, + "loss": 0.947, + "step": 5087 + }, + { + "epoch": 0.862738448495125, + "grad_norm": 0.9359181190575948, + "learning_rate": 9.709969184334832e-07, + "loss": 0.9032, + "step": 5088 + }, + { + "epoch": 0.8629080118694362, + "grad_norm": 0.9482045494639502, + "learning_rate": 9.686374173539147e-07, + "loss": 0.897, + "step": 5089 + }, + { + "epoch": 0.8630775752437474, + "grad_norm": 0.9693326013304164, + "learning_rate": 9.662806406121383e-07, + "loss": 0.9336, + "step": 5090 + }, + { + "epoch": 0.8632471386180585, + "grad_norm": 0.9689238817165925, + "learning_rate": 9.63926588919083e-07, + "loss": 0.9541, + "step": 5091 + }, + { + "epoch": 0.8634167019923696, + "grad_norm": 0.9765267973915636, + "learning_rate": 9.615752629848574e-07, + "loss": 0.9308, + "step": 5092 + }, + { + "epoch": 0.8635862653666808, + "grad_norm": 0.9888109493786731, + "learning_rate": 9.592266635187464e-07, + "loss": 0.9416, + "step": 5093 + }, + { + "epoch": 0.863755828740992, + "grad_norm": 0.9655935379937343, + "learning_rate": 9.568807912292077e-07, + "loss": 0.8973, + "step": 5094 + }, + { + "epoch": 0.8639253921153031, + "grad_norm": 0.9803580597876733, + "learning_rate": 9.545376468238864e-07, + "loss": 0.9219, + "step": 5095 + }, + { + "epoch": 0.8640949554896142, + "grad_norm": 0.9851122596695164, + "learning_rate": 9.521972310095973e-07, + "loss": 0.9155, + "step": 5096 + }, + { + "epoch": 0.8642645188639254, + "grad_norm": 0.9432599969636443, + "learning_rate": 9.49859544492332e-07, + "loss": 0.9497, + "step": 5097 + }, + { + "epoch": 0.8644340822382366, + "grad_norm": 0.9592869670580622, + "learning_rate": 9.47524587977261e-07, + "loss": 0.9604, + "step": 5098 + }, + { + "epoch": 0.8646036456125477, + "grad_norm": 0.9842954513786379, + "learning_rate": 9.451923621687343e-07, + "loss": 0.9145, + "step": 5099 + }, + { + "epoch": 0.8647732089868588, + "grad_norm": 0.9605784169678584, + "learning_rate": 9.428628677702711e-07, + "loss": 0.9002, + "step": 5100 + }, + { + "epoch": 0.86494277236117, + "grad_norm": 0.9988090173366728, + "learning_rate": 9.40536105484573e-07, + "loss": 0.8995, + "step": 5101 + }, + { + "epoch": 0.8651123357354812, + "grad_norm": 0.9773795024933145, + "learning_rate": 9.382120760135128e-07, + "loss": 0.8731, + "step": 5102 + }, + { + "epoch": 0.8652818991097923, + "grad_norm": 0.9303532450179893, + "learning_rate": 9.358907800581418e-07, + "loss": 0.9161, + "step": 5103 + }, + { + "epoch": 0.8654514624841034, + "grad_norm": 0.9920428859111302, + "learning_rate": 9.335722183186868e-07, + "loss": 0.8877, + "step": 5104 + }, + { + "epoch": 0.8656210258584146, + "grad_norm": 0.9720520982774197, + "learning_rate": 9.312563914945461e-07, + "loss": 0.9076, + "step": 5105 + }, + { + "epoch": 0.8657905892327258, + "grad_norm": 0.9798397963625498, + "learning_rate": 9.289433002842996e-07, + "loss": 0.9176, + "step": 5106 + }, + { + "epoch": 0.8659601526070368, + "grad_norm": 0.9399891292967149, + "learning_rate": 9.266329453856959e-07, + "loss": 0.8936, + "step": 5107 + }, + { + "epoch": 0.866129715981348, + "grad_norm": 1.013982693981116, + "learning_rate": 9.24325327495662e-07, + "loss": 0.9356, + "step": 5108 + }, + { + "epoch": 0.8662992793556592, + "grad_norm": 0.9810885175628435, + "learning_rate": 9.220204473102945e-07, + "loss": 0.9247, + "step": 5109 + }, + { + "epoch": 0.8664688427299704, + "grad_norm": 1.0110672639347154, + "learning_rate": 9.197183055248726e-07, + "loss": 0.9541, + "step": 5110 + }, + { + "epoch": 0.8666384061042814, + "grad_norm": 0.9692478686835012, + "learning_rate": 9.174189028338421e-07, + "loss": 0.9109, + "step": 5111 + }, + { + "epoch": 0.8668079694785926, + "grad_norm": 0.9652561634163828, + "learning_rate": 9.151222399308213e-07, + "loss": 0.9135, + "step": 5112 + }, + { + "epoch": 0.8669775328529038, + "grad_norm": 0.9676494873901076, + "learning_rate": 9.128283175086106e-07, + "loss": 0.9315, + "step": 5113 + }, + { + "epoch": 0.867147096227215, + "grad_norm": 0.9777480948214932, + "learning_rate": 9.105371362591775e-07, + "loss": 0.9347, + "step": 5114 + }, + { + "epoch": 0.867316659601526, + "grad_norm": 0.9603568113276452, + "learning_rate": 9.082486968736614e-07, + "loss": 0.9142, + "step": 5115 + }, + { + "epoch": 0.8674862229758372, + "grad_norm": 0.9829871514212433, + "learning_rate": 9.05963000042378e-07, + "loss": 0.8821, + "step": 5116 + }, + { + "epoch": 0.8676557863501484, + "grad_norm": 1.029973578956744, + "learning_rate": 9.036800464548157e-07, + "loss": 0.9857, + "step": 5117 + }, + { + "epoch": 0.8678253497244596, + "grad_norm": 1.0101965327438291, + "learning_rate": 9.013998367996346e-07, + "loss": 0.9619, + "step": 5118 + }, + { + "epoch": 0.8679949130987706, + "grad_norm": 0.9583588451416015, + "learning_rate": 8.991223717646646e-07, + "loss": 0.9111, + "step": 5119 + }, + { + "epoch": 0.8681644764730818, + "grad_norm": 0.9717138900718927, + "learning_rate": 8.96847652036913e-07, + "loss": 0.8926, + "step": 5120 + }, + { + "epoch": 0.868334039847393, + "grad_norm": 0.9486978102703995, + "learning_rate": 8.945756783025528e-07, + "loss": 0.9216, + "step": 5121 + }, + { + "epoch": 0.8685036032217041, + "grad_norm": 0.9850946131914637, + "learning_rate": 8.923064512469326e-07, + "loss": 0.9203, + "step": 5122 + }, + { + "epoch": 0.8686731665960152, + "grad_norm": 0.9796159596066807, + "learning_rate": 8.900399715545715e-07, + "loss": 0.9228, + "step": 5123 + }, + { + "epoch": 0.8688427299703264, + "grad_norm": 0.9589229564601963, + "learning_rate": 8.877762399091616e-07, + "loss": 0.8848, + "step": 5124 + }, + { + "epoch": 0.8690122933446376, + "grad_norm": 0.9421268030153078, + "learning_rate": 8.855152569935632e-07, + "loss": 0.9155, + "step": 5125 + }, + { + "epoch": 0.8691818567189487, + "grad_norm": 0.9789323689720132, + "learning_rate": 8.832570234898086e-07, + "loss": 0.9476, + "step": 5126 + }, + { + "epoch": 0.8693514200932598, + "grad_norm": 0.9874458383880508, + "learning_rate": 8.810015400790994e-07, + "loss": 0.933, + "step": 5127 + }, + { + "epoch": 0.869520983467571, + "grad_norm": 0.9667220151280363, + "learning_rate": 8.787488074418116e-07, + "loss": 0.9425, + "step": 5128 + }, + { + "epoch": 0.8696905468418822, + "grad_norm": 0.9928975300421444, + "learning_rate": 8.76498826257488e-07, + "loss": 0.9369, + "step": 5129 + }, + { + "epoch": 0.8698601102161932, + "grad_norm": 0.9730032154260898, + "learning_rate": 8.742515972048404e-07, + "loss": 0.9198, + "step": 5130 + }, + { + "epoch": 0.8700296735905044, + "grad_norm": 0.9919648412287673, + "learning_rate": 8.720071209617542e-07, + "loss": 0.9111, + "step": 5131 + }, + { + "epoch": 0.8701992369648156, + "grad_norm": 0.9959727042773086, + "learning_rate": 8.697653982052834e-07, + "loss": 0.9337, + "step": 5132 + }, + { + "epoch": 0.8703688003391268, + "grad_norm": 1.1064447611436845, + "learning_rate": 8.675264296116481e-07, + "loss": 0.9342, + "step": 5133 + }, + { + "epoch": 0.8705383637134378, + "grad_norm": 0.9767851439552029, + "learning_rate": 8.652902158562382e-07, + "loss": 0.9473, + "step": 5134 + }, + { + "epoch": 0.870707927087749, + "grad_norm": 0.9338987396113887, + "learning_rate": 8.630567576136196e-07, + "loss": 0.8882, + "step": 5135 + }, + { + "epoch": 0.8708774904620602, + "grad_norm": 0.9600186868531667, + "learning_rate": 8.608260555575187e-07, + "loss": 0.9699, + "step": 5136 + }, + { + "epoch": 0.8710470538363714, + "grad_norm": 0.979677963133246, + "learning_rate": 8.585981103608343e-07, + "loss": 0.9064, + "step": 5137 + }, + { + "epoch": 0.8712166172106824, + "grad_norm": 0.9671268770018754, + "learning_rate": 8.563729226956318e-07, + "loss": 0.9273, + "step": 5138 + }, + { + "epoch": 0.8713861805849936, + "grad_norm": 0.9907340733200737, + "learning_rate": 8.541504932331468e-07, + "loss": 0.9414, + "step": 5139 + }, + { + "epoch": 0.8715557439593048, + "grad_norm": 1.0260306143590605, + "learning_rate": 8.519308226437806e-07, + "loss": 0.9356, + "step": 5140 + }, + { + "epoch": 0.871725307333616, + "grad_norm": 0.9511893600174607, + "learning_rate": 8.497139115971031e-07, + "loss": 0.8998, + "step": 5141 + }, + { + "epoch": 0.871894870707927, + "grad_norm": 0.9804482693550187, + "learning_rate": 8.474997607618551e-07, + "loss": 0.9382, + "step": 5142 + }, + { + "epoch": 0.8720644340822382, + "grad_norm": 0.9472559062781107, + "learning_rate": 8.4528837080594e-07, + "loss": 0.9288, + "step": 5143 + }, + { + "epoch": 0.8722339974565494, + "grad_norm": 0.9571216055207906, + "learning_rate": 8.43079742396431e-07, + "loss": 0.9362, + "step": 5144 + }, + { + "epoch": 0.8724035608308606, + "grad_norm": 1.0118548847309516, + "learning_rate": 8.40873876199565e-07, + "loss": 0.9261, + "step": 5145 + }, + { + "epoch": 0.8725731242051716, + "grad_norm": 0.9780578968409174, + "learning_rate": 8.386707728807509e-07, + "loss": 0.9351, + "step": 5146 + }, + { + "epoch": 0.8727426875794828, + "grad_norm": 0.9709627366549358, + "learning_rate": 8.364704331045614e-07, + "loss": 0.8931, + "step": 5147 + }, + { + "epoch": 0.872912250953794, + "grad_norm": 0.9812281645275513, + "learning_rate": 8.342728575347336e-07, + "loss": 0.9241, + "step": 5148 + }, + { + "epoch": 0.8730818143281052, + "grad_norm": 0.64539157697727, + "learning_rate": 8.320780468341761e-07, + "loss": 0.766, + "step": 5149 + }, + { + "epoch": 0.8732513777024162, + "grad_norm": 0.9848845391928729, + "learning_rate": 8.29886001664958e-07, + "loss": 0.9632, + "step": 5150 + }, + { + "epoch": 0.8734209410767274, + "grad_norm": 0.9780947487480185, + "learning_rate": 8.276967226883159e-07, + "loss": 0.8723, + "step": 5151 + }, + { + "epoch": 0.8735905044510386, + "grad_norm": 0.9598900846428129, + "learning_rate": 8.255102105646517e-07, + "loss": 0.9218, + "step": 5152 + }, + { + "epoch": 0.8737600678253498, + "grad_norm": 1.021072062223685, + "learning_rate": 8.233264659535367e-07, + "loss": 0.9441, + "step": 5153 + }, + { + "epoch": 0.8739296311996608, + "grad_norm": 0.9608923903830944, + "learning_rate": 8.211454895137027e-07, + "loss": 0.9016, + "step": 5154 + }, + { + "epoch": 0.874099194573972, + "grad_norm": 0.977958864749516, + "learning_rate": 8.189672819030459e-07, + "loss": 0.9292, + "step": 5155 + }, + { + "epoch": 0.8742687579482832, + "grad_norm": 0.9993933430233476, + "learning_rate": 8.167918437786316e-07, + "loss": 0.9779, + "step": 5156 + }, + { + "epoch": 0.8744383213225944, + "grad_norm": 0.985386215617427, + "learning_rate": 8.146191757966859e-07, + "loss": 0.9111, + "step": 5157 + }, + { + "epoch": 0.8746078846969054, + "grad_norm": 0.9521532282748394, + "learning_rate": 8.124492786126015e-07, + "loss": 0.8809, + "step": 5158 + }, + { + "epoch": 0.8747774480712166, + "grad_norm": 0.9340203567379062, + "learning_rate": 8.102821528809324e-07, + "loss": 0.9037, + "step": 5159 + }, + { + "epoch": 0.8749470114455278, + "grad_norm": 0.9798508301080222, + "learning_rate": 8.081177992554012e-07, + "loss": 0.9144, + "step": 5160 + }, + { + "epoch": 0.875116574819839, + "grad_norm": 1.00522929453304, + "learning_rate": 8.059562183888903e-07, + "loss": 0.9514, + "step": 5161 + }, + { + "epoch": 0.87528613819415, + "grad_norm": 0.9701294235974999, + "learning_rate": 8.037974109334478e-07, + "loss": 0.8929, + "step": 5162 + }, + { + "epoch": 0.8754557015684612, + "grad_norm": 0.9272003059920836, + "learning_rate": 8.016413775402832e-07, + "loss": 0.8847, + "step": 5163 + }, + { + "epoch": 0.8756252649427724, + "grad_norm": 0.9518751650123822, + "learning_rate": 7.994881188597726e-07, + "loss": 0.9088, + "step": 5164 + }, + { + "epoch": 0.8757948283170836, + "grad_norm": 1.0291401781770853, + "learning_rate": 7.97337635541452e-07, + "loss": 0.9597, + "step": 5165 + }, + { + "epoch": 0.8759643916913946, + "grad_norm": 0.958568782022767, + "learning_rate": 7.951899282340192e-07, + "loss": 0.919, + "step": 5166 + }, + { + "epoch": 0.8761339550657058, + "grad_norm": 1.065434479117011, + "learning_rate": 7.930449975853405e-07, + "loss": 0.9478, + "step": 5167 + }, + { + "epoch": 0.876303518440017, + "grad_norm": 1.0037286433260262, + "learning_rate": 7.909028442424383e-07, + "loss": 0.9597, + "step": 5168 + }, + { + "epoch": 0.8764730818143281, + "grad_norm": 1.0186036681404256, + "learning_rate": 7.887634688515e-07, + "loss": 0.9694, + "step": 5169 + }, + { + "epoch": 0.8766426451886392, + "grad_norm": 1.0100752447623533, + "learning_rate": 7.866268720578718e-07, + "loss": 0.9365, + "step": 5170 + }, + { + "epoch": 0.8768122085629504, + "grad_norm": 0.9728314290647561, + "learning_rate": 7.844930545060703e-07, + "loss": 0.9076, + "step": 5171 + }, + { + "epoch": 0.8769817719372616, + "grad_norm": 1.0345615727910933, + "learning_rate": 7.82362016839765e-07, + "loss": 0.936, + "step": 5172 + }, + { + "epoch": 0.8771513353115727, + "grad_norm": 0.999284035088074, + "learning_rate": 7.802337597017895e-07, + "loss": 0.9656, + "step": 5173 + }, + { + "epoch": 0.8773208986858838, + "grad_norm": 0.9323629990420651, + "learning_rate": 7.781082837341403e-07, + "loss": 0.9112, + "step": 5174 + }, + { + "epoch": 0.877490462060195, + "grad_norm": 1.0052745558685974, + "learning_rate": 7.759855895779711e-07, + "loss": 0.915, + "step": 5175 + }, + { + "epoch": 0.8776600254345062, + "grad_norm": 0.9368470802114017, + "learning_rate": 7.73865677873602e-07, + "loss": 0.9212, + "step": 5176 + }, + { + "epoch": 0.8778295888088173, + "grad_norm": 1.013384639361475, + "learning_rate": 7.71748549260507e-07, + "loss": 0.9257, + "step": 5177 + }, + { + "epoch": 0.8779991521831284, + "grad_norm": 0.9830292976705274, + "learning_rate": 7.696342043773297e-07, + "loss": 0.9032, + "step": 5178 + }, + { + "epoch": 0.8781687155574396, + "grad_norm": 1.0208422638851524, + "learning_rate": 7.675226438618643e-07, + "loss": 0.9308, + "step": 5179 + }, + { + "epoch": 0.8783382789317508, + "grad_norm": 1.0537077462192097, + "learning_rate": 7.654138683510715e-07, + "loss": 0.8913, + "step": 5180 + }, + { + "epoch": 0.8785078423060619, + "grad_norm": 0.9648511967871872, + "learning_rate": 7.633078784810666e-07, + "loss": 0.9242, + "step": 5181 + }, + { + "epoch": 0.878677405680373, + "grad_norm": 0.9817768262742032, + "learning_rate": 7.612046748871327e-07, + "loss": 0.9234, + "step": 5182 + }, + { + "epoch": 0.8788469690546842, + "grad_norm": 0.9904457207777576, + "learning_rate": 7.591042582037055e-07, + "loss": 0.9616, + "step": 5183 + }, + { + "epoch": 0.8790165324289954, + "grad_norm": 0.9628743537930682, + "learning_rate": 7.570066290643784e-07, + "loss": 0.868, + "step": 5184 + }, + { + "epoch": 0.8791860958033065, + "grad_norm": 0.9744800234184157, + "learning_rate": 7.549117881019141e-07, + "loss": 0.9362, + "step": 5185 + }, + { + "epoch": 0.8793556591776176, + "grad_norm": 0.9477881467648516, + "learning_rate": 7.528197359482237e-07, + "loss": 0.8884, + "step": 5186 + }, + { + "epoch": 0.8795252225519288, + "grad_norm": 0.977958023644168, + "learning_rate": 7.507304732343823e-07, + "loss": 0.9039, + "step": 5187 + }, + { + "epoch": 0.87969478592624, + "grad_norm": 0.9934082532115938, + "learning_rate": 7.486440005906193e-07, + "loss": 0.8932, + "step": 5188 + }, + { + "epoch": 0.8798643493005511, + "grad_norm": 0.9913470365558636, + "learning_rate": 7.465603186463299e-07, + "loss": 0.9268, + "step": 5189 + }, + { + "epoch": 0.8800339126748622, + "grad_norm": 1.1338060063614095, + "learning_rate": 7.444794280300605e-07, + "loss": 0.9099, + "step": 5190 + }, + { + "epoch": 0.8802034760491734, + "grad_norm": 1.016483344725631, + "learning_rate": 7.424013293695199e-07, + "loss": 0.9671, + "step": 5191 + }, + { + "epoch": 0.8803730394234845, + "grad_norm": 0.9714348803599815, + "learning_rate": 7.40326023291571e-07, + "loss": 0.9025, + "step": 5192 + }, + { + "epoch": 0.8805426027977957, + "grad_norm": 1.0242147478811012, + "learning_rate": 7.382535104222366e-07, + "loss": 0.925, + "step": 5193 + }, + { + "epoch": 0.8807121661721068, + "grad_norm": 1.004007701099195, + "learning_rate": 7.361837913866965e-07, + "loss": 0.96, + "step": 5194 + }, + { + "epoch": 0.880881729546418, + "grad_norm": 0.9253379131107716, + "learning_rate": 7.341168668092857e-07, + "loss": 0.9227, + "step": 5195 + }, + { + "epoch": 0.8810512929207291, + "grad_norm": 0.956403367681735, + "learning_rate": 7.320527373135023e-07, + "loss": 0.8884, + "step": 5196 + }, + { + "epoch": 0.8812208562950403, + "grad_norm": 0.9716111779630627, + "learning_rate": 7.299914035219957e-07, + "loss": 0.9487, + "step": 5197 + }, + { + "epoch": 0.8813904196693514, + "grad_norm": 0.951417939782792, + "learning_rate": 7.279328660565721e-07, + "loss": 0.9048, + "step": 5198 + }, + { + "epoch": 0.8815599830436626, + "grad_norm": 0.9894361274107053, + "learning_rate": 7.258771255381947e-07, + "loss": 0.9306, + "step": 5199 + }, + { + "epoch": 0.8817295464179737, + "grad_norm": 0.9617427817825863, + "learning_rate": 7.238241825869885e-07, + "loss": 0.9272, + "step": 5200 + }, + { + "epoch": 0.8818991097922849, + "grad_norm": 1.022314754342777, + "learning_rate": 7.21774037822226e-07, + "loss": 0.9343, + "step": 5201 + }, + { + "epoch": 0.882068673166596, + "grad_norm": 0.962593647347168, + "learning_rate": 7.197266918623392e-07, + "loss": 0.921, + "step": 5202 + }, + { + "epoch": 0.8822382365409072, + "grad_norm": 0.9598241917809326, + "learning_rate": 7.176821453249183e-07, + "loss": 0.9348, + "step": 5203 + }, + { + "epoch": 0.8824077999152183, + "grad_norm": 0.9780403951496743, + "learning_rate": 7.156403988267069e-07, + "loss": 0.9336, + "step": 5204 + }, + { + "epoch": 0.8825773632895295, + "grad_norm": 0.9685610535137948, + "learning_rate": 7.136014529836033e-07, + "loss": 0.9473, + "step": 5205 + }, + { + "epoch": 0.8827469266638406, + "grad_norm": 0.9628777695022261, + "learning_rate": 7.115653084106599e-07, + "loss": 0.881, + "step": 5206 + }, + { + "epoch": 0.8829164900381518, + "grad_norm": 0.996109948351507, + "learning_rate": 7.09531965722089e-07, + "loss": 0.9526, + "step": 5207 + }, + { + "epoch": 0.8830860534124629, + "grad_norm": 0.9823749419932903, + "learning_rate": 7.07501425531254e-07, + "loss": 0.9122, + "step": 5208 + }, + { + "epoch": 0.8832556167867741, + "grad_norm": 1.001868883864078, + "learning_rate": 7.054736884506718e-07, + "loss": 0.9054, + "step": 5209 + }, + { + "epoch": 0.8834251801610852, + "grad_norm": 0.9685851510283865, + "learning_rate": 7.034487550920166e-07, + "loss": 0.9434, + "step": 5210 + }, + { + "epoch": 0.8835947435353964, + "grad_norm": 1.0012563259655243, + "learning_rate": 7.014266260661151e-07, + "loss": 0.939, + "step": 5211 + }, + { + "epoch": 0.8837643069097075, + "grad_norm": 0.9514074295961853, + "learning_rate": 6.994073019829483e-07, + "loss": 0.8785, + "step": 5212 + }, + { + "epoch": 0.8839338702840186, + "grad_norm": 1.0073062747137473, + "learning_rate": 6.973907834516513e-07, + "loss": 0.9067, + "step": 5213 + }, + { + "epoch": 0.8841034336583298, + "grad_norm": 0.9832678015341291, + "learning_rate": 6.953770710805141e-07, + "loss": 0.9214, + "step": 5214 + }, + { + "epoch": 0.884272997032641, + "grad_norm": 0.9755433882119658, + "learning_rate": 6.933661654769797e-07, + "loss": 0.9342, + "step": 5215 + }, + { + "epoch": 0.8844425604069521, + "grad_norm": 0.9276341405930084, + "learning_rate": 6.913580672476428e-07, + "loss": 0.8886, + "step": 5216 + }, + { + "epoch": 0.8846121237812632, + "grad_norm": 1.0060530108779318, + "learning_rate": 6.8935277699825e-07, + "loss": 0.9173, + "step": 5217 + }, + { + "epoch": 0.8847816871555744, + "grad_norm": 0.9856993523135025, + "learning_rate": 6.873502953337075e-07, + "loss": 0.9448, + "step": 5218 + }, + { + "epoch": 0.8849512505298855, + "grad_norm": 0.9539331742318515, + "learning_rate": 6.853506228580675e-07, + "loss": 0.9397, + "step": 5219 + }, + { + "epoch": 0.8851208139041967, + "grad_norm": 1.0063102143893765, + "learning_rate": 6.833537601745366e-07, + "loss": 0.9473, + "step": 5220 + }, + { + "epoch": 0.8852903772785078, + "grad_norm": 0.9765543547723605, + "learning_rate": 6.813597078854772e-07, + "loss": 0.9238, + "step": 5221 + }, + { + "epoch": 0.885459940652819, + "grad_norm": 0.9780907928921038, + "learning_rate": 6.793684665923983e-07, + "loss": 0.9256, + "step": 5222 + }, + { + "epoch": 0.8856295040271301, + "grad_norm": 0.9878822235226608, + "learning_rate": 6.77380036895966e-07, + "loss": 0.936, + "step": 5223 + }, + { + "epoch": 0.8857990674014413, + "grad_norm": 0.9504494205405626, + "learning_rate": 6.753944193959938e-07, + "loss": 0.8783, + "step": 5224 + }, + { + "epoch": 0.8859686307757524, + "grad_norm": 0.9790420113619818, + "learning_rate": 6.734116146914516e-07, + "loss": 0.909, + "step": 5225 + }, + { + "epoch": 0.8861381941500636, + "grad_norm": 0.973751204979467, + "learning_rate": 6.714316233804574e-07, + "loss": 0.9311, + "step": 5226 + }, + { + "epoch": 0.8863077575243747, + "grad_norm": 0.9795812048593907, + "learning_rate": 6.694544460602825e-07, + "loss": 0.9186, + "step": 5227 + }, + { + "epoch": 0.8864773208986859, + "grad_norm": 1.0096764097554634, + "learning_rate": 6.674800833273465e-07, + "loss": 0.9566, + "step": 5228 + }, + { + "epoch": 0.886646884272997, + "grad_norm": 0.9637532340329327, + "learning_rate": 6.655085357772229e-07, + "loss": 0.9148, + "step": 5229 + }, + { + "epoch": 0.8868164476473082, + "grad_norm": 1.016729573317816, + "learning_rate": 6.635398040046348e-07, + "loss": 0.9572, + "step": 5230 + }, + { + "epoch": 0.8869860110216193, + "grad_norm": 0.9922862214584582, + "learning_rate": 6.615738886034551e-07, + "loss": 0.9217, + "step": 5231 + }, + { + "epoch": 0.8871555743959305, + "grad_norm": 0.641080241759164, + "learning_rate": 6.596107901667103e-07, + "loss": 0.7823, + "step": 5232 + }, + { + "epoch": 0.8873251377702416, + "grad_norm": 0.6028825955841568, + "learning_rate": 6.576505092865748e-07, + "loss": 0.7256, + "step": 5233 + }, + { + "epoch": 0.8874947011445528, + "grad_norm": 1.024517253575578, + "learning_rate": 6.556930465543709e-07, + "loss": 0.9343, + "step": 5234 + }, + { + "epoch": 0.8876642645188639, + "grad_norm": 1.0159189030447726, + "learning_rate": 6.537384025605742e-07, + "loss": 0.9337, + "step": 5235 + }, + { + "epoch": 0.8878338278931751, + "grad_norm": 0.954881906372445, + "learning_rate": 6.517865778948108e-07, + "loss": 0.9197, + "step": 5236 + }, + { + "epoch": 0.8880033912674862, + "grad_norm": 0.9641775202736786, + "learning_rate": 6.498375731458529e-07, + "loss": 0.9222, + "step": 5237 + }, + { + "epoch": 0.8881729546417974, + "grad_norm": 0.9943370339371903, + "learning_rate": 6.478913889016214e-07, + "loss": 0.9284, + "step": 5238 + }, + { + "epoch": 0.8883425180161085, + "grad_norm": 1.0166260018741513, + "learning_rate": 6.459480257491935e-07, + "loss": 0.9368, + "step": 5239 + }, + { + "epoch": 0.8885120813904197, + "grad_norm": 0.9602770168078034, + "learning_rate": 6.440074842747879e-07, + "loss": 0.9622, + "step": 5240 + }, + { + "epoch": 0.8886816447647308, + "grad_norm": 1.011185471489416, + "learning_rate": 6.420697650637753e-07, + "loss": 0.9432, + "step": 5241 + }, + { + "epoch": 0.888851208139042, + "grad_norm": 0.9692960221110054, + "learning_rate": 6.401348687006725e-07, + "loss": 0.9108, + "step": 5242 + }, + { + "epoch": 0.8890207715133531, + "grad_norm": 0.9768566773025212, + "learning_rate": 6.382027957691506e-07, + "loss": 0.9209, + "step": 5243 + }, + { + "epoch": 0.8891903348876643, + "grad_norm": 0.9360427922861517, + "learning_rate": 6.362735468520232e-07, + "loss": 0.8772, + "step": 5244 + }, + { + "epoch": 0.8893598982619754, + "grad_norm": 1.012193441894449, + "learning_rate": 6.343471225312536e-07, + "loss": 0.9382, + "step": 5245 + }, + { + "epoch": 0.8895294616362865, + "grad_norm": 0.9683443376292931, + "learning_rate": 6.324235233879539e-07, + "loss": 0.9225, + "step": 5246 + }, + { + "epoch": 0.8896990250105977, + "grad_norm": 0.9850483600082118, + "learning_rate": 6.305027500023841e-07, + "loss": 0.9136, + "step": 5247 + }, + { + "epoch": 0.8898685883849089, + "grad_norm": 0.6518534506406859, + "learning_rate": 6.28584802953951e-07, + "loss": 0.7962, + "step": 5248 + }, + { + "epoch": 0.89003815175922, + "grad_norm": 0.9831450716270983, + "learning_rate": 6.266696828212071e-07, + "loss": 0.922, + "step": 5249 + }, + { + "epoch": 0.8902077151335311, + "grad_norm": 0.9821689684049345, + "learning_rate": 6.247573901818571e-07, + "loss": 0.9236, + "step": 5250 + }, + { + "epoch": 0.8903772785078423, + "grad_norm": 0.9812600102435868, + "learning_rate": 6.228479256127495e-07, + "loss": 0.95, + "step": 5251 + }, + { + "epoch": 0.8905468418821535, + "grad_norm": 0.9570220059341775, + "learning_rate": 6.209412896898792e-07, + "loss": 0.9075, + "step": 5252 + }, + { + "epoch": 0.8907164052564646, + "grad_norm": 0.6327268333559012, + "learning_rate": 6.190374829883883e-07, + "loss": 0.7797, + "step": 5253 + }, + { + "epoch": 0.8908859686307757, + "grad_norm": 0.8919280776056981, + "learning_rate": 6.171365060825674e-07, + "loss": 0.8814, + "step": 5254 + }, + { + "epoch": 0.8910555320050869, + "grad_norm": 0.9818274474964976, + "learning_rate": 6.1523835954585e-07, + "loss": 0.9506, + "step": 5255 + }, + { + "epoch": 0.8912250953793981, + "grad_norm": 1.003481521128498, + "learning_rate": 6.133430439508181e-07, + "loss": 0.9569, + "step": 5256 + }, + { + "epoch": 0.8913946587537092, + "grad_norm": 0.9938198852653174, + "learning_rate": 6.114505598692011e-07, + "loss": 0.9067, + "step": 5257 + }, + { + "epoch": 0.8915642221280203, + "grad_norm": 0.9710700454374221, + "learning_rate": 6.095609078718712e-07, + "loss": 0.928, + "step": 5258 + }, + { + "epoch": 0.8917337855023315, + "grad_norm": 1.015557959628636, + "learning_rate": 6.076740885288479e-07, + "loss": 0.9796, + "step": 5259 + }, + { + "epoch": 0.8919033488766427, + "grad_norm": 0.992793079506282, + "learning_rate": 6.057901024092949e-07, + "loss": 0.9007, + "step": 5260 + }, + { + "epoch": 0.8920729122509538, + "grad_norm": 0.9668869791624148, + "learning_rate": 6.039089500815243e-07, + "loss": 0.917, + "step": 5261 + }, + { + "epoch": 0.8922424756252649, + "grad_norm": 1.017636019801052, + "learning_rate": 6.02030632112991e-07, + "loss": 0.9146, + "step": 5262 + }, + { + "epoch": 0.8924120389995761, + "grad_norm": 0.9179743512419041, + "learning_rate": 6.001551490702939e-07, + "loss": 0.9005, + "step": 5263 + }, + { + "epoch": 0.8925816023738873, + "grad_norm": 1.0086567042742047, + "learning_rate": 5.982825015191785e-07, + "loss": 0.9155, + "step": 5264 + }, + { + "epoch": 0.8927511657481983, + "grad_norm": 0.9323295563038754, + "learning_rate": 5.964126900245359e-07, + "loss": 0.8926, + "step": 5265 + }, + { + "epoch": 0.8929207291225095, + "grad_norm": 0.9946118197879867, + "learning_rate": 5.945457151503986e-07, + "loss": 0.8892, + "step": 5266 + }, + { + "epoch": 0.8930902924968207, + "grad_norm": 0.9708677222069694, + "learning_rate": 5.926815774599449e-07, + "loss": 0.9008, + "step": 5267 + }, + { + "epoch": 0.8932598558711319, + "grad_norm": 0.958213429980109, + "learning_rate": 5.908202775155003e-07, + "loss": 0.9204, + "step": 5268 + }, + { + "epoch": 0.893429419245443, + "grad_norm": 1.0216725181792323, + "learning_rate": 5.889618158785304e-07, + "loss": 0.9326, + "step": 5269 + }, + { + "epoch": 0.8935989826197541, + "grad_norm": 0.952656794617966, + "learning_rate": 5.871061931096445e-07, + "loss": 0.9319, + "step": 5270 + }, + { + "epoch": 0.8937685459940653, + "grad_norm": 0.961869827435826, + "learning_rate": 5.852534097685958e-07, + "loss": 0.9364, + "step": 5271 + }, + { + "epoch": 0.8939381093683765, + "grad_norm": 0.9528223691943507, + "learning_rate": 5.834034664142862e-07, + "loss": 0.9051, + "step": 5272 + }, + { + "epoch": 0.8941076727426875, + "grad_norm": 1.0023317416333115, + "learning_rate": 5.815563636047539e-07, + "loss": 0.9245, + "step": 5273 + }, + { + "epoch": 0.8942772361169987, + "grad_norm": 0.9532275624838807, + "learning_rate": 5.797121018971818e-07, + "loss": 0.9126, + "step": 5274 + }, + { + "epoch": 0.8944467994913099, + "grad_norm": 1.0021281624248375, + "learning_rate": 5.778706818479007e-07, + "loss": 0.954, + "step": 5275 + }, + { + "epoch": 0.8946163628656211, + "grad_norm": 0.6564871789560096, + "learning_rate": 5.760321040123784e-07, + "loss": 0.7657, + "step": 5276 + }, + { + "epoch": 0.8947859262399321, + "grad_norm": 1.023635057461732, + "learning_rate": 5.741963689452268e-07, + "loss": 0.9213, + "step": 5277 + }, + { + "epoch": 0.8949554896142433, + "grad_norm": 1.0119325135444848, + "learning_rate": 5.723634772002007e-07, + "loss": 0.9381, + "step": 5278 + }, + { + "epoch": 0.8951250529885545, + "grad_norm": 0.9635630471053847, + "learning_rate": 5.705334293302e-07, + "loss": 0.9503, + "step": 5279 + }, + { + "epoch": 0.8952946163628657, + "grad_norm": 1.0764490539676979, + "learning_rate": 5.687062258872622e-07, + "loss": 0.917, + "step": 5280 + }, + { + "epoch": 0.8954641797371767, + "grad_norm": 1.002630091769928, + "learning_rate": 5.668818674225684e-07, + "loss": 0.9354, + "step": 5281 + }, + { + "epoch": 0.8956337431114879, + "grad_norm": 0.9516109984492871, + "learning_rate": 5.65060354486443e-07, + "loss": 0.933, + "step": 5282 + }, + { + "epoch": 0.8958033064857991, + "grad_norm": 1.036804485805387, + "learning_rate": 5.632416876283508e-07, + "loss": 0.933, + "step": 5283 + }, + { + "epoch": 0.8959728698601103, + "grad_norm": 0.9632250556881334, + "learning_rate": 5.614258673968976e-07, + "loss": 0.9326, + "step": 5284 + }, + { + "epoch": 0.8961424332344213, + "grad_norm": 0.9536898853698114, + "learning_rate": 5.596128943398316e-07, + "loss": 0.9307, + "step": 5285 + }, + { + "epoch": 0.8963119966087325, + "grad_norm": 1.0056816957816208, + "learning_rate": 5.578027690040411e-07, + "loss": 0.9385, + "step": 5286 + }, + { + "epoch": 0.8964815599830437, + "grad_norm": 0.9543181101782496, + "learning_rate": 5.559954919355559e-07, + "loss": 0.9253, + "step": 5287 + }, + { + "epoch": 0.8966511233573549, + "grad_norm": 0.9569725968159479, + "learning_rate": 5.541910636795455e-07, + "loss": 0.8771, + "step": 5288 + }, + { + "epoch": 0.8968206867316659, + "grad_norm": 0.9639028215842664, + "learning_rate": 5.523894847803235e-07, + "loss": 0.9487, + "step": 5289 + }, + { + "epoch": 0.8969902501059771, + "grad_norm": 0.9527339518093475, + "learning_rate": 5.505907557813395e-07, + "loss": 0.9363, + "step": 5290 + }, + { + "epoch": 0.8971598134802883, + "grad_norm": 0.9820282129080463, + "learning_rate": 5.487948772251872e-07, + "loss": 0.9138, + "step": 5291 + }, + { + "epoch": 0.8973293768545995, + "grad_norm": 0.9451089806125501, + "learning_rate": 5.470018496535967e-07, + "loss": 0.9094, + "step": 5292 + }, + { + "epoch": 0.8974989402289105, + "grad_norm": 0.9696214107428724, + "learning_rate": 5.452116736074431e-07, + "loss": 0.921, + "step": 5293 + }, + { + "epoch": 0.8976685036032217, + "grad_norm": 0.9836080700131032, + "learning_rate": 5.434243496267366e-07, + "loss": 0.9224, + "step": 5294 + }, + { + "epoch": 0.8978380669775329, + "grad_norm": 0.9570644176853008, + "learning_rate": 5.416398782506294e-07, + "loss": 0.8912, + "step": 5295 + }, + { + "epoch": 0.898007630351844, + "grad_norm": 0.9692392647198594, + "learning_rate": 5.398582600174107e-07, + "loss": 0.8828, + "step": 5296 + }, + { + "epoch": 0.8981771937261551, + "grad_norm": 0.932679314652526, + "learning_rate": 5.380794954645141e-07, + "loss": 0.9036, + "step": 5297 + }, + { + "epoch": 0.8983467571004663, + "grad_norm": 0.958945221366242, + "learning_rate": 5.363035851285081e-07, + "loss": 0.9262, + "step": 5298 + }, + { + "epoch": 0.8985163204747775, + "grad_norm": 1.054896570874422, + "learning_rate": 5.345305295450997e-07, + "loss": 0.9603, + "step": 5299 + }, + { + "epoch": 0.8986858838490887, + "grad_norm": 0.9853113013458369, + "learning_rate": 5.32760329249139e-07, + "loss": 0.9173, + "step": 5300 + }, + { + "epoch": 0.8988554472233997, + "grad_norm": 0.9721358648701095, + "learning_rate": 5.30992984774612e-07, + "loss": 0.95, + "step": 5301 + }, + { + "epoch": 0.8990250105977109, + "grad_norm": 0.9728220144867263, + "learning_rate": 5.292284966546424e-07, + "loss": 0.8987, + "step": 5302 + }, + { + "epoch": 0.8991945739720221, + "grad_norm": 0.9387027892380185, + "learning_rate": 5.274668654214931e-07, + "loss": 0.9396, + "step": 5303 + }, + { + "epoch": 0.8993641373463331, + "grad_norm": 0.9696021367192991, + "learning_rate": 5.257080916065671e-07, + "loss": 0.9732, + "step": 5304 + }, + { + "epoch": 0.8995337007206443, + "grad_norm": 0.9995815736940336, + "learning_rate": 5.23952175740402e-07, + "loss": 0.9208, + "step": 5305 + }, + { + "epoch": 0.8997032640949555, + "grad_norm": 0.9381786629999485, + "learning_rate": 5.221991183526753e-07, + "loss": 0.9574, + "step": 5306 + }, + { + "epoch": 0.8998728274692667, + "grad_norm": 0.6804674425166404, + "learning_rate": 5.20448919972204e-07, + "loss": 0.7873, + "step": 5307 + }, + { + "epoch": 0.9000423908435777, + "grad_norm": 0.9255734150491965, + "learning_rate": 5.187015811269391e-07, + "loss": 0.8995, + "step": 5308 + }, + { + "epoch": 0.9002119542178889, + "grad_norm": 0.9262934754730586, + "learning_rate": 5.169571023439712e-07, + "loss": 0.8953, + "step": 5309 + }, + { + "epoch": 0.9003815175922001, + "grad_norm": 0.9726524956698935, + "learning_rate": 5.152154841495249e-07, + "loss": 0.9565, + "step": 5310 + }, + { + "epoch": 0.9005510809665113, + "grad_norm": 0.9686538984864316, + "learning_rate": 5.1347672706897e-07, + "loss": 0.9071, + "step": 5311 + }, + { + "epoch": 0.9007206443408223, + "grad_norm": 0.9461515617358507, + "learning_rate": 5.117408316268047e-07, + "loss": 0.8926, + "step": 5312 + }, + { + "epoch": 0.9008902077151335, + "grad_norm": 0.9679127852350202, + "learning_rate": 5.100077983466667e-07, + "loss": 0.9066, + "step": 5313 + }, + { + "epoch": 0.9010597710894447, + "grad_norm": 0.9769173863195825, + "learning_rate": 5.08277627751329e-07, + "loss": 0.9172, + "step": 5314 + }, + { + "epoch": 0.9012293344637559, + "grad_norm": 0.9313000820685344, + "learning_rate": 5.065503203627076e-07, + "loss": 0.8606, + "step": 5315 + }, + { + "epoch": 0.9013988978380669, + "grad_norm": 0.9900429787673576, + "learning_rate": 5.048258767018477e-07, + "loss": 0.9348, + "step": 5316 + }, + { + "epoch": 0.9015684612123781, + "grad_norm": 0.6569017524250269, + "learning_rate": 5.031042972889311e-07, + "loss": 0.7979, + "step": 5317 + }, + { + "epoch": 0.9017380245866893, + "grad_norm": 0.9831181767384738, + "learning_rate": 5.013855826432801e-07, + "loss": 0.9077, + "step": 5318 + }, + { + "epoch": 0.9019075879610005, + "grad_norm": 0.9513029618092855, + "learning_rate": 4.996697332833489e-07, + "loss": 0.9179, + "step": 5319 + }, + { + "epoch": 0.9020771513353115, + "grad_norm": 1.0150485130948095, + "learning_rate": 4.979567497267302e-07, + "loss": 0.9319, + "step": 5320 + }, + { + "epoch": 0.9022467147096227, + "grad_norm": 1.0123890516368361, + "learning_rate": 4.962466324901483e-07, + "loss": 0.9312, + "step": 5321 + }, + { + "epoch": 0.9024162780839339, + "grad_norm": 0.9879395724300434, + "learning_rate": 4.945393820894662e-07, + "loss": 0.885, + "step": 5322 + }, + { + "epoch": 0.902585841458245, + "grad_norm": 0.9606900271195277, + "learning_rate": 4.928349990396808e-07, + "loss": 0.9675, + "step": 5323 + }, + { + "epoch": 0.9027554048325561, + "grad_norm": 0.9767697787127416, + "learning_rate": 4.911334838549242e-07, + "loss": 0.9142, + "step": 5324 + }, + { + "epoch": 0.9029249682068673, + "grad_norm": 0.9696775612775178, + "learning_rate": 4.894348370484648e-07, + "loss": 0.9083, + "step": 5325 + }, + { + "epoch": 0.9030945315811785, + "grad_norm": 0.9440806168822182, + "learning_rate": 4.877390591327036e-07, + "loss": 0.9074, + "step": 5326 + }, + { + "epoch": 0.9032640949554896, + "grad_norm": 1.051395713264764, + "learning_rate": 4.860461506191782e-07, + "loss": 0.9537, + "step": 5327 + }, + { + "epoch": 0.9034336583298007, + "grad_norm": 0.9842090616624319, + "learning_rate": 4.843561120185581e-07, + "loss": 0.8999, + "step": 5328 + }, + { + "epoch": 0.9036032217041119, + "grad_norm": 1.019286767442933, + "learning_rate": 4.826689438406495e-07, + "loss": 0.9914, + "step": 5329 + }, + { + "epoch": 0.9037727850784231, + "grad_norm": 0.9975617622676345, + "learning_rate": 4.809846465943912e-07, + "loss": 0.9124, + "step": 5330 + }, + { + "epoch": 0.9039423484527342, + "grad_norm": 0.9266459039272539, + "learning_rate": 4.793032207878579e-07, + "loss": 0.9105, + "step": 5331 + }, + { + "epoch": 0.9041119118270453, + "grad_norm": 0.9778496315796683, + "learning_rate": 4.776246669282536e-07, + "loss": 0.9309, + "step": 5332 + }, + { + "epoch": 0.9042814752013565, + "grad_norm": 0.9504848514653697, + "learning_rate": 4.759489855219235e-07, + "loss": 0.9086, + "step": 5333 + }, + { + "epoch": 0.9044510385756677, + "grad_norm": 0.9475976851297315, + "learning_rate": 4.742761770743387e-07, + "loss": 0.902, + "step": 5334 + }, + { + "epoch": 0.9046206019499788, + "grad_norm": 0.9674882792684568, + "learning_rate": 4.7260624209010675e-07, + "loss": 0.9006, + "step": 5335 + }, + { + "epoch": 0.9047901653242899, + "grad_norm": 1.0190261397888378, + "learning_rate": 4.709391810729713e-07, + "loss": 0.9133, + "step": 5336 + }, + { + "epoch": 0.9049597286986011, + "grad_norm": 0.9767734090303771, + "learning_rate": 4.6927499452580574e-07, + "loss": 0.917, + "step": 5337 + }, + { + "epoch": 0.9051292920729123, + "grad_norm": 0.9555990305750007, + "learning_rate": 4.67613682950615e-07, + "loss": 0.9173, + "step": 5338 + }, + { + "epoch": 0.9052988554472234, + "grad_norm": 1.0132778736031969, + "learning_rate": 4.6595524684853954e-07, + "loss": 0.9041, + "step": 5339 + }, + { + "epoch": 0.9054684188215345, + "grad_norm": 1.0073560455047579, + "learning_rate": 4.6429968671985235e-07, + "loss": 0.9199, + "step": 5340 + }, + { + "epoch": 0.9056379821958457, + "grad_norm": 0.6131839902783626, + "learning_rate": 4.626470030639574e-07, + "loss": 0.773, + "step": 5341 + }, + { + "epoch": 0.9058075455701569, + "grad_norm": 1.0051760144127484, + "learning_rate": 4.6099719637939136e-07, + "loss": 0.9321, + "step": 5342 + }, + { + "epoch": 0.905977108944468, + "grad_norm": 0.9850913808183326, + "learning_rate": 4.59350267163825e-07, + "loss": 0.929, + "step": 5343 + }, + { + "epoch": 0.9061466723187791, + "grad_norm": 1.0481881351182123, + "learning_rate": 4.5770621591405773e-07, + "loss": 0.933, + "step": 5344 + }, + { + "epoch": 0.9063162356930903, + "grad_norm": 0.965853837730131, + "learning_rate": 4.5606504312602384e-07, + "loss": 0.968, + "step": 5345 + }, + { + "epoch": 0.9064857990674015, + "grad_norm": 0.9454795167133183, + "learning_rate": 4.5442674929478625e-07, + "loss": 0.9332, + "step": 5346 + }, + { + "epoch": 0.9066553624417126, + "grad_norm": 0.9609462911927776, + "learning_rate": 4.5279133491454406e-07, + "loss": 0.9122, + "step": 5347 + }, + { + "epoch": 0.9068249258160237, + "grad_norm": 0.9627821897110885, + "learning_rate": 4.511588004786227e-07, + "loss": 0.9508, + "step": 5348 + }, + { + "epoch": 0.9069944891903349, + "grad_norm": 0.9464531931770697, + "learning_rate": 4.4952914647948264e-07, + "loss": 0.8912, + "step": 5349 + }, + { + "epoch": 0.907164052564646, + "grad_norm": 0.9563311701679522, + "learning_rate": 4.479023734087118e-07, + "loss": 0.9263, + "step": 5350 + }, + { + "epoch": 0.9073336159389572, + "grad_norm": 0.9838140660453529, + "learning_rate": 4.4627848175703315e-07, + "loss": 0.9418, + "step": 5351 + }, + { + "epoch": 0.9075031793132683, + "grad_norm": 0.9551869502004656, + "learning_rate": 4.4465747201429823e-07, + "loss": 0.9063, + "step": 5352 + }, + { + "epoch": 0.9076727426875795, + "grad_norm": 1.0141068195639051, + "learning_rate": 4.4303934466948804e-07, + "loss": 0.9384, + "step": 5353 + }, + { + "epoch": 0.9078423060618906, + "grad_norm": 0.9530027881094659, + "learning_rate": 4.414241002107178e-07, + "loss": 0.9125, + "step": 5354 + }, + { + "epoch": 0.9080118694362018, + "grad_norm": 0.98425247386984, + "learning_rate": 4.398117391252299e-07, + "loss": 0.9186, + "step": 5355 + }, + { + "epoch": 0.9081814328105129, + "grad_norm": 0.9619651161774531, + "learning_rate": 4.382022618993975e-07, + "loss": 0.9173, + "step": 5356 + }, + { + "epoch": 0.9083509961848241, + "grad_norm": 0.9264821942169333, + "learning_rate": 4.365956690187256e-07, + "loss": 0.9443, + "step": 5357 + }, + { + "epoch": 0.9085205595591352, + "grad_norm": 0.9917467694505719, + "learning_rate": 4.3499196096784544e-07, + "loss": 0.9366, + "step": 5358 + }, + { + "epoch": 0.9086901229334464, + "grad_norm": 0.9629837102454752, + "learning_rate": 4.3339113823052223e-07, + "loss": 0.8951, + "step": 5359 + }, + { + "epoch": 0.9088596863077575, + "grad_norm": 0.9456688365160955, + "learning_rate": 4.317932012896475e-07, + "loss": 0.9103, + "step": 5360 + }, + { + "epoch": 0.9090292496820687, + "grad_norm": 0.9866153383422445, + "learning_rate": 4.3019815062724567e-07, + "loss": 0.9458, + "step": 5361 + }, + { + "epoch": 0.9091988130563798, + "grad_norm": 0.9528917367648084, + "learning_rate": 4.286059867244685e-07, + "loss": 0.8758, + "step": 5362 + }, + { + "epoch": 0.909368376430691, + "grad_norm": 1.0479345752413354, + "learning_rate": 4.270167100615952e-07, + "loss": 0.9488, + "step": 5363 + }, + { + "epoch": 0.9095379398050021, + "grad_norm": 0.9412771103362266, + "learning_rate": 4.254303211180355e-07, + "loss": 0.8842, + "step": 5364 + }, + { + "epoch": 0.9097075031793133, + "grad_norm": 1.0014740399517312, + "learning_rate": 4.2384682037233115e-07, + "loss": 0.9197, + "step": 5365 + }, + { + "epoch": 0.9098770665536244, + "grad_norm": 0.9763866678925637, + "learning_rate": 4.222662083021489e-07, + "loss": 0.9296, + "step": 5366 + }, + { + "epoch": 0.9100466299279356, + "grad_norm": 1.0040310918329611, + "learning_rate": 4.206884853842852e-07, + "loss": 0.8908, + "step": 5367 + }, + { + "epoch": 0.9102161933022467, + "grad_norm": 0.9576255730098154, + "learning_rate": 4.191136520946626e-07, + "loss": 0.9385, + "step": 5368 + }, + { + "epoch": 0.9103857566765579, + "grad_norm": 1.0055933207058636, + "learning_rate": 4.1754170890833777e-07, + "loss": 0.9843, + "step": 5369 + }, + { + "epoch": 0.910555320050869, + "grad_norm": 0.9973837267287784, + "learning_rate": 4.1597265629949146e-07, + "loss": 0.9359, + "step": 5370 + }, + { + "epoch": 0.9107248834251802, + "grad_norm": 0.9466030246624016, + "learning_rate": 4.144064947414295e-07, + "loss": 0.9063, + "step": 5371 + }, + { + "epoch": 0.9108944467994913, + "grad_norm": 0.9215907355186325, + "learning_rate": 4.1284322470659386e-07, + "loss": 0.9104, + "step": 5372 + }, + { + "epoch": 0.9110640101738025, + "grad_norm": 0.963263187659054, + "learning_rate": 4.112828466665486e-07, + "loss": 0.9077, + "step": 5373 + }, + { + "epoch": 0.9112335735481136, + "grad_norm": 0.988887337624277, + "learning_rate": 4.0972536109198493e-07, + "loss": 0.9613, + "step": 5374 + }, + { + "epoch": 0.9114031369224248, + "grad_norm": 0.9716510077462397, + "learning_rate": 4.081707684527236e-07, + "loss": 0.9356, + "step": 5375 + }, + { + "epoch": 0.9115727002967359, + "grad_norm": 0.9434431025953428, + "learning_rate": 4.0661906921771297e-07, + "loss": 0.9504, + "step": 5376 + }, + { + "epoch": 0.911742263671047, + "grad_norm": 0.9087093935147786, + "learning_rate": 4.0507026385502747e-07, + "loss": 0.8905, + "step": 5377 + }, + { + "epoch": 0.9119118270453582, + "grad_norm": 0.9289058127702048, + "learning_rate": 4.035243528318666e-07, + "loss": 0.9064, + "step": 5378 + }, + { + "epoch": 0.9120813904196694, + "grad_norm": 0.9306911482992167, + "learning_rate": 4.019813366145631e-07, + "loss": 0.9247, + "step": 5379 + }, + { + "epoch": 0.9122509537939805, + "grad_norm": 0.9627105141700903, + "learning_rate": 4.0044121566857106e-07, + "loss": 0.8919, + "step": 5380 + }, + { + "epoch": 0.9124205171682916, + "grad_norm": 1.033443833616269, + "learning_rate": 3.9890399045847127e-07, + "loss": 0.9329, + "step": 5381 + }, + { + "epoch": 0.9125900805426028, + "grad_norm": 0.931559687121707, + "learning_rate": 3.9736966144797164e-07, + "loss": 0.9159, + "step": 5382 + }, + { + "epoch": 0.912759643916914, + "grad_norm": 0.9763002306429589, + "learning_rate": 3.958382290999108e-07, + "loss": 0.9459, + "step": 5383 + }, + { + "epoch": 0.9129292072912251, + "grad_norm": 1.0190309312128152, + "learning_rate": 3.9430969387624694e-07, + "loss": 0.925, + "step": 5384 + }, + { + "epoch": 0.9130987706655362, + "grad_norm": 0.9589514814186912, + "learning_rate": 3.9278405623806914e-07, + "loss": 0.9203, + "step": 5385 + }, + { + "epoch": 0.9132683340398474, + "grad_norm": 0.6757902629308058, + "learning_rate": 3.912613166455881e-07, + "loss": 0.7678, + "step": 5386 + }, + { + "epoch": 0.9134378974141586, + "grad_norm": 0.9723433186510121, + "learning_rate": 3.897414755581463e-07, + "loss": 0.8983, + "step": 5387 + }, + { + "epoch": 0.9136074607884697, + "grad_norm": 0.9661531137532681, + "learning_rate": 3.882245334342061e-07, + "loss": 0.921, + "step": 5388 + }, + { + "epoch": 0.9137770241627808, + "grad_norm": 0.9930248420028948, + "learning_rate": 3.867104907313557e-07, + "loss": 0.9196, + "step": 5389 + }, + { + "epoch": 0.913946587537092, + "grad_norm": 0.9761736950576347, + "learning_rate": 3.851993479063154e-07, + "loss": 0.918, + "step": 5390 + }, + { + "epoch": 0.9141161509114032, + "grad_norm": 0.9714243326655925, + "learning_rate": 3.8369110541492396e-07, + "loss": 0.9304, + "step": 5391 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 1.0111816639053728, + "learning_rate": 3.821857637121462e-07, + "loss": 0.9121, + "step": 5392 + }, + { + "epoch": 0.9144552776600254, + "grad_norm": 0.9562584743620902, + "learning_rate": 3.806833232520746e-07, + "loss": 0.9187, + "step": 5393 + }, + { + "epoch": 0.9146248410343366, + "grad_norm": 0.9179208036264318, + "learning_rate": 3.7918378448792316e-07, + "loss": 0.9016, + "step": 5394 + }, + { + "epoch": 0.9147944044086477, + "grad_norm": 1.0318724655216793, + "learning_rate": 3.776871478720334e-07, + "loss": 0.9738, + "step": 5395 + }, + { + "epoch": 0.9149639677829589, + "grad_norm": 0.9877189104966257, + "learning_rate": 3.761934138558687e-07, + "loss": 0.9345, + "step": 5396 + }, + { + "epoch": 0.91513353115727, + "grad_norm": 0.961421111491527, + "learning_rate": 3.747025828900208e-07, + "loss": 0.9049, + "step": 5397 + }, + { + "epoch": 0.9153030945315812, + "grad_norm": 0.9740789070011456, + "learning_rate": 3.732146554242022e-07, + "loss": 0.903, + "step": 5398 + }, + { + "epoch": 0.9154726579058923, + "grad_norm": 0.9550882789299242, + "learning_rate": 3.7172963190725164e-07, + "loss": 0.9378, + "step": 5399 + }, + { + "epoch": 0.9156422212802034, + "grad_norm": 0.9575834152999672, + "learning_rate": 3.7024751278712744e-07, + "loss": 0.9141, + "step": 5400 + }, + { + "epoch": 0.9158117846545146, + "grad_norm": 0.9766625715813362, + "learning_rate": 3.687682985109209e-07, + "loss": 0.9368, + "step": 5401 + }, + { + "epoch": 0.9159813480288258, + "grad_norm": 0.959606068005571, + "learning_rate": 3.6729198952483725e-07, + "loss": 0.9661, + "step": 5402 + }, + { + "epoch": 0.9161509114031369, + "grad_norm": 0.9831822183201825, + "learning_rate": 3.658185862742103e-07, + "loss": 0.9585, + "step": 5403 + }, + { + "epoch": 0.916320474777448, + "grad_norm": 0.9194956259616271, + "learning_rate": 3.6434808920349787e-07, + "loss": 0.8483, + "step": 5404 + }, + { + "epoch": 0.9164900381517592, + "grad_norm": 0.9460217958792853, + "learning_rate": 3.628804987562795e-07, + "loss": 0.9456, + "step": 5405 + }, + { + "epoch": 0.9166596015260704, + "grad_norm": 0.932499906714206, + "learning_rate": 3.614158153752578e-07, + "loss": 0.8892, + "step": 5406 + }, + { + "epoch": 0.9168291649003815, + "grad_norm": 0.9502948231957661, + "learning_rate": 3.599540395022583e-07, + "loss": 0.901, + "step": 5407 + }, + { + "epoch": 0.9169987282746926, + "grad_norm": 0.9920578897311115, + "learning_rate": 3.5849517157823143e-07, + "loss": 0.9221, + "step": 5408 + }, + { + "epoch": 0.9171682916490038, + "grad_norm": 0.9313523808453857, + "learning_rate": 3.5703921204324863e-07, + "loss": 0.9135, + "step": 5409 + }, + { + "epoch": 0.917337855023315, + "grad_norm": 0.9746902661040927, + "learning_rate": 3.5558616133650413e-07, + "loss": 0.9033, + "step": 5410 + }, + { + "epoch": 0.9175074183976261, + "grad_norm": 1.0035283102741095, + "learning_rate": 3.5413601989631616e-07, + "loss": 0.9315, + "step": 5411 + }, + { + "epoch": 0.9176769817719372, + "grad_norm": 1.0048795473133891, + "learning_rate": 3.5268878816012265e-07, + "loss": 0.9262, + "step": 5412 + }, + { + "epoch": 0.9178465451462484, + "grad_norm": 0.9826734454281971, + "learning_rate": 3.5124446656448654e-07, + "loss": 0.9665, + "step": 5413 + }, + { + "epoch": 0.9180161085205596, + "grad_norm": 1.0102837678151142, + "learning_rate": 3.498030555450904e-07, + "loss": 0.9326, + "step": 5414 + }, + { + "epoch": 0.9181856718948707, + "grad_norm": 0.9855348954572305, + "learning_rate": 3.483645555367421e-07, + "loss": 0.9476, + "step": 5415 + }, + { + "epoch": 0.9183552352691818, + "grad_norm": 0.9339472132343781, + "learning_rate": 3.4692896697336887e-07, + "loss": 0.9005, + "step": 5416 + }, + { + "epoch": 0.918524798643493, + "grad_norm": 0.9639269646676377, + "learning_rate": 3.454962902880199e-07, + "loss": 0.9276, + "step": 5417 + }, + { + "epoch": 0.9186943620178042, + "grad_norm": 1.0275373407904178, + "learning_rate": 3.4406652591286507e-07, + "loss": 0.9558, + "step": 5418 + }, + { + "epoch": 0.9188639253921153, + "grad_norm": 0.9610171386420859, + "learning_rate": 3.426396742792004e-07, + "loss": 0.9313, + "step": 5419 + }, + { + "epoch": 0.9190334887664264, + "grad_norm": 0.9826964548150668, + "learning_rate": 3.412157358174384e-07, + "loss": 0.931, + "step": 5420 + }, + { + "epoch": 0.9192030521407376, + "grad_norm": 0.5864439574398642, + "learning_rate": 3.397947109571131e-07, + "loss": 0.7674, + "step": 5421 + }, + { + "epoch": 0.9193726155150488, + "grad_norm": 0.9420992595847931, + "learning_rate": 3.38376600126884e-07, + "loss": 0.9384, + "step": 5422 + }, + { + "epoch": 0.9195421788893599, + "grad_norm": 1.004375833066848, + "learning_rate": 3.3696140375452544e-07, + "loss": 0.8952, + "step": 5423 + }, + { + "epoch": 0.919711742263671, + "grad_norm": 0.958464811356412, + "learning_rate": 3.3554912226693714e-07, + "loss": 0.9192, + "step": 5424 + }, + { + "epoch": 0.9198813056379822, + "grad_norm": 0.9992589274460407, + "learning_rate": 3.3413975609013713e-07, + "loss": 0.9297, + "step": 5425 + }, + { + "epoch": 0.9200508690122934, + "grad_norm": 0.9602548891218543, + "learning_rate": 3.3273330564926766e-07, + "loss": 0.8977, + "step": 5426 + }, + { + "epoch": 0.9202204323866044, + "grad_norm": 0.9564208966657699, + "learning_rate": 3.313297713685859e-07, + "loss": 0.9112, + "step": 5427 + }, + { + "epoch": 0.9203899957609156, + "grad_norm": 0.6551809507934392, + "learning_rate": 3.299291536714722e-07, + "loss": 0.7744, + "step": 5428 + }, + { + "epoch": 0.9205595591352268, + "grad_norm": 0.9909382809102839, + "learning_rate": 3.2853145298042954e-07, + "loss": 0.8767, + "step": 5429 + }, + { + "epoch": 0.920729122509538, + "grad_norm": 0.9836627492482466, + "learning_rate": 3.271366697170764e-07, + "loss": 0.9117, + "step": 5430 + }, + { + "epoch": 0.920898685883849, + "grad_norm": 0.9395079072695641, + "learning_rate": 3.257448043021538e-07, + "loss": 0.9016, + "step": 5431 + }, + { + "epoch": 0.9210682492581602, + "grad_norm": 0.9120250231051668, + "learning_rate": 3.2435585715552164e-07, + "loss": 0.9061, + "step": 5432 + }, + { + "epoch": 0.9212378126324714, + "grad_norm": 0.9802404822411753, + "learning_rate": 3.2296982869616134e-07, + "loss": 0.9278, + "step": 5433 + }, + { + "epoch": 0.9214073760067826, + "grad_norm": 1.0110692105201524, + "learning_rate": 3.215867193421718e-07, + "loss": 0.9559, + "step": 5434 + }, + { + "epoch": 0.9215769393810936, + "grad_norm": 0.9766251321150854, + "learning_rate": 3.2020652951077256e-07, + "loss": 0.9198, + "step": 5435 + }, + { + "epoch": 0.9217465027554048, + "grad_norm": 0.9525585021322519, + "learning_rate": 3.188292596183007e-07, + "loss": 0.8835, + "step": 5436 + }, + { + "epoch": 0.921916066129716, + "grad_norm": 0.9602038533099616, + "learning_rate": 3.1745491008021603e-07, + "loss": 0.8949, + "step": 5437 + }, + { + "epoch": 0.9220856295040272, + "grad_norm": 1.003781034042751, + "learning_rate": 3.160834813110947e-07, + "loss": 0.9369, + "step": 5438 + }, + { + "epoch": 0.9222551928783382, + "grad_norm": 0.7104834836846216, + "learning_rate": 3.147149737246302e-07, + "loss": 0.7674, + "step": 5439 + }, + { + "epoch": 0.9224247562526494, + "grad_norm": 0.9800090606314543, + "learning_rate": 3.133493877336391e-07, + "loss": 0.9215, + "step": 5440 + }, + { + "epoch": 0.9225943196269606, + "grad_norm": 0.9333006892043587, + "learning_rate": 3.1198672375005403e-07, + "loss": 0.8994, + "step": 5441 + }, + { + "epoch": 0.9227638830012718, + "grad_norm": 0.9514621600841879, + "learning_rate": 3.106269821849273e-07, + "loss": 0.9277, + "step": 5442 + }, + { + "epoch": 0.9229334463755828, + "grad_norm": 0.9539791701091482, + "learning_rate": 3.092701634484274e-07, + "loss": 0.9022, + "step": 5443 + }, + { + "epoch": 0.923103009749894, + "grad_norm": 1.190105551687011, + "learning_rate": 3.0791626794984377e-07, + "loss": 0.8983, + "step": 5444 + }, + { + "epoch": 0.9232725731242052, + "grad_norm": 0.9932389376448827, + "learning_rate": 3.06565296097584e-07, + "loss": 0.9203, + "step": 5445 + }, + { + "epoch": 0.9234421364985164, + "grad_norm": 0.9788596238609185, + "learning_rate": 3.052172482991711e-07, + "loss": 0.9624, + "step": 5446 + }, + { + "epoch": 0.9236116998728274, + "grad_norm": 1.0380243004597096, + "learning_rate": 3.0387212496124974e-07, + "loss": 0.9219, + "step": 5447 + }, + { + "epoch": 0.9237812632471386, + "grad_norm": 1.0113902300688296, + "learning_rate": 3.025299264895787e-07, + "loss": 0.8886, + "step": 5448 + }, + { + "epoch": 0.9239508266214498, + "grad_norm": 0.63149154328108, + "learning_rate": 3.0119065328903517e-07, + "loss": 0.7782, + "step": 5449 + }, + { + "epoch": 0.924120389995761, + "grad_norm": 0.5589468374793891, + "learning_rate": 2.998543057636183e-07, + "loss": 0.7202, + "step": 5450 + }, + { + "epoch": 0.924289953370072, + "grad_norm": 0.9780248892759397, + "learning_rate": 2.9852088431644e-07, + "loss": 0.8921, + "step": 5451 + }, + { + "epoch": 0.9244595167443832, + "grad_norm": 0.9939655788799581, + "learning_rate": 2.9719038934972964e-07, + "loss": 0.8924, + "step": 5452 + }, + { + "epoch": 0.9246290801186944, + "grad_norm": 0.654090335724019, + "learning_rate": 2.9586282126483625e-07, + "loss": 0.798, + "step": 5453 + }, + { + "epoch": 0.9247986434930056, + "grad_norm": 0.9874470089105412, + "learning_rate": 2.945381804622238e-07, + "loss": 0.9116, + "step": 5454 + }, + { + "epoch": 0.9249682068673166, + "grad_norm": 0.9990081153474144, + "learning_rate": 2.93216467341475e-07, + "loss": 0.9166, + "step": 5455 + }, + { + "epoch": 0.9251377702416278, + "grad_norm": 0.9977786065598294, + "learning_rate": 2.918976823012887e-07, + "loss": 0.9261, + "step": 5456 + }, + { + "epoch": 0.925307333615939, + "grad_norm": 0.9786420251410374, + "learning_rate": 2.905818257394799e-07, + "loss": 0.8987, + "step": 5457 + }, + { + "epoch": 0.9254768969902502, + "grad_norm": 0.9351485593254598, + "learning_rate": 2.892688980529812e-07, + "loss": 0.9137, + "step": 5458 + }, + { + "epoch": 0.9256464603645612, + "grad_norm": 0.9881128074961543, + "learning_rate": 2.879588996378402e-07, + "loss": 0.914, + "step": 5459 + }, + { + "epoch": 0.9258160237388724, + "grad_norm": 0.9709518774562353, + "learning_rate": 2.86651830889223e-07, + "loss": 0.9463, + "step": 5460 + }, + { + "epoch": 0.9259855871131836, + "grad_norm": 0.9811259952601982, + "learning_rate": 2.853476922014098e-07, + "loss": 0.9102, + "step": 5461 + }, + { + "epoch": 0.9261551504874947, + "grad_norm": 0.9686376678290065, + "learning_rate": 2.840464839677992e-07, + "loss": 0.9348, + "step": 5462 + }, + { + "epoch": 0.9263247138618058, + "grad_norm": 1.005719347351487, + "learning_rate": 2.8274820658090506e-07, + "loss": 0.9297, + "step": 5463 + }, + { + "epoch": 0.926494277236117, + "grad_norm": 1.0367614580892663, + "learning_rate": 2.8145286043235407e-07, + "loss": 0.97, + "step": 5464 + }, + { + "epoch": 0.9266638406104282, + "grad_norm": 0.9836851328703371, + "learning_rate": 2.801604459128926e-07, + "loss": 0.9272, + "step": 5465 + }, + { + "epoch": 0.9268334039847393, + "grad_norm": 0.9650917589108308, + "learning_rate": 2.788709634123821e-07, + "loss": 0.9371, + "step": 5466 + }, + { + "epoch": 0.9270029673590504, + "grad_norm": 0.9800319300033921, + "learning_rate": 2.7758441331979914e-07, + "loss": 0.8935, + "step": 5467 + }, + { + "epoch": 0.9271725307333616, + "grad_norm": 0.9911754307768186, + "learning_rate": 2.7630079602323447e-07, + "loss": 0.9414, + "step": 5468 + }, + { + "epoch": 0.9273420941076728, + "grad_norm": 0.9895292365279797, + "learning_rate": 2.75020111909895e-07, + "loss": 0.9144, + "step": 5469 + }, + { + "epoch": 0.9275116574819839, + "grad_norm": 0.9765261342110486, + "learning_rate": 2.737423613661028e-07, + "loss": 0.9608, + "step": 5470 + }, + { + "epoch": 0.927681220856295, + "grad_norm": 0.9879221227215401, + "learning_rate": 2.724675447772973e-07, + "loss": 0.9401, + "step": 5471 + }, + { + "epoch": 0.9278507842306062, + "grad_norm": 1.0071323980106321, + "learning_rate": 2.7119566252802656e-07, + "loss": 0.9525, + "step": 5472 + }, + { + "epoch": 0.9280203476049174, + "grad_norm": 0.9824768063606365, + "learning_rate": 2.6992671500196134e-07, + "loss": 0.9341, + "step": 5473 + }, + { + "epoch": 0.9281899109792285, + "grad_norm": 0.9459273440730945, + "learning_rate": 2.6866070258188324e-07, + "loss": 0.9023, + "step": 5474 + }, + { + "epoch": 0.9283594743535396, + "grad_norm": 0.9764163138743068, + "learning_rate": 2.6739762564968686e-07, + "loss": 0.8964, + "step": 5475 + }, + { + "epoch": 0.9285290377278508, + "grad_norm": 0.9819088893037986, + "learning_rate": 2.661374845863851e-07, + "loss": 0.9378, + "step": 5476 + }, + { + "epoch": 0.928698601102162, + "grad_norm": 0.9508932159708138, + "learning_rate": 2.6488027977210175e-07, + "loss": 0.853, + "step": 5477 + }, + { + "epoch": 0.9288681644764731, + "grad_norm": 0.6419977853689585, + "learning_rate": 2.636260115860778e-07, + "loss": 0.8053, + "step": 5478 + }, + { + "epoch": 0.9290377278507842, + "grad_norm": 0.9693479238780413, + "learning_rate": 2.6237468040666515e-07, + "loss": 0.9005, + "step": 5479 + }, + { + "epoch": 0.9292072912250954, + "grad_norm": 0.9556824438132773, + "learning_rate": 2.61126286611334e-07, + "loss": 0.9, + "step": 5480 + }, + { + "epoch": 0.9293768545994066, + "grad_norm": 1.039279132784763, + "learning_rate": 2.5988083057666534e-07, + "loss": 0.9347, + "step": 5481 + }, + { + "epoch": 0.9295464179737177, + "grad_norm": 0.995902014678631, + "learning_rate": 2.586383126783532e-07, + "loss": 0.9244, + "step": 5482 + }, + { + "epoch": 0.9297159813480288, + "grad_norm": 0.9407804494372483, + "learning_rate": 2.573987332912087e-07, + "loss": 0.9263, + "step": 5483 + }, + { + "epoch": 0.92988554472234, + "grad_norm": 0.911538157433653, + "learning_rate": 2.561620927891539e-07, + "loss": 0.9135, + "step": 5484 + }, + { + "epoch": 0.9300551080966512, + "grad_norm": 0.9520464880655851, + "learning_rate": 2.5492839154522495e-07, + "loss": 0.8978, + "step": 5485 + }, + { + "epoch": 0.9302246714709623, + "grad_norm": 0.9867480022884948, + "learning_rate": 2.53697629931573e-07, + "loss": 0.9329, + "step": 5486 + }, + { + "epoch": 0.9303942348452734, + "grad_norm": 0.6857170062643512, + "learning_rate": 2.5246980831945877e-07, + "loss": 0.7953, + "step": 5487 + }, + { + "epoch": 0.9305637982195846, + "grad_norm": 0.9641602117190036, + "learning_rate": 2.512449270792594e-07, + "loss": 0.9165, + "step": 5488 + }, + { + "epoch": 0.9307333615938957, + "grad_norm": 0.9319781517032217, + "learning_rate": 2.5002298658046484e-07, + "loss": 0.9119, + "step": 5489 + }, + { + "epoch": 0.9309029249682068, + "grad_norm": 0.9683516646842466, + "learning_rate": 2.4880398719167584e-07, + "loss": 0.9381, + "step": 5490 + }, + { + "epoch": 0.931072488342518, + "grad_norm": 1.021949784427159, + "learning_rate": 2.4758792928060715e-07, + "loss": 0.9907, + "step": 5491 + }, + { + "epoch": 0.9312420517168292, + "grad_norm": 0.987326968389827, + "learning_rate": 2.4637481321408863e-07, + "loss": 0.8731, + "step": 5492 + }, + { + "epoch": 0.9314116150911403, + "grad_norm": 0.9508026560198607, + "learning_rate": 2.4516463935805644e-07, + "loss": 0.9599, + "step": 5493 + }, + { + "epoch": 0.9315811784654514, + "grad_norm": 0.9332219601549946, + "learning_rate": 2.439574080775675e-07, + "loss": 0.9106, + "step": 5494 + }, + { + "epoch": 0.9317507418397626, + "grad_norm": 0.9766262841226447, + "learning_rate": 2.4275311973678384e-07, + "loss": 0.8992, + "step": 5495 + }, + { + "epoch": 0.9319203052140738, + "grad_norm": 0.9333527754702658, + "learning_rate": 2.4155177469898373e-07, + "loss": 0.9176, + "step": 5496 + }, + { + "epoch": 0.9320898685883849, + "grad_norm": 0.9668309153421941, + "learning_rate": 2.4035337332655504e-07, + "loss": 0.9181, + "step": 5497 + }, + { + "epoch": 0.932259431962696, + "grad_norm": 0.9820661538244929, + "learning_rate": 2.3915791598100205e-07, + "loss": 0.9398, + "step": 5498 + }, + { + "epoch": 0.9324289953370072, + "grad_norm": 0.9963045821264599, + "learning_rate": 2.3796540302293724e-07, + "loss": 0.9039, + "step": 5499 + }, + { + "epoch": 0.9325985587113184, + "grad_norm": 0.9417061394871139, + "learning_rate": 2.36775834812083e-07, + "loss": 0.9056, + "step": 5500 + }, + { + "epoch": 0.9327681220856295, + "grad_norm": 0.9791666419445099, + "learning_rate": 2.355892117072789e-07, + "loss": 0.9131, + "step": 5501 + }, + { + "epoch": 0.9329376854599406, + "grad_norm": 0.9692132121935689, + "learning_rate": 2.3440553406647305e-07, + "loss": 0.9017, + "step": 5502 + }, + { + "epoch": 0.9331072488342518, + "grad_norm": 0.985485094805859, + "learning_rate": 2.332248022467254e-07, + "loss": 0.8937, + "step": 5503 + }, + { + "epoch": 0.933276812208563, + "grad_norm": 0.9505013258552553, + "learning_rate": 2.320470166042066e-07, + "loss": 0.887, + "step": 5504 + }, + { + "epoch": 0.9334463755828741, + "grad_norm": 0.982934702945291, + "learning_rate": 2.308721774941991e-07, + "loss": 0.8981, + "step": 5505 + }, + { + "epoch": 0.9336159389571852, + "grad_norm": 0.9607421256328981, + "learning_rate": 2.2970028527109724e-07, + "loss": 0.8796, + "step": 5506 + }, + { + "epoch": 0.9337855023314964, + "grad_norm": 0.969513999784764, + "learning_rate": 2.2853134028840594e-07, + "loss": 0.9252, + "step": 5507 + }, + { + "epoch": 0.9339550657058076, + "grad_norm": 0.9600935230399994, + "learning_rate": 2.273653428987399e-07, + "loss": 0.8967, + "step": 5508 + }, + { + "epoch": 0.9341246290801187, + "grad_norm": 0.9694123817405524, + "learning_rate": 2.262022934538266e-07, + "loss": 0.8888, + "step": 5509 + }, + { + "epoch": 0.9342941924544298, + "grad_norm": 0.5898540136692917, + "learning_rate": 2.2504219230450431e-07, + "loss": 0.7494, + "step": 5510 + }, + { + "epoch": 0.934463755828741, + "grad_norm": 0.9696883623750306, + "learning_rate": 2.2388503980071862e-07, + "loss": 0.9072, + "step": 5511 + }, + { + "epoch": 0.9346333192030521, + "grad_norm": 0.9573773378137203, + "learning_rate": 2.2273083629153148e-07, + "loss": 0.8997, + "step": 5512 + }, + { + "epoch": 0.9348028825773633, + "grad_norm": 0.9797542895525152, + "learning_rate": 2.2157958212510877e-07, + "loss": 0.9218, + "step": 5513 + }, + { + "epoch": 0.9349724459516744, + "grad_norm": 0.9985364081071842, + "learning_rate": 2.2043127764873162e-07, + "loss": 0.9249, + "step": 5514 + }, + { + "epoch": 0.9351420093259856, + "grad_norm": 0.6558730867826927, + "learning_rate": 2.192859232087885e-07, + "loss": 0.7905, + "step": 5515 + }, + { + "epoch": 0.9353115727002967, + "grad_norm": 0.9715127965895606, + "learning_rate": 2.181435191507797e-07, + "loss": 0.9163, + "step": 5516 + }, + { + "epoch": 0.9354811360746079, + "grad_norm": 0.6849751762026847, + "learning_rate": 2.1700406581931398e-07, + "loss": 0.7699, + "step": 5517 + }, + { + "epoch": 0.935650699448919, + "grad_norm": 0.9929621706345273, + "learning_rate": 2.15867563558112e-07, + "loss": 0.9099, + "step": 5518 + }, + { + "epoch": 0.9358202628232302, + "grad_norm": 0.9756665543966292, + "learning_rate": 2.1473401271000283e-07, + "loss": 0.8924, + "step": 5519 + }, + { + "epoch": 0.9359898261975413, + "grad_norm": 0.9919281699363885, + "learning_rate": 2.1360341361692517e-07, + "loss": 0.8788, + "step": 5520 + }, + { + "epoch": 0.9361593895718525, + "grad_norm": 0.9907203119270258, + "learning_rate": 2.124757666199273e-07, + "loss": 0.9473, + "step": 5521 + }, + { + "epoch": 0.9363289529461636, + "grad_norm": 0.9740373524020767, + "learning_rate": 2.1135107205916826e-07, + "loss": 0.9202, + "step": 5522 + }, + { + "epoch": 0.9364985163204748, + "grad_norm": 0.9799372514367808, + "learning_rate": 2.1022933027391555e-07, + "loss": 0.9651, + "step": 5523 + }, + { + "epoch": 0.9366680796947859, + "grad_norm": 0.6274185839242921, + "learning_rate": 2.0911054160254517e-07, + "loss": 0.7484, + "step": 5524 + }, + { + "epoch": 0.9368376430690971, + "grad_norm": 0.9837883547575718, + "learning_rate": 2.079947063825427e-07, + "loss": 0.9584, + "step": 5525 + }, + { + "epoch": 0.9370072064434082, + "grad_norm": 0.9454725601288959, + "learning_rate": 2.0688182495050446e-07, + "loss": 0.8732, + "step": 5526 + }, + { + "epoch": 0.9371767698177194, + "grad_norm": 0.9483920164930612, + "learning_rate": 2.057718976421341e-07, + "loss": 0.8729, + "step": 5527 + }, + { + "epoch": 0.9373463331920305, + "grad_norm": 0.6039447323116278, + "learning_rate": 2.0466492479224387e-07, + "loss": 0.7654, + "step": 5528 + }, + { + "epoch": 0.9375158965663417, + "grad_norm": 0.9731955916171043, + "learning_rate": 2.035609067347566e-07, + "loss": 0.9243, + "step": 5529 + }, + { + "epoch": 0.9376854599406528, + "grad_norm": 0.9312673364514508, + "learning_rate": 2.0245984380270145e-07, + "loss": 0.8914, + "step": 5530 + }, + { + "epoch": 0.937855023314964, + "grad_norm": 0.9656315441515511, + "learning_rate": 2.0136173632821944e-07, + "loss": 0.8916, + "step": 5531 + }, + { + "epoch": 0.9380245866892751, + "grad_norm": 0.968091760917464, + "learning_rate": 2.0026658464255554e-07, + "loss": 0.9207, + "step": 5532 + }, + { + "epoch": 0.9381941500635863, + "grad_norm": 0.9355634426300802, + "learning_rate": 1.9917438907606556e-07, + "loss": 0.9358, + "step": 5533 + }, + { + "epoch": 0.9383637134378974, + "grad_norm": 0.9819685760318001, + "learning_rate": 1.9808514995821592e-07, + "loss": 0.8975, + "step": 5534 + }, + { + "epoch": 0.9385332768122086, + "grad_norm": 0.9444549546433799, + "learning_rate": 1.9699886761757826e-07, + "loss": 0.8914, + "step": 5535 + }, + { + "epoch": 0.9387028401865197, + "grad_norm": 0.9749889066526239, + "learning_rate": 1.959155423818304e-07, + "loss": 0.9295, + "step": 5536 + }, + { + "epoch": 0.9388724035608309, + "grad_norm": 0.9715401945162648, + "learning_rate": 1.9483517457776436e-07, + "loss": 0.9077, + "step": 5537 + }, + { + "epoch": 0.939041966935142, + "grad_norm": 0.9726166793665533, + "learning_rate": 1.937577645312738e-07, + "loss": 0.9198, + "step": 5538 + }, + { + "epoch": 0.9392115303094531, + "grad_norm": 0.9963681974344389, + "learning_rate": 1.926833125673633e-07, + "loss": 0.9248, + "step": 5539 + }, + { + "epoch": 0.9393810936837643, + "grad_norm": 0.9859102992739459, + "learning_rate": 1.916118190101457e-07, + "loss": 0.9399, + "step": 5540 + }, + { + "epoch": 0.9395506570580755, + "grad_norm": 0.9568208210463655, + "learning_rate": 1.9054328418283808e-07, + "loss": 0.9061, + "step": 5541 + }, + { + "epoch": 0.9397202204323866, + "grad_norm": 0.9686416629208863, + "learning_rate": 1.8947770840776925e-07, + "loss": 0.9496, + "step": 5542 + }, + { + "epoch": 0.9398897838066977, + "grad_norm": 0.9391330366724194, + "learning_rate": 1.884150920063721e-07, + "loss": 0.9409, + "step": 5543 + }, + { + "epoch": 0.9400593471810089, + "grad_norm": 0.9426771707889775, + "learning_rate": 1.87355435299188e-07, + "loss": 0.9143, + "step": 5544 + }, + { + "epoch": 0.9402289105553201, + "grad_norm": 1.0253838406119677, + "learning_rate": 1.8629873860586567e-07, + "loss": 0.9308, + "step": 5545 + }, + { + "epoch": 0.9403984739296312, + "grad_norm": 0.9804098725527529, + "learning_rate": 1.852450022451624e-07, + "loss": 0.9084, + "step": 5546 + }, + { + "epoch": 0.9405680373039423, + "grad_norm": 0.9933360669735185, + "learning_rate": 1.8419422653493835e-07, + "loss": 0.909, + "step": 5547 + }, + { + "epoch": 0.9407376006782535, + "grad_norm": 0.984649373396994, + "learning_rate": 1.8314641179216663e-07, + "loss": 0.919, + "step": 5548 + }, + { + "epoch": 0.9409071640525647, + "grad_norm": 0.9329842295678624, + "learning_rate": 1.8210155833291998e-07, + "loss": 0.8807, + "step": 5549 + }, + { + "epoch": 0.9410767274268758, + "grad_norm": 0.9969967261709832, + "learning_rate": 1.8105966647238515e-07, + "loss": 0.9447, + "step": 5550 + }, + { + "epoch": 0.9412462908011869, + "grad_norm": 0.9820930359991815, + "learning_rate": 1.8002073652484852e-07, + "loss": 0.9366, + "step": 5551 + }, + { + "epoch": 0.9414158541754981, + "grad_norm": 0.5773918879392964, + "learning_rate": 1.789847688037083e-07, + "loss": 0.7294, + "step": 5552 + }, + { + "epoch": 0.9415854175498093, + "grad_norm": 0.9188618466256349, + "learning_rate": 1.7795176362146783e-07, + "loss": 0.9154, + "step": 5553 + }, + { + "epoch": 0.9417549809241204, + "grad_norm": 0.9438107286937366, + "learning_rate": 1.769217212897345e-07, + "loss": 0.9052, + "step": 5554 + }, + { + "epoch": 0.9419245442984315, + "grad_norm": 0.9839383401916435, + "learning_rate": 1.7589464211922537e-07, + "loss": 0.9183, + "step": 5555 + }, + { + "epoch": 0.9420941076727427, + "grad_norm": 0.9691832452196864, + "learning_rate": 1.748705264197603e-07, + "loss": 0.9165, + "step": 5556 + }, + { + "epoch": 0.9422636710470539, + "grad_norm": 0.9533571444589288, + "learning_rate": 1.7384937450026895e-07, + "loss": 0.9013, + "step": 5557 + }, + { + "epoch": 0.942433234421365, + "grad_norm": 0.999708131973898, + "learning_rate": 1.7283118666878374e-07, + "loss": 0.9423, + "step": 5558 + }, + { + "epoch": 0.9426027977956761, + "grad_norm": 0.653302681894168, + "learning_rate": 1.7181596323244453e-07, + "loss": 0.7422, + "step": 5559 + }, + { + "epoch": 0.9427723611699873, + "grad_norm": 0.9892735021750785, + "learning_rate": 1.7080370449749528e-07, + "loss": 0.9112, + "step": 5560 + }, + { + "epoch": 0.9429419245442985, + "grad_norm": 0.5708981756493947, + "learning_rate": 1.6979441076928837e-07, + "loss": 0.7875, + "step": 5561 + }, + { + "epoch": 0.9431114879186095, + "grad_norm": 1.0079362041064297, + "learning_rate": 1.6878808235227806e-07, + "loss": 0.9342, + "step": 5562 + }, + { + "epoch": 0.9432810512929207, + "grad_norm": 0.994009404217517, + "learning_rate": 1.677847195500304e-07, + "loss": 0.9109, + "step": 5563 + }, + { + "epoch": 0.9434506146672319, + "grad_norm": 0.9775203300782569, + "learning_rate": 1.6678432266520882e-07, + "loss": 0.9324, + "step": 5564 + }, + { + "epoch": 0.9436201780415431, + "grad_norm": 1.00607211286155, + "learning_rate": 1.6578689199958753e-07, + "loss": 0.9196, + "step": 5565 + }, + { + "epoch": 0.9437897414158541, + "grad_norm": 0.9548045689969014, + "learning_rate": 1.647924278540447e-07, + "loss": 0.9438, + "step": 5566 + }, + { + "epoch": 0.9439593047901653, + "grad_norm": 0.9707667507146128, + "learning_rate": 1.6380093052856482e-07, + "loss": 0.9233, + "step": 5567 + }, + { + "epoch": 0.9441288681644765, + "grad_norm": 0.9081739895660971, + "learning_rate": 1.628124003222331e-07, + "loss": 0.9158, + "step": 5568 + }, + { + "epoch": 0.9442984315387877, + "grad_norm": 0.9580113632717862, + "learning_rate": 1.6182683753324435e-07, + "loss": 0.9248, + "step": 5569 + }, + { + "epoch": 0.9444679949130987, + "grad_norm": 1.0202207410989919, + "learning_rate": 1.6084424245889628e-07, + "loss": 0.9215, + "step": 5570 + }, + { + "epoch": 0.9446375582874099, + "grad_norm": 0.9239649460492493, + "learning_rate": 1.5986461539559294e-07, + "loss": 0.8863, + "step": 5571 + }, + { + "epoch": 0.9448071216617211, + "grad_norm": 0.9615469796975518, + "learning_rate": 1.5888795663883904e-07, + "loss": 0.9344, + "step": 5572 + }, + { + "epoch": 0.9449766850360323, + "grad_norm": 0.9917975247635588, + "learning_rate": 1.5791426648324893e-07, + "loss": 0.9329, + "step": 5573 + }, + { + "epoch": 0.9451462484103433, + "grad_norm": 0.9623696250200654, + "learning_rate": 1.5694354522253763e-07, + "loss": 0.9385, + "step": 5574 + }, + { + "epoch": 0.9453158117846545, + "grad_norm": 0.9719410098060581, + "learning_rate": 1.5597579314952872e-07, + "loss": 0.919, + "step": 5575 + }, + { + "epoch": 0.9454853751589657, + "grad_norm": 0.9292574672349995, + "learning_rate": 1.550110105561442e-07, + "loss": 0.9224, + "step": 5576 + }, + { + "epoch": 0.9456549385332769, + "grad_norm": 0.9622159312119182, + "learning_rate": 1.5404919773341576e-07, + "loss": 0.8969, + "step": 5577 + }, + { + "epoch": 0.9458245019075879, + "grad_norm": 0.8900848039540217, + "learning_rate": 1.5309035497147685e-07, + "loss": 0.9167, + "step": 5578 + }, + { + "epoch": 0.9459940652818991, + "grad_norm": 0.9567406557746101, + "learning_rate": 1.5213448255956498e-07, + "loss": 0.9223, + "step": 5579 + }, + { + "epoch": 0.9461636286562103, + "grad_norm": 0.9649112549224468, + "learning_rate": 1.5118158078602174e-07, + "loss": 0.9278, + "step": 5580 + }, + { + "epoch": 0.9463331920305214, + "grad_norm": 0.9899204061293503, + "learning_rate": 1.5023164993829277e-07, + "loss": 0.9634, + "step": 5581 + }, + { + "epoch": 0.9465027554048325, + "grad_norm": 0.9910174614858378, + "learning_rate": 1.492846903029288e-07, + "loss": 0.9543, + "step": 5582 + }, + { + "epoch": 0.9466723187791437, + "grad_norm": 1.1165891299652846, + "learning_rate": 1.4834070216558138e-07, + "loss": 0.934, + "step": 5583 + }, + { + "epoch": 0.9468418821534549, + "grad_norm": 0.9531604091243622, + "learning_rate": 1.4739968581100827e-07, + "loss": 0.9056, + "step": 5584 + }, + { + "epoch": 0.947011445527766, + "grad_norm": 0.9466397260320445, + "learning_rate": 1.464616415230702e-07, + "loss": 0.9336, + "step": 5585 + }, + { + "epoch": 0.9471810089020771, + "grad_norm": 0.6330410316796224, + "learning_rate": 1.4552656958473077e-07, + "loss": 0.7907, + "step": 5586 + }, + { + "epoch": 0.9473505722763883, + "grad_norm": 0.9728390949111491, + "learning_rate": 1.4459447027805663e-07, + "loss": 0.9321, + "step": 5587 + }, + { + "epoch": 0.9475201356506995, + "grad_norm": 0.9417407924810578, + "learning_rate": 1.4366534388421837e-07, + "loss": 0.9103, + "step": 5588 + }, + { + "epoch": 0.9476896990250105, + "grad_norm": 1.0079916016676647, + "learning_rate": 1.4273919068349184e-07, + "loss": 0.9681, + "step": 5589 + }, + { + "epoch": 0.9478592623993217, + "grad_norm": 0.9966992765724542, + "learning_rate": 1.418160109552502e-07, + "loss": 0.9136, + "step": 5590 + }, + { + "epoch": 0.9480288257736329, + "grad_norm": 1.0172678390116543, + "learning_rate": 1.4089580497797738e-07, + "loss": 0.888, + "step": 5591 + }, + { + "epoch": 0.9481983891479441, + "grad_norm": 0.9566771617983861, + "learning_rate": 1.3997857302925355e-07, + "loss": 0.9057, + "step": 5592 + }, + { + "epoch": 0.9483679525222551, + "grad_norm": 1.0312257714558524, + "learning_rate": 1.3906431538576626e-07, + "loss": 0.923, + "step": 5593 + }, + { + "epoch": 0.9485375158965663, + "grad_norm": 0.9564312245215095, + "learning_rate": 1.3815303232330267e-07, + "loss": 0.9449, + "step": 5594 + }, + { + "epoch": 0.9487070792708775, + "grad_norm": 0.9659078727613436, + "learning_rate": 1.3724472411675517e-07, + "loss": 0.8931, + "step": 5595 + }, + { + "epoch": 0.9488766426451887, + "grad_norm": 0.9634178519869354, + "learning_rate": 1.3633939104011784e-07, + "loss": 0.9148, + "step": 5596 + }, + { + "epoch": 0.9490462060194997, + "grad_norm": 0.9962956420596855, + "learning_rate": 1.354370333664845e-07, + "loss": 0.9597, + "step": 5597 + }, + { + "epoch": 0.9492157693938109, + "grad_norm": 0.6187121049586382, + "learning_rate": 1.345376513680574e-07, + "loss": 0.7834, + "step": 5598 + }, + { + "epoch": 0.9493853327681221, + "grad_norm": 1.0139951230608446, + "learning_rate": 1.3364124531613622e-07, + "loss": 0.9376, + "step": 5599 + }, + { + "epoch": 0.9495548961424333, + "grad_norm": 0.9719922818769989, + "learning_rate": 1.327478154811246e-07, + "loss": 0.9379, + "step": 5600 + }, + { + "epoch": 0.9497244595167443, + "grad_norm": 0.9591858631345868, + "learning_rate": 1.3185736213252808e-07, + "loss": 0.895, + "step": 5601 + }, + { + "epoch": 0.9498940228910555, + "grad_norm": 1.022002593311473, + "learning_rate": 1.3096988553895517e-07, + "loss": 0.9291, + "step": 5602 + }, + { + "epoch": 0.9500635862653667, + "grad_norm": 1.0316665256589774, + "learning_rate": 1.3008538596811616e-07, + "loss": 0.9481, + "step": 5603 + }, + { + "epoch": 0.9502331496396779, + "grad_norm": 1.0308684421858336, + "learning_rate": 1.2920386368682313e-07, + "loss": 0.9579, + "step": 5604 + }, + { + "epoch": 0.9504027130139889, + "grad_norm": 1.0165402809228128, + "learning_rate": 1.2832531896098788e-07, + "loss": 0.9524, + "step": 5605 + }, + { + "epoch": 0.9505722763883001, + "grad_norm": 0.942635156534935, + "learning_rate": 1.274497520556295e-07, + "loss": 0.9095, + "step": 5606 + }, + { + "epoch": 0.9507418397626113, + "grad_norm": 1.0356681253524946, + "learning_rate": 1.2657716323486224e-07, + "loss": 0.9257, + "step": 5607 + }, + { + "epoch": 0.9509114031369225, + "grad_norm": 0.9819511028991497, + "learning_rate": 1.257075527619067e-07, + "loss": 0.9337, + "step": 5608 + }, + { + "epoch": 0.9510809665112335, + "grad_norm": 0.9748422301015786, + "learning_rate": 1.2484092089908307e-07, + "loss": 0.9394, + "step": 5609 + }, + { + "epoch": 0.9512505298855447, + "grad_norm": 0.9421552096809374, + "learning_rate": 1.2397726790781438e-07, + "loss": 0.9373, + "step": 5610 + }, + { + "epoch": 0.9514200932598559, + "grad_norm": 1.0301769988605656, + "learning_rate": 1.231165940486234e-07, + "loss": 0.9349, + "step": 5611 + }, + { + "epoch": 0.9515896566341671, + "grad_norm": 0.9672446073622168, + "learning_rate": 1.2225889958113468e-07, + "loss": 0.9162, + "step": 5612 + }, + { + "epoch": 0.9517592200084781, + "grad_norm": 1.0234706563231974, + "learning_rate": 1.2140418476407457e-07, + "loss": 0.9458, + "step": 5613 + }, + { + "epoch": 0.9519287833827893, + "grad_norm": 1.0242176883224865, + "learning_rate": 1.2055244985527015e-07, + "loss": 0.9184, + "step": 5614 + }, + { + "epoch": 0.9520983467571005, + "grad_norm": 0.973263470313115, + "learning_rate": 1.1970369511165035e-07, + "loss": 0.8953, + "step": 5615 + }, + { + "epoch": 0.9522679101314117, + "grad_norm": 0.9578182638458529, + "learning_rate": 1.1885792078924375e-07, + "loss": 0.9347, + "step": 5616 + }, + { + "epoch": 0.9524374735057227, + "grad_norm": 0.5755626615677899, + "learning_rate": 1.1801512714318286e-07, + "loss": 0.7367, + "step": 5617 + }, + { + "epoch": 0.9526070368800339, + "grad_norm": 0.9502889625309332, + "learning_rate": 1.1717531442769658e-07, + "loss": 0.882, + "step": 5618 + }, + { + "epoch": 0.9527766002543451, + "grad_norm": 0.9394678663587318, + "learning_rate": 1.1633848289611783e-07, + "loss": 0.9261, + "step": 5619 + }, + { + "epoch": 0.9529461636286563, + "grad_norm": 1.0459279682755407, + "learning_rate": 1.1550463280087909e-07, + "loss": 0.9175, + "step": 5620 + }, + { + "epoch": 0.9531157270029673, + "grad_norm": 1.0038510674010808, + "learning_rate": 1.1467376439351474e-07, + "loss": 0.9375, + "step": 5621 + }, + { + "epoch": 0.9532852903772785, + "grad_norm": 0.9477082037529725, + "learning_rate": 1.1384587792465873e-07, + "loss": 0.8881, + "step": 5622 + }, + { + "epoch": 0.9534548537515897, + "grad_norm": 1.007230381478742, + "learning_rate": 1.1302097364404241e-07, + "loss": 0.929, + "step": 5623 + }, + { + "epoch": 0.9536244171259008, + "grad_norm": 0.9902014910636292, + "learning_rate": 1.121990518005045e-07, + "loss": 0.9396, + "step": 5624 + }, + { + "epoch": 0.9537939805002119, + "grad_norm": 0.9469107127998239, + "learning_rate": 1.113801126419789e-07, + "loss": 0.9476, + "step": 5625 + }, + { + "epoch": 0.9539635438745231, + "grad_norm": 1.0363518686033886, + "learning_rate": 1.1056415641550134e-07, + "loss": 0.9559, + "step": 5626 + }, + { + "epoch": 0.9541331072488343, + "grad_norm": 0.9767580486798779, + "learning_rate": 1.0975118336720603e-07, + "loss": 0.8872, + "step": 5627 + }, + { + "epoch": 0.9543026706231454, + "grad_norm": 0.9808314506206055, + "learning_rate": 1.0894119374233014e-07, + "loss": 0.9195, + "step": 5628 + }, + { + "epoch": 0.9544722339974565, + "grad_norm": 0.9146829940523473, + "learning_rate": 1.0813418778521046e-07, + "loss": 0.8736, + "step": 5629 + }, + { + "epoch": 0.9546417973717677, + "grad_norm": 0.9904559510690045, + "learning_rate": 1.0733016573928002e-07, + "loss": 0.9268, + "step": 5630 + }, + { + "epoch": 0.9548113607460789, + "grad_norm": 0.9888897543704092, + "learning_rate": 1.0652912784707592e-07, + "loss": 0.9612, + "step": 5631 + }, + { + "epoch": 0.95498092412039, + "grad_norm": 0.9802965341644693, + "learning_rate": 1.0573107435023378e-07, + "loss": 0.9215, + "step": 5632 + }, + { + "epoch": 0.9551504874947011, + "grad_norm": 0.955485779386987, + "learning_rate": 1.0493600548948879e-07, + "loss": 0.8895, + "step": 5633 + }, + { + "epoch": 0.9553200508690123, + "grad_norm": 1.0085835751406802, + "learning_rate": 1.041439215046769e-07, + "loss": 0.9104, + "step": 5634 + }, + { + "epoch": 0.9554896142433235, + "grad_norm": 0.9717792844394751, + "learning_rate": 1.0335482263473028e-07, + "loss": 0.8878, + "step": 5635 + }, + { + "epoch": 0.9556591776176346, + "grad_norm": 0.945749492533743, + "learning_rate": 1.0256870911768524e-07, + "loss": 0.9162, + "step": 5636 + }, + { + "epoch": 0.9558287409919457, + "grad_norm": 0.9761849266881163, + "learning_rate": 1.0178558119067316e-07, + "loss": 0.9254, + "step": 5637 + }, + { + "epoch": 0.9559983043662569, + "grad_norm": 0.9777727793975898, + "learning_rate": 1.0100543908992843e-07, + "loss": 0.8997, + "step": 5638 + }, + { + "epoch": 0.9561678677405681, + "grad_norm": 0.9770299570773744, + "learning_rate": 1.002282830507828e-07, + "loss": 0.9164, + "step": 5639 + }, + { + "epoch": 0.9563374311148792, + "grad_norm": 1.0206630293165762, + "learning_rate": 9.945411330766874e-08, + "loss": 0.9347, + "step": 5640 + }, + { + "epoch": 0.9565069944891903, + "grad_norm": 0.9108921607267623, + "learning_rate": 9.8682930094115e-08, + "loss": 0.898, + "step": 5641 + }, + { + "epoch": 0.9566765578635015, + "grad_norm": 1.0443273276846412, + "learning_rate": 9.791473364275328e-08, + "loss": 0.914, + "step": 5642 + }, + { + "epoch": 0.9568461212378127, + "grad_norm": 0.9706920878560784, + "learning_rate": 9.714952418531154e-08, + "loss": 0.9157, + "step": 5643 + }, + { + "epoch": 0.9570156846121238, + "grad_norm": 0.9934523266760733, + "learning_rate": 9.638730195261625e-08, + "loss": 0.9099, + "step": 5644 + }, + { + "epoch": 0.9571852479864349, + "grad_norm": 0.9815389649253861, + "learning_rate": 9.562806717459572e-08, + "loss": 0.901, + "step": 5645 + }, + { + "epoch": 0.9573548113607461, + "grad_norm": 0.9631592108966471, + "learning_rate": 9.487182008027563e-08, + "loss": 0.9446, + "step": 5646 + }, + { + "epoch": 0.9575243747350572, + "grad_norm": 0.884174264077493, + "learning_rate": 9.411856089777904e-08, + "loss": 0.9035, + "step": 5647 + }, + { + "epoch": 0.9576939381093684, + "grad_norm": 0.9519611298011554, + "learning_rate": 9.336828985432866e-08, + "loss": 0.9343, + "step": 5648 + }, + { + "epoch": 0.9578635014836795, + "grad_norm": 1.010741076170822, + "learning_rate": 9.262100717624678e-08, + "loss": 0.8845, + "step": 5649 + }, + { + "epoch": 0.9580330648579907, + "grad_norm": 1.0096867389950603, + "learning_rate": 9.187671308895418e-08, + "loss": 0.9361, + "step": 5650 + }, + { + "epoch": 0.9582026282323018, + "grad_norm": 0.992811217632035, + "learning_rate": 9.113540781696795e-08, + "loss": 0.9638, + "step": 5651 + }, + { + "epoch": 0.958372191606613, + "grad_norm": 0.9853723854520304, + "learning_rate": 9.039709158390587e-08, + "loss": 0.9322, + "step": 5652 + }, + { + "epoch": 0.9585417549809241, + "grad_norm": 0.991761359071895, + "learning_rate": 8.966176461248422e-08, + "loss": 0.9378, + "step": 5653 + }, + { + "epoch": 0.9587113183552353, + "grad_norm": 0.962925073424598, + "learning_rate": 8.892942712451447e-08, + "loss": 0.9005, + "step": 5654 + }, + { + "epoch": 0.9588808817295464, + "grad_norm": 1.0137321182282732, + "learning_rate": 8.82000793409088e-08, + "loss": 0.9227, + "step": 5655 + }, + { + "epoch": 0.9590504451038576, + "grad_norm": 0.9567330325695068, + "learning_rate": 8.747372148167787e-08, + "loss": 0.8776, + "step": 5656 + }, + { + "epoch": 0.9592200084781687, + "grad_norm": 0.9544805413220517, + "learning_rate": 8.675035376593088e-08, + "loss": 0.9257, + "step": 5657 + }, + { + "epoch": 0.9593895718524799, + "grad_norm": 1.0089489011254036, + "learning_rate": 8.602997641187217e-08, + "loss": 0.9236, + "step": 5658 + }, + { + "epoch": 0.959559135226791, + "grad_norm": 0.9390273492041998, + "learning_rate": 8.531258963680567e-08, + "loss": 0.8938, + "step": 5659 + }, + { + "epoch": 0.9597286986011022, + "grad_norm": 0.9869598819749525, + "learning_rate": 8.459819365713384e-08, + "loss": 0.9204, + "step": 5660 + }, + { + "epoch": 0.9598982619754133, + "grad_norm": 0.9590113172342194, + "learning_rate": 8.388678868835653e-08, + "loss": 0.8856, + "step": 5661 + }, + { + "epoch": 0.9600678253497245, + "grad_norm": 0.9627742367098747, + "learning_rate": 8.317837494507097e-08, + "loss": 0.9451, + "step": 5662 + }, + { + "epoch": 0.9602373887240356, + "grad_norm": 0.9925573175071802, + "learning_rate": 8.247295264097288e-08, + "loss": 0.9234, + "step": 5663 + }, + { + "epoch": 0.9604069520983468, + "grad_norm": 0.6525565568088377, + "learning_rate": 8.177052198885426e-08, + "loss": 0.7322, + "step": 5664 + }, + { + "epoch": 0.9605765154726579, + "grad_norm": 0.9880303470347077, + "learning_rate": 8.107108320060675e-08, + "loss": 0.8895, + "step": 5665 + }, + { + "epoch": 0.960746078846969, + "grad_norm": 0.9613606650954206, + "learning_rate": 8.037463648721488e-08, + "loss": 0.9345, + "step": 5666 + }, + { + "epoch": 0.9609156422212802, + "grad_norm": 1.0317409598980691, + "learning_rate": 7.96811820587684e-08, + "loss": 0.9076, + "step": 5667 + }, + { + "epoch": 0.9610852055955914, + "grad_norm": 0.9413026092247981, + "learning_rate": 7.899072012444664e-08, + "loss": 0.911, + "step": 5668 + }, + { + "epoch": 0.9612547689699025, + "grad_norm": 0.9661800906856106, + "learning_rate": 7.830325089253077e-08, + "loss": 0.9192, + "step": 5669 + }, + { + "epoch": 0.9614243323442137, + "grad_norm": 0.9960374005420917, + "learning_rate": 7.761877457039712e-08, + "loss": 0.9051, + "step": 5670 + }, + { + "epoch": 0.9615938957185248, + "grad_norm": 1.0164163523231122, + "learning_rate": 7.693729136452165e-08, + "loss": 0.9635, + "step": 5671 + }, + { + "epoch": 0.9617634590928359, + "grad_norm": 0.9686163005741266, + "learning_rate": 7.625880148047437e-08, + "loss": 0.8772, + "step": 5672 + }, + { + "epoch": 0.9619330224671471, + "grad_norm": 0.9602098714588723, + "learning_rate": 7.558330512292378e-08, + "loss": 0.9524, + "step": 5673 + }, + { + "epoch": 0.9621025858414582, + "grad_norm": 1.0509085868271995, + "learning_rate": 7.491080249563687e-08, + "loss": 0.9339, + "step": 5674 + }, + { + "epoch": 0.9622721492157694, + "grad_norm": 0.610630238315975, + "learning_rate": 7.424129380147471e-08, + "loss": 0.7404, + "step": 5675 + }, + { + "epoch": 0.9624417125900805, + "grad_norm": 0.9789889536923958, + "learning_rate": 7.357477924239797e-08, + "loss": 0.9418, + "step": 5676 + }, + { + "epoch": 0.9626112759643917, + "grad_norm": 0.9925782389575215, + "learning_rate": 7.291125901946027e-08, + "loss": 0.9241, + "step": 5677 + }, + { + "epoch": 0.9627808393387028, + "grad_norm": 0.9469651976198064, + "learning_rate": 7.225073333281707e-08, + "loss": 0.9366, + "step": 5678 + }, + { + "epoch": 0.962950402713014, + "grad_norm": 0.9815750036675475, + "learning_rate": 7.159320238171674e-08, + "loss": 0.9352, + "step": 5679 + }, + { + "epoch": 0.9631199660873251, + "grad_norm": 0.9431867658980311, + "learning_rate": 7.093866636450508e-08, + "loss": 0.9148, + "step": 5680 + }, + { + "epoch": 0.9632895294616363, + "grad_norm": 0.9345922278115185, + "learning_rate": 7.028712547862526e-08, + "loss": 0.9068, + "step": 5681 + }, + { + "epoch": 0.9634590928359474, + "grad_norm": 0.9996099228733981, + "learning_rate": 6.963857992061785e-08, + "loss": 0.9278, + "step": 5682 + }, + { + "epoch": 0.9636286562102586, + "grad_norm": 0.9603346693015956, + "learning_rate": 6.899302988611744e-08, + "loss": 0.8796, + "step": 5683 + }, + { + "epoch": 0.9637982195845697, + "grad_norm": 1.0636078220456575, + "learning_rate": 6.835047556985497e-08, + "loss": 0.9006, + "step": 5684 + }, + { + "epoch": 0.9639677829588809, + "grad_norm": 1.0039187387359212, + "learning_rate": 6.771091716566091e-08, + "loss": 0.8885, + "step": 5685 + }, + { + "epoch": 0.964137346333192, + "grad_norm": 0.9230866095184542, + "learning_rate": 6.707435486645986e-08, + "loss": 0.9108, + "step": 5686 + }, + { + "epoch": 0.9643069097075032, + "grad_norm": 0.934253790219709, + "learning_rate": 6.644078886427042e-08, + "loss": 0.9287, + "step": 5687 + }, + { + "epoch": 0.9644764730818143, + "grad_norm": 0.9707086107212991, + "learning_rate": 6.581021935021303e-08, + "loss": 0.91, + "step": 5688 + }, + { + "epoch": 0.9646460364561255, + "grad_norm": 0.953535368647796, + "learning_rate": 6.51826465144978e-08, + "loss": 0.8943, + "step": 5689 + }, + { + "epoch": 0.9648155998304366, + "grad_norm": 0.9939304626034535, + "learning_rate": 6.455807054643659e-08, + "loss": 0.9545, + "step": 5690 + }, + { + "epoch": 0.9649851632047478, + "grad_norm": 1.002876986588269, + "learning_rate": 6.393649163443205e-08, + "loss": 0.8996, + "step": 5691 + }, + { + "epoch": 0.9651547265790589, + "grad_norm": 0.9857816398379425, + "learning_rate": 6.331790996598753e-08, + "loss": 0.9097, + "step": 5692 + }, + { + "epoch": 0.96532428995337, + "grad_norm": 0.9742143311603425, + "learning_rate": 6.27023257276993e-08, + "loss": 0.9084, + "step": 5693 + }, + { + "epoch": 0.9654938533276812, + "grad_norm": 0.958610934198604, + "learning_rate": 6.208973910525995e-08, + "loss": 0.9235, + "step": 5694 + }, + { + "epoch": 0.9656634167019924, + "grad_norm": 0.9531214441805956, + "learning_rate": 6.148015028345833e-08, + "loss": 0.9068, + "step": 5695 + }, + { + "epoch": 0.9658329800763035, + "grad_norm": 0.939572689423655, + "learning_rate": 6.087355944617845e-08, + "loss": 0.924, + "step": 5696 + }, + { + "epoch": 0.9660025434506146, + "grad_norm": 0.9874965333327451, + "learning_rate": 6.026996677640062e-08, + "loss": 0.9602, + "step": 5697 + }, + { + "epoch": 0.9661721068249258, + "grad_norm": 0.9380812117772717, + "learning_rate": 5.96693724562003e-08, + "loss": 0.9185, + "step": 5698 + }, + { + "epoch": 0.966341670199237, + "grad_norm": 0.9480641879644737, + "learning_rate": 5.907177666674813e-08, + "loss": 0.8965, + "step": 5699 + }, + { + "epoch": 0.9665112335735481, + "grad_norm": 0.9875226351824268, + "learning_rate": 5.8477179588311004e-08, + "loss": 0.9807, + "step": 5700 + }, + { + "epoch": 0.9666807969478592, + "grad_norm": 0.9909539808267465, + "learning_rate": 5.788558140025213e-08, + "loss": 0.9084, + "step": 5701 + }, + { + "epoch": 0.9668503603221704, + "grad_norm": 0.9510467986338492, + "learning_rate": 5.7296982281026534e-08, + "loss": 0.9007, + "step": 5702 + }, + { + "epoch": 0.9670199236964816, + "grad_norm": 0.9817580813449093, + "learning_rate": 5.671138240818885e-08, + "loss": 0.941, + "step": 5703 + }, + { + "epoch": 0.9671894870707927, + "grad_norm": 0.9920873009432711, + "learning_rate": 5.612878195838667e-08, + "loss": 0.9236, + "step": 5704 + }, + { + "epoch": 0.9673590504451038, + "grad_norm": 0.9477881284159222, + "learning_rate": 5.5549181107362734e-08, + "loss": 0.9479, + "step": 5705 + }, + { + "epoch": 0.967528613819415, + "grad_norm": 0.9566807984243456, + "learning_rate": 5.497258002995498e-08, + "loss": 0.9042, + "step": 5706 + }, + { + "epoch": 0.9676981771937262, + "grad_norm": 0.9604977221777516, + "learning_rate": 5.43989789000976e-08, + "loss": 0.8951, + "step": 5707 + }, + { + "epoch": 0.9678677405680373, + "grad_norm": 0.9830219808201357, + "learning_rate": 5.382837789081885e-08, + "loss": 0.9261, + "step": 5708 + }, + { + "epoch": 0.9680373039423484, + "grad_norm": 1.0012836036733856, + "learning_rate": 5.326077717424216e-08, + "loss": 0.942, + "step": 5709 + }, + { + "epoch": 0.9682068673166596, + "grad_norm": 0.9718438947246102, + "learning_rate": 5.269617692158613e-08, + "loss": 0.8627, + "step": 5710 + }, + { + "epoch": 0.9683764306909708, + "grad_norm": 1.0044877670802885, + "learning_rate": 5.213457730316451e-08, + "loss": 0.9284, + "step": 5711 + }, + { + "epoch": 0.9685459940652819, + "grad_norm": 0.9654535072698528, + "learning_rate": 5.157597848838514e-08, + "loss": 0.9299, + "step": 5712 + }, + { + "epoch": 0.968715557439593, + "grad_norm": 0.9606219114708701, + "learning_rate": 5.102038064575099e-08, + "loss": 0.9236, + "step": 5713 + }, + { + "epoch": 0.9688851208139042, + "grad_norm": 0.967816388816639, + "learning_rate": 5.0467783942860226e-08, + "loss": 0.9339, + "step": 5714 + }, + { + "epoch": 0.9690546841882154, + "grad_norm": 0.9552734444853576, + "learning_rate": 4.991818854640396e-08, + "loss": 0.9104, + "step": 5715 + }, + { + "epoch": 0.9692242475625265, + "grad_norm": 0.9613849111860473, + "learning_rate": 4.937159462217067e-08, + "loss": 0.9262, + "step": 5716 + }, + { + "epoch": 0.9693938109368376, + "grad_norm": 1.0084712509573805, + "learning_rate": 4.882800233504292e-08, + "loss": 0.9156, + "step": 5717 + }, + { + "epoch": 0.9695633743111488, + "grad_norm": 0.9944151443537893, + "learning_rate": 4.82874118489951e-08, + "loss": 0.9485, + "step": 5718 + }, + { + "epoch": 0.96973293768546, + "grad_norm": 0.9992288448640044, + "learning_rate": 4.774982332709788e-08, + "loss": 0.9458, + "step": 5719 + }, + { + "epoch": 0.969902501059771, + "grad_norm": 0.9826194930467924, + "learning_rate": 4.7215236931517084e-08, + "loss": 0.9209, + "step": 5720 + }, + { + "epoch": 0.9700720644340822, + "grad_norm": 0.9659032676055406, + "learning_rate": 4.6683652823513725e-08, + "loss": 0.9488, + "step": 5721 + }, + { + "epoch": 0.9702416278083934, + "grad_norm": 0.9933828402574991, + "learning_rate": 4.615507116343954e-08, + "loss": 0.9016, + "step": 5722 + }, + { + "epoch": 0.9704111911827046, + "grad_norm": 0.9819132288854548, + "learning_rate": 4.562949211074474e-08, + "loss": 0.8873, + "step": 5723 + }, + { + "epoch": 0.9705807545570156, + "grad_norm": 0.7290138869877412, + "learning_rate": 4.510691582397031e-08, + "loss": 0.7607, + "step": 5724 + }, + { + "epoch": 0.9707503179313268, + "grad_norm": 0.9147663896758559, + "learning_rate": 4.458734246075236e-08, + "loss": 0.8632, + "step": 5725 + }, + { + "epoch": 0.970919881305638, + "grad_norm": 0.9536132939718771, + "learning_rate": 4.407077217782441e-08, + "loss": 0.8999, + "step": 5726 + }, + { + "epoch": 0.9710894446799492, + "grad_norm": 1.0237760834816376, + "learning_rate": 4.3557205131008475e-08, + "loss": 0.9838, + "step": 5727 + }, + { + "epoch": 0.9712590080542602, + "grad_norm": 0.9587203742855486, + "learning_rate": 4.304664147522619e-08, + "loss": 0.9156, + "step": 5728 + }, + { + "epoch": 0.9714285714285714, + "grad_norm": 0.9773449551023887, + "learning_rate": 4.253908136448881e-08, + "loss": 0.8916, + "step": 5729 + }, + { + "epoch": 0.9715981348028826, + "grad_norm": 0.9852177110324621, + "learning_rate": 4.203452495190386e-08, + "loss": 0.9112, + "step": 5730 + }, + { + "epoch": 0.9717676981771938, + "grad_norm": 0.9686742891979236, + "learning_rate": 4.153297238967291e-08, + "loss": 0.8842, + "step": 5731 + }, + { + "epoch": 0.9719372615515048, + "grad_norm": 0.9684951003641104, + "learning_rate": 4.103442382909051e-08, + "loss": 0.9128, + "step": 5732 + }, + { + "epoch": 0.972106824925816, + "grad_norm": 0.992194231264156, + "learning_rate": 4.053887942054524e-08, + "loss": 0.9302, + "step": 5733 + }, + { + "epoch": 0.9722763883001272, + "grad_norm": 0.9759890506299673, + "learning_rate": 4.004633931351864e-08, + "loss": 0.9111, + "step": 5734 + }, + { + "epoch": 0.9724459516744384, + "grad_norm": 0.935065101666358, + "learning_rate": 3.9556803656588536e-08, + "loss": 0.8615, + "step": 5735 + }, + { + "epoch": 0.9726155150487494, + "grad_norm": 0.997075330245176, + "learning_rate": 3.907027259742347e-08, + "loss": 0.9203, + "step": 5736 + }, + { + "epoch": 0.9727850784230606, + "grad_norm": 0.9278613670177845, + "learning_rate": 3.858674628278825e-08, + "loss": 0.9001, + "step": 5737 + }, + { + "epoch": 0.9729546417973718, + "grad_norm": 1.0076548126693798, + "learning_rate": 3.810622485853954e-08, + "loss": 0.9463, + "step": 5738 + }, + { + "epoch": 0.973124205171683, + "grad_norm": 0.9755484425427924, + "learning_rate": 3.762870846962807e-08, + "loss": 0.9437, + "step": 5739 + }, + { + "epoch": 0.973293768545994, + "grad_norm": 0.9537629882345634, + "learning_rate": 3.7154197260097503e-08, + "loss": 0.9294, + "step": 5740 + }, + { + "epoch": 0.9734633319203052, + "grad_norm": 1.0160183945440993, + "learning_rate": 3.668269137308666e-08, + "loss": 0.9099, + "step": 5741 + }, + { + "epoch": 0.9736328952946164, + "grad_norm": 0.9914671709405419, + "learning_rate": 3.6214190950825126e-08, + "loss": 0.9656, + "step": 5742 + }, + { + "epoch": 0.9738024586689276, + "grad_norm": 1.0009969481059364, + "learning_rate": 3.5748696134639825e-08, + "loss": 0.9561, + "step": 5743 + }, + { + "epoch": 0.9739720220432386, + "grad_norm": 0.9505026087309247, + "learning_rate": 3.528620706494623e-08, + "loss": 0.9055, + "step": 5744 + }, + { + "epoch": 0.9741415854175498, + "grad_norm": 0.9630168602402888, + "learning_rate": 3.482672388125719e-08, + "loss": 0.9261, + "step": 5745 + }, + { + "epoch": 0.974311148791861, + "grad_norm": 1.050634437925557, + "learning_rate": 3.437024672217626e-08, + "loss": 0.9419, + "step": 5746 + }, + { + "epoch": 0.9744807121661722, + "grad_norm": 0.9731626729401607, + "learning_rate": 3.3916775725402195e-08, + "loss": 0.8853, + "step": 5747 + }, + { + "epoch": 0.9746502755404832, + "grad_norm": 0.9970614054701489, + "learning_rate": 3.346631102772446e-08, + "loss": 0.918, + "step": 5748 + }, + { + "epoch": 0.9748198389147944, + "grad_norm": 1.0058552266255143, + "learning_rate": 3.3018852765027696e-08, + "loss": 0.9109, + "step": 5749 + }, + { + "epoch": 0.9749894022891056, + "grad_norm": 0.953404379849714, + "learning_rate": 3.257440107229059e-08, + "loss": 0.8999, + "step": 5750 + }, + { + "epoch": 0.9751589656634168, + "grad_norm": 1.0066125749200567, + "learning_rate": 3.213295608358036e-08, + "loss": 0.9143, + "step": 5751 + }, + { + "epoch": 0.9753285290377278, + "grad_norm": 0.9720353939399032, + "learning_rate": 3.169451793206268e-08, + "loss": 0.9328, + "step": 5752 + }, + { + "epoch": 0.975498092412039, + "grad_norm": 0.9840971558662146, + "learning_rate": 3.125908674999289e-08, + "loss": 0.9438, + "step": 5753 + }, + { + "epoch": 0.9756676557863502, + "grad_norm": 0.9975905987164495, + "learning_rate": 3.082666266872036e-08, + "loss": 0.9312, + "step": 5754 + }, + { + "epoch": 0.9758372191606614, + "grad_norm": 1.0086611952218774, + "learning_rate": 3.039724581868631e-08, + "loss": 0.9417, + "step": 5755 + }, + { + "epoch": 0.9760067825349724, + "grad_norm": 1.0000978637974742, + "learning_rate": 2.99708363294271e-08, + "loss": 0.9214, + "step": 5756 + }, + { + "epoch": 0.9761763459092836, + "grad_norm": 0.9686143878410364, + "learning_rate": 2.9547434329568747e-08, + "loss": 0.9506, + "step": 5757 + }, + { + "epoch": 0.9763459092835948, + "grad_norm": 0.9618460329437244, + "learning_rate": 2.9127039946832413e-08, + "loss": 0.9213, + "step": 5758 + }, + { + "epoch": 0.976515472657906, + "grad_norm": 0.985636762931922, + "learning_rate": 2.8709653308032216e-08, + "loss": 0.9016, + "step": 5759 + }, + { + "epoch": 0.976685036032217, + "grad_norm": 0.9940909677380797, + "learning_rate": 2.829527453907299e-08, + "loss": 0.9519, + "step": 5760 + }, + { + "epoch": 0.9768545994065282, + "grad_norm": 0.978051707722402, + "learning_rate": 2.7883903764953647e-08, + "loss": 0.93, + "step": 5761 + }, + { + "epoch": 0.9770241627808394, + "grad_norm": 0.9729715025861804, + "learning_rate": 2.7475541109766023e-08, + "loss": 0.9585, + "step": 5762 + }, + { + "epoch": 0.9771937261551504, + "grad_norm": 0.9605491081213781, + "learning_rate": 2.7070186696692702e-08, + "loss": 0.9137, + "step": 5763 + }, + { + "epoch": 0.9773632895294616, + "grad_norm": 0.9880329085831095, + "learning_rate": 2.6667840648010314e-08, + "loss": 0.9345, + "step": 5764 + }, + { + "epoch": 0.9775328529037728, + "grad_norm": 0.9868937318654798, + "learning_rate": 2.6268503085089547e-08, + "loss": 0.9468, + "step": 5765 + }, + { + "epoch": 0.977702416278084, + "grad_norm": 0.9393080976762372, + "learning_rate": 2.5872174128388494e-08, + "loss": 0.8913, + "step": 5766 + }, + { + "epoch": 0.977871979652395, + "grad_norm": 1.0155322351056533, + "learning_rate": 2.547885389746485e-08, + "loss": 0.9441, + "step": 5767 + }, + { + "epoch": 0.9780415430267062, + "grad_norm": 0.9684832938941004, + "learning_rate": 2.50885425109626e-08, + "loss": 0.9032, + "step": 5768 + }, + { + "epoch": 0.9782111064010174, + "grad_norm": 0.9458642196609883, + "learning_rate": 2.470124008661978e-08, + "loss": 0.9172, + "step": 5769 + }, + { + "epoch": 0.9783806697753286, + "grad_norm": 1.032689397630441, + "learning_rate": 2.43169467412685e-08, + "loss": 0.9607, + "step": 5770 + }, + { + "epoch": 0.9785502331496396, + "grad_norm": 0.9677141493209032, + "learning_rate": 2.3935662590831578e-08, + "loss": 0.9173, + "step": 5771 + }, + { + "epoch": 0.9787197965239508, + "grad_norm": 0.95934637203301, + "learning_rate": 2.35573877503259e-08, + "loss": 0.9013, + "step": 5772 + }, + { + "epoch": 0.978889359898262, + "grad_norm": 0.6423029197825266, + "learning_rate": 2.318212233385686e-08, + "loss": 0.7507, + "step": 5773 + }, + { + "epoch": 0.9790589232725732, + "grad_norm": 0.9832334420626041, + "learning_rate": 2.280986645462613e-08, + "loss": 0.9256, + "step": 5774 + }, + { + "epoch": 0.9792284866468842, + "grad_norm": 0.9727075222037297, + "learning_rate": 2.244062022492499e-08, + "loss": 0.9089, + "step": 5775 + }, + { + "epoch": 0.9793980500211954, + "grad_norm": 0.9743146804347741, + "learning_rate": 2.2074383756137686e-08, + "loss": 0.8786, + "step": 5776 + }, + { + "epoch": 0.9795676133955066, + "grad_norm": 0.6649035059316148, + "learning_rate": 2.171115715874139e-08, + "loss": 0.763, + "step": 5777 + }, + { + "epoch": 0.9797371767698178, + "grad_norm": 0.9425552551595091, + "learning_rate": 2.135094054230402e-08, + "loss": 0.9071, + "step": 5778 + }, + { + "epoch": 0.9799067401441288, + "grad_norm": 1.0072208577935549, + "learning_rate": 2.0993734015485324e-08, + "loss": 0.922, + "step": 5779 + }, + { + "epoch": 0.98007630351844, + "grad_norm": 0.9397666776229328, + "learning_rate": 2.063953768603799e-08, + "loss": 0.9314, + "step": 5780 + }, + { + "epoch": 0.9802458668927512, + "grad_norm": 0.9852923629008613, + "learning_rate": 2.028835166080767e-08, + "loss": 0.9516, + "step": 5781 + }, + { + "epoch": 0.9804154302670623, + "grad_norm": 1.0732409665693492, + "learning_rate": 1.994017604572851e-08, + "loss": 0.9145, + "step": 5782 + }, + { + "epoch": 0.9805849936413734, + "grad_norm": 0.9538396900567173, + "learning_rate": 1.9595010945830937e-08, + "loss": 0.9192, + "step": 5783 + }, + { + "epoch": 0.9807545570156846, + "grad_norm": 0.973310101202835, + "learning_rate": 1.9252856465233893e-08, + "loss": 0.9268, + "step": 5784 + }, + { + "epoch": 0.9809241203899958, + "grad_norm": 0.9940966776700779, + "learning_rate": 1.8913712707149255e-08, + "loss": 0.9482, + "step": 5785 + }, + { + "epoch": 0.981093683764307, + "grad_norm": 0.9738376906588351, + "learning_rate": 1.857757977388186e-08, + "loss": 0.9332, + "step": 5786 + }, + { + "epoch": 0.981263247138618, + "grad_norm": 0.9518809549935605, + "learning_rate": 1.824445776682504e-08, + "loss": 0.9225, + "step": 5787 + }, + { + "epoch": 0.9814328105129292, + "grad_norm": 0.9883159436394309, + "learning_rate": 1.7914346786468416e-08, + "loss": 0.9213, + "step": 5788 + }, + { + "epoch": 0.9816023738872404, + "grad_norm": 0.9777938140908404, + "learning_rate": 1.7587246932389003e-08, + "loss": 0.9261, + "step": 5789 + }, + { + "epoch": 0.9817719372615515, + "grad_norm": 0.9678373609473974, + "learning_rate": 1.7263158303258975e-08, + "loss": 0.8956, + "step": 5790 + }, + { + "epoch": 0.9819415006358626, + "grad_norm": 0.953466937473238, + "learning_rate": 1.6942080996840137e-08, + "loss": 0.8772, + "step": 5791 + }, + { + "epoch": 0.9821110640101738, + "grad_norm": 0.9810846228931703, + "learning_rate": 1.6624015109986125e-08, + "loss": 0.9228, + "step": 5792 + }, + { + "epoch": 0.982280627384485, + "grad_norm": 0.979224213646194, + "learning_rate": 1.630896073864352e-08, + "loss": 0.913, + "step": 5793 + }, + { + "epoch": 0.9824501907587961, + "grad_norm": 0.958177217421729, + "learning_rate": 1.5996917977847416e-08, + "loss": 0.9292, + "step": 5794 + }, + { + "epoch": 0.9826197541331072, + "grad_norm": 0.9961155070730692, + "learning_rate": 1.568788692172807e-08, + "loss": 0.9029, + "step": 5795 + }, + { + "epoch": 0.9827893175074184, + "grad_norm": 0.9785147274800566, + "learning_rate": 1.5381867663505358e-08, + "loss": 0.9256, + "step": 5796 + }, + { + "epoch": 0.9829588808817296, + "grad_norm": 1.0353491713847516, + "learning_rate": 1.5078860295490995e-08, + "loss": 0.9592, + "step": 5797 + }, + { + "epoch": 0.9831284442560407, + "grad_norm": 0.9837419037972505, + "learning_rate": 1.477886490908742e-08, + "loss": 0.9068, + "step": 5798 + }, + { + "epoch": 0.9832980076303518, + "grad_norm": 0.9932699866230158, + "learning_rate": 1.4481881594788917e-08, + "loss": 0.9425, + "step": 5799 + }, + { + "epoch": 0.983467571004663, + "grad_norm": 1.017134505521778, + "learning_rate": 1.4187910442182706e-08, + "loss": 0.9214, + "step": 5800 + }, + { + "epoch": 0.9836371343789742, + "grad_norm": 0.7060905039414138, + "learning_rate": 1.3896951539945635e-08, + "loss": 0.7667, + "step": 5801 + }, + { + "epoch": 0.9838066977532853, + "grad_norm": 0.9633906408581391, + "learning_rate": 1.3609004975846385e-08, + "loss": 0.933, + "step": 5802 + }, + { + "epoch": 0.9839762611275964, + "grad_norm": 0.9897757984614768, + "learning_rate": 1.3324070836743252e-08, + "loss": 0.9013, + "step": 5803 + }, + { + "epoch": 0.9841458245019076, + "grad_norm": 0.9590556830235841, + "learning_rate": 1.3042149208589705e-08, + "loss": 0.907, + "step": 5804 + }, + { + "epoch": 0.9843153878762188, + "grad_norm": 0.979845303368506, + "learning_rate": 1.2763240176427715e-08, + "loss": 0.9374, + "step": 5805 + }, + { + "epoch": 0.9844849512505299, + "grad_norm": 0.9412766436834193, + "learning_rate": 1.2487343824389986e-08, + "loss": 0.92, + "step": 5806 + }, + { + "epoch": 0.984654514624841, + "grad_norm": 0.9558483119495058, + "learning_rate": 1.2214460235703273e-08, + "loss": 0.9192, + "step": 5807 + }, + { + "epoch": 0.9848240779991522, + "grad_norm": 1.0047360796444338, + "learning_rate": 1.1944589492681735e-08, + "loss": 0.8924, + "step": 5808 + }, + { + "epoch": 0.9849936413734633, + "grad_norm": 0.9699925532658182, + "learning_rate": 1.1677731676733584e-08, + "loss": 0.8803, + "step": 5809 + }, + { + "epoch": 0.9851632047477745, + "grad_norm": 0.9388798112198161, + "learning_rate": 1.141388686835776e-08, + "loss": 0.8974, + "step": 5810 + }, + { + "epoch": 0.9853327681220856, + "grad_norm": 1.0076872419097724, + "learning_rate": 1.1153055147143932e-08, + "loss": 0.9443, + "step": 5811 + }, + { + "epoch": 0.9855023314963968, + "grad_norm": 0.9447579113690551, + "learning_rate": 1.0895236591771385e-08, + "loss": 0.887, + "step": 5812 + }, + { + "epoch": 0.9856718948707079, + "grad_norm": 0.975548955816519, + "learning_rate": 1.0640431280013463e-08, + "loss": 0.9039, + "step": 5813 + }, + { + "epoch": 0.9858414582450191, + "grad_norm": 0.9653471513006129, + "learning_rate": 1.0388639288732017e-08, + "loss": 0.9228, + "step": 5814 + }, + { + "epoch": 0.9860110216193302, + "grad_norm": 0.9775914660207732, + "learning_rate": 1.0139860693880732e-08, + "loss": 0.9026, + "step": 5815 + }, + { + "epoch": 0.9861805849936414, + "grad_norm": 0.6436646740636643, + "learning_rate": 9.894095570505136e-09, + "loss": 0.7655, + "step": 5816 + }, + { + "epoch": 0.9863501483679525, + "grad_norm": 1.0235827902158852, + "learning_rate": 9.651343992740369e-09, + "loss": 0.9291, + "step": 5817 + }, + { + "epoch": 0.9865197117422637, + "grad_norm": 1.007073135159899, + "learning_rate": 9.411606033813413e-09, + "loss": 0.9288, + "step": 5818 + }, + { + "epoch": 0.9866892751165748, + "grad_norm": 0.9513369507166303, + "learning_rate": 9.174881766043086e-09, + "loss": 0.898, + "step": 5819 + }, + { + "epoch": 0.986858838490886, + "grad_norm": 0.9479366986872202, + "learning_rate": 8.941171260835602e-09, + "loss": 0.8823, + "step": 5820 + }, + { + "epoch": 0.9870284018651971, + "grad_norm": 0.9731323710849982, + "learning_rate": 8.71047458869234e-09, + "loss": 0.9318, + "step": 5821 + }, + { + "epoch": 0.9871979652395083, + "grad_norm": 0.9924270055852914, + "learning_rate": 8.482791819203195e-09, + "loss": 0.8959, + "step": 5822 + }, + { + "epoch": 0.9873675286138194, + "grad_norm": 0.6674942691795429, + "learning_rate": 8.25812302104878e-09, + "loss": 0.7325, + "step": 5823 + }, + { + "epoch": 0.9875370919881306, + "grad_norm": 0.9939415808753221, + "learning_rate": 8.036468262001551e-09, + "loss": 0.9254, + "step": 5824 + }, + { + "epoch": 0.9877066553624417, + "grad_norm": 0.9853251903001841, + "learning_rate": 7.817827608924689e-09, + "loss": 0.9461, + "step": 5825 + }, + { + "epoch": 0.9878762187367529, + "grad_norm": 0.9432771888845365, + "learning_rate": 7.602201127770991e-09, + "loss": 0.9342, + "step": 5826 + }, + { + "epoch": 0.988045782111064, + "grad_norm": 0.9446451906872987, + "learning_rate": 7.389588883585097e-09, + "loss": 0.9166, + "step": 5827 + }, + { + "epoch": 0.9882153454853752, + "grad_norm": 0.9914247035390438, + "learning_rate": 7.1799909405034786e-09, + "loss": 0.9434, + "step": 5828 + }, + { + "epoch": 0.9883849088596863, + "grad_norm": 0.9582802470164575, + "learning_rate": 6.973407361750006e-09, + "loss": 0.8944, + "step": 5829 + }, + { + "epoch": 0.9885544722339975, + "grad_norm": 0.9535694007677109, + "learning_rate": 6.76983820964261e-09, + "loss": 0.8791, + "step": 5830 + }, + { + "epoch": 0.9887240356083086, + "grad_norm": 0.9541306842504396, + "learning_rate": 6.569283545587724e-09, + "loss": 0.9517, + "step": 5831 + }, + { + "epoch": 0.9888935989826197, + "grad_norm": 0.9284378080196115, + "learning_rate": 6.371743430082511e-09, + "loss": 0.9227, + "step": 5832 + }, + { + "epoch": 0.9890631623569309, + "grad_norm": 0.9796712607493825, + "learning_rate": 6.1772179227181926e-09, + "loss": 0.9083, + "step": 5833 + }, + { + "epoch": 0.9892327257312421, + "grad_norm": 0.9823053574433819, + "learning_rate": 5.985707082172277e-09, + "loss": 0.9343, + "step": 5834 + }, + { + "epoch": 0.9894022891055532, + "grad_norm": 0.9476368598497723, + "learning_rate": 5.7972109662141065e-09, + "loss": 0.9183, + "step": 5835 + }, + { + "epoch": 0.9895718524798643, + "grad_norm": 1.0149649535842422, + "learning_rate": 5.611729631703755e-09, + "loss": 0.9246, + "step": 5836 + }, + { + "epoch": 0.9897414158541755, + "grad_norm": 0.9632984619230546, + "learning_rate": 5.429263134594242e-09, + "loss": 0.9291, + "step": 5837 + }, + { + "epoch": 0.9899109792284867, + "grad_norm": 0.952097399941554, + "learning_rate": 5.249811529925985e-09, + "loss": 0.9398, + "step": 5838 + }, + { + "epoch": 0.9900805426027978, + "grad_norm": 0.9684569394622005, + "learning_rate": 5.073374871831238e-09, + "loss": 0.9012, + "step": 5839 + }, + { + "epoch": 0.9902501059771089, + "grad_norm": 0.9394407124091958, + "learning_rate": 4.899953213532982e-09, + "loss": 0.9203, + "step": 5840 + }, + { + "epoch": 0.9904196693514201, + "grad_norm": 0.9413710522573973, + "learning_rate": 4.7295466073427055e-09, + "loss": 0.936, + "step": 5841 + }, + { + "epoch": 0.9905892327257313, + "grad_norm": 0.9844324366950045, + "learning_rate": 4.562155104665955e-09, + "loss": 0.9292, + "step": 5842 + }, + { + "epoch": 0.9907587961000424, + "grad_norm": 1.0165033962821426, + "learning_rate": 4.3977787559967845e-09, + "loss": 0.9602, + "step": 5843 + }, + { + "epoch": 0.9909283594743535, + "grad_norm": 0.9746049196258337, + "learning_rate": 4.236417610918864e-09, + "loss": 0.9267, + "step": 5844 + }, + { + "epoch": 0.9910979228486647, + "grad_norm": 0.9532953938969321, + "learning_rate": 4.0780717181077015e-09, + "loss": 0.9637, + "step": 5845 + }, + { + "epoch": 0.9912674862229759, + "grad_norm": 0.6747532539338036, + "learning_rate": 3.922741125328422e-09, + "loss": 0.7913, + "step": 5846 + }, + { + "epoch": 0.991437049597287, + "grad_norm": 0.9935089975069966, + "learning_rate": 3.770425879437989e-09, + "loss": 0.9522, + "step": 5847 + }, + { + "epoch": 0.9916066129715981, + "grad_norm": 0.974368758116912, + "learning_rate": 3.6211260263818717e-09, + "loss": 0.8928, + "step": 5848 + }, + { + "epoch": 0.9917761763459093, + "grad_norm": 0.9590536449179392, + "learning_rate": 3.474841611197377e-09, + "loss": 0.9101, + "step": 5849 + }, + { + "epoch": 0.9919457397202205, + "grad_norm": 0.9723126119139327, + "learning_rate": 3.33157267801143e-09, + "loss": 0.9131, + "step": 5850 + }, + { + "epoch": 0.9921153030945316, + "grad_norm": 0.9955463437004766, + "learning_rate": 3.1913192700405715e-09, + "loss": 0.9345, + "step": 5851 + }, + { + "epoch": 0.9922848664688427, + "grad_norm": 1.0040663640133396, + "learning_rate": 3.0540814295942913e-09, + "loss": 0.9505, + "step": 5852 + }, + { + "epoch": 0.9924544298431539, + "grad_norm": 0.6599730884173741, + "learning_rate": 2.9198591980705847e-09, + "loss": 0.7619, + "step": 5853 + }, + { + "epoch": 0.992623993217465, + "grad_norm": 1.0084344166407948, + "learning_rate": 2.788652615957066e-09, + "loss": 0.9555, + "step": 5854 + }, + { + "epoch": 0.9927935565917761, + "grad_norm": 0.9824581687215178, + "learning_rate": 2.660461722832075e-09, + "loss": 0.9091, + "step": 5855 + }, + { + "epoch": 0.9929631199660873, + "grad_norm": 0.9946139732230425, + "learning_rate": 2.5352865573669007e-09, + "loss": 0.9239, + "step": 5856 + }, + { + "epoch": 0.9931326833403985, + "grad_norm": 0.9584922360558523, + "learning_rate": 2.4131271573191172e-09, + "loss": 0.9369, + "step": 5857 + }, + { + "epoch": 0.9933022467147096, + "grad_norm": 1.0013648998270042, + "learning_rate": 2.2939835595392477e-09, + "loss": 0.9138, + "step": 5858 + }, + { + "epoch": 0.9934718100890207, + "grad_norm": 0.9928014921492164, + "learning_rate": 2.1778557999674323e-09, + "loss": 0.948, + "step": 5859 + }, + { + "epoch": 0.9936413734633319, + "grad_norm": 0.9409007338041752, + "learning_rate": 2.0647439136334267e-09, + "loss": 0.9133, + "step": 5860 + }, + { + "epoch": 0.9938109368376431, + "grad_norm": 1.0032080483004022, + "learning_rate": 1.9546479346588265e-09, + "loss": 0.9277, + "step": 5861 + }, + { + "epoch": 0.9939805002119542, + "grad_norm": 0.988165760607772, + "learning_rate": 1.8475678962526222e-09, + "loss": 0.9353, + "step": 5862 + }, + { + "epoch": 0.9941500635862653, + "grad_norm": 0.9749968177346672, + "learning_rate": 1.743503830717863e-09, + "loss": 0.8852, + "step": 5863 + }, + { + "epoch": 0.9943196269605765, + "grad_norm": 0.9998456219885161, + "learning_rate": 1.642455769444995e-09, + "loss": 0.9766, + "step": 5864 + }, + { + "epoch": 0.9944891903348877, + "grad_norm": 0.9765047105760973, + "learning_rate": 1.5444237429140806e-09, + "loss": 0.8787, + "step": 5865 + }, + { + "epoch": 0.9946587537091988, + "grad_norm": 0.9735036897514259, + "learning_rate": 1.4494077806992413e-09, + "loss": 0.9031, + "step": 5866 + }, + { + "epoch": 0.9948283170835099, + "grad_norm": 0.9793359924666379, + "learning_rate": 1.357407911460884e-09, + "loss": 0.9273, + "step": 5867 + }, + { + "epoch": 0.9949978804578211, + "grad_norm": 0.9533612035915295, + "learning_rate": 1.2684241629501438e-09, + "loss": 0.8917, + "step": 5868 + }, + { + "epoch": 0.9951674438321323, + "grad_norm": 0.9419399191606024, + "learning_rate": 1.182456562012213e-09, + "loss": 0.9071, + "step": 5869 + }, + { + "epoch": 0.9953370072064434, + "grad_norm": 0.9669345793416564, + "learning_rate": 1.0995051345763508e-09, + "loss": 0.9288, + "step": 5870 + }, + { + "epoch": 0.9955065705807545, + "grad_norm": 0.9850270403705859, + "learning_rate": 1.019569905666984e-09, + "loss": 0.8851, + "step": 5871 + }, + { + "epoch": 0.9956761339550657, + "grad_norm": 0.9356390494037607, + "learning_rate": 9.42650899395936e-10, + "loss": 0.841, + "step": 5872 + }, + { + "epoch": 0.9958456973293769, + "grad_norm": 1.000494634844948, + "learning_rate": 8.687481389657582e-10, + "loss": 0.9327, + "step": 5873 + }, + { + "epoch": 0.996015260703688, + "grad_norm": 0.9411500450246553, + "learning_rate": 7.978616466708388e-10, + "loss": 0.9005, + "step": 5874 + }, + { + "epoch": 0.9961848240779991, + "grad_norm": 0.6372601212475879, + "learning_rate": 7.299914438929634e-10, + "loss": 0.783, + "step": 5875 + }, + { + "epoch": 0.9963543874523103, + "grad_norm": 1.0164318701062847, + "learning_rate": 6.651375511057546e-10, + "loss": 0.966, + "step": 5876 + }, + { + "epoch": 0.9965239508266215, + "grad_norm": 1.0190447807288807, + "learning_rate": 6.032999878735624e-10, + "loss": 0.9351, + "step": 5877 + }, + { + "epoch": 0.9966935142009326, + "grad_norm": 0.9119025372120443, + "learning_rate": 5.444787728481338e-10, + "loss": 0.8955, + "step": 5878 + }, + { + "epoch": 0.9968630775752437, + "grad_norm": 1.0178435477402283, + "learning_rate": 4.886739237752735e-10, + "loss": 0.9343, + "step": 5879 + }, + { + "epoch": 0.9970326409495549, + "grad_norm": 0.9671811323332676, + "learning_rate": 4.3588545748596276e-10, + "loss": 0.9247, + "step": 5880 + }, + { + "epoch": 0.9972022043238661, + "grad_norm": 1.0032340180032104, + "learning_rate": 3.861133899063507e-10, + "loss": 0.9504, + "step": 5881 + }, + { + "epoch": 0.9973717676981771, + "grad_norm": 0.9864779551961973, + "learning_rate": 3.3935773604887313e-10, + "loss": 0.9327, + "step": 5882 + }, + { + "epoch": 0.9975413310724883, + "grad_norm": 0.9957873522498607, + "learning_rate": 2.956185100178033e-10, + "loss": 0.913, + "step": 5883 + }, + { + "epoch": 0.9977108944467995, + "grad_norm": 1.005407829298245, + "learning_rate": 2.5489572500814184e-10, + "loss": 0.9355, + "step": 5884 + }, + { + "epoch": 0.9978804578211107, + "grad_norm": 1.0046951183568882, + "learning_rate": 2.171893933033964e-10, + "loss": 0.9156, + "step": 5885 + }, + { + "epoch": 0.9980500211954217, + "grad_norm": 0.9785766446867428, + "learning_rate": 1.8249952627669154e-10, + "loss": 0.9248, + "step": 5886 + }, + { + "epoch": 0.9982195845697329, + "grad_norm": 0.9977959749729899, + "learning_rate": 1.5082613439409977e-10, + "loss": 0.959, + "step": 5887 + }, + { + "epoch": 0.9983891479440441, + "grad_norm": 0.9549255917821977, + "learning_rate": 1.2216922721020043e-10, + "loss": 0.9169, + "step": 5888 + }, + { + "epoch": 0.9985587113183553, + "grad_norm": 0.9547502515170834, + "learning_rate": 9.652881336696951e-11, + "loss": 0.9468, + "step": 5889 + }, + { + "epoch": 0.9987282746926663, + "grad_norm": 0.9570247883966239, + "learning_rate": 7.390490060155132e-11, + "loss": 0.9017, + "step": 5890 + }, + { + "epoch": 0.9988978380669775, + "grad_norm": 0.9951097764396317, + "learning_rate": 5.42974957362663e-11, + "loss": 0.9143, + "step": 5891 + }, + { + "epoch": 0.9990674014412887, + "grad_norm": 1.0278976206464985, + "learning_rate": 3.770660468749299e-11, + "loss": 0.9479, + "step": 5892 + }, + { + "epoch": 0.9992369648155999, + "grad_norm": 1.0120379464525961, + "learning_rate": 2.4132232460116754e-11, + "loss": 0.9105, + "step": 5893 + }, + { + "epoch": 0.9994065281899109, + "grad_norm": 0.9956627845994828, + "learning_rate": 1.3574383147529901e-11, + "loss": 0.9383, + "step": 5894 + }, + { + "epoch": 0.9995760915642221, + "grad_norm": 0.9570221998315199, + "learning_rate": 6.033059934962282e-12, + "loss": 0.9469, + "step": 5895 + }, + { + "epoch": 0.9997456549385333, + "grad_norm": 0.9531973300356545, + "learning_rate": 1.5082650972608748e-12, + "loss": 0.9365, + "step": 5896 + }, + { + "epoch": 0.9999152183128445, + "grad_norm": 1.0056373913215644, + "learning_rate": 0.0, + "loss": 0.8894, + "step": 5897 + }, + { + "epoch": 0.9999152183128445, + "step": 5897, + "total_flos": 1.7424153564413952e+16, + "train_loss": 0.978339256261069, + "train_runtime": 59144.3222, + "train_samples_per_second": 25.526, + "train_steps_per_second": 0.1 + } + ], + "logging_steps": 1.0, + "max_steps": 5897, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7424153564413952e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}